comparison druntime/src/compiler/dmd/arraydouble.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Tue, 11 Nov 2008 01:52:37 +0100
parents
children
comparison
equal deleted inserted replaced
758:f04dde6e882c 759:d3eb054172f9
1 /***************************
2 * D programming language http://www.digitalmars.com/d/
3 * Runtime support for double array operations.
4 * Based on code originally written by Burton Radons.
5 * Placed in public domain.
6 */
7
8 module rt.arraydouble;
9
10 private import util.cpuid;
11
version (Unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;                  // forced dispatch path, swept by each unittest
    const int CPUID_MAX = 5;    // number of paths the unittests iterate over
    // Each predicate is true only when the forced cpuid selects that path
    // AND the host CPU really supports the feature, so a unittest never
    // executes an instruction set the machine lacks.
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    // Production build: dispatch directly on the real CPU feature tests.
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}
30
31 //version = log;
32
/* True when slices a and b share no memory: one slice must end at or
 * before the address where the other begins. */
bool disjoint(T)(T[] a, T[] b)
{
    auto aEnd = a.ptr + a.length;
    auto bEnd = b.ptr + b.length;
    return aEnd <= b.ptr || bEnd <= a.ptr;
}
37
/* Performance figures measured by Burton Radons
 */

alias double T;     // element type for every routine in this module

extern (C):         // compiler-runtime hooks below use C linkage/mangling
44
45 /* ======================================================================== */
46
/***********************
 * Computes:
 *      a[] = b[] + c[]
 *
 * Parameters arrive as (result, right operand, left operand) — the scalar
 * tail below computes *aptr = *bptr + *cptr.  Presumably the compiler
 * pushes operands right-to-left; confirm against the array-op lowering.
 */

T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    // All three slices must have equal length and must not overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 333% faster
        if (sse2() && b.length >= 16)
        {
            // Vector loop covers a multiple of 16 elements (a multiple of
            // the loop's 8-doubles stride); the scalar tail finishes the rest.
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                // 8 doubles (64 bytes) per iteration, unaligned 16-byte moves.
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}
119
120
unittest
{
    printf("_arraySliceSliceAddSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
157
158 /* ======================================================================== */
159
/***********************
 * Computes:
 *      a[] = b[] - c[]
 *
 * Parameters arrive as (result, right operand, left operand) — the scalar
 * tail below computes *aptr = *bptr - *cptr.
 */

T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    // All three slices must have equal length and must not overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 324% faster
        if (sse2() && b.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements (one 64-byte
            // iteration each); the scalar tail finishes the rest.
            auto n = aptr + (b.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                // 8 doubles (64 bytes) per iteration, unaligned 16-byte moves.
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                subpd XMM0, XMM4;   // b - c
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}
232
233
unittest
{
    printf("_arraySliceSliceMinSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
270
271
272 /* ======================================================================== */
273
/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
in
{
    // Result and source must match in length and must not overlap.
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                // ESI is bumped first; stores below compensate with -64.
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}
338
unittest
{
    printf("_arraySliceExpAddSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
375
376 /* ======================================================================== */
377
/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_d(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 114% faster
        if (sse2() && a.length >= 8)
        {
            // Round the element count down to a multiple of 8, matching the
            // 8-doubles-per-iteration loop below (same scheme as the other
            // routines in this module).  The previous code masked the
            // *address* of aend (cast(uint)aend & ~7), which only 8-byte
            // aligns it; the 64-byte-stride loop could then write up to 56
            // bytes past the end of the array whenever a.length % 8 != 0.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloopa:
                // 8 doubles (64 bytes) per iteration, in place.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                // Write the advanced pointer back for the scalar tail.
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail: remaining 0..7 elements.
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
433
unittest
{
    printf("_arrayExpSliceAddass_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep the pre-op values for the check below
            c[] += 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
471
472 /* ======================================================================== */
473
/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_d(T[] a, T[] b)
in
{
    // Operand must match the destination length and must not overlap it.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                // 8 doubles (64 bytes) per iteration, read-modify-write on a.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
540
unittest
{
    printf("_arraySliceSliceAddass_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep the pre-op values for the check below
            c[] += b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
578
579 /* ======================================================================== */
580
/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
in
{
    // Result and source must match in length and must not overlap.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                // ESI is bumped first; stores below compensate with -64.
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM0, XMM4;   // b - value
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}
645
unittest
{
    printf("_arraySliceExpMinSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
682
683 /* ======================================================================== */
684
/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
in
{
    // Result and source must match in length and must not overlap.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 66% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                // value - b needs value as the minuend, so fresh copies of
                // the broadcast value are made before each subtraction pair.
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM5, XMM0;   // value - b (first 4 doubles)
                subpd XMM6, XMM1;
                movupd [ESI+ 0-64], XMM5;
                movupd [ESI+16-64], XMM6;
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                subpd XMM5, XMM2;   // value - b (second 4 doubles)
                subpd XMM6, XMM3;
                movupd [ESI+32-64], XMM5;
                movupd [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}
753
unittest
{
    printf("_arrayExpSliceMinSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
790
791 /* ======================================================================== */
792
/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 115% faster
        if (sse2() && a.length >= 8)
        {
            // Round the element count down to a multiple of 8, matching the
            // 8-doubles-per-iteration loop below (same scheme as the other
            // routines in this module).  The previous code masked the
            // *address* of aend (cast(uint)aend & ~7), which only 8-byte
            // aligns it; the 64-byte-stride loop could then write up to 56
            // bytes past the end of the array whenever a.length % 8 != 0.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloopa:
                // 8 doubles (64 bytes) per iteration, in place.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                // Write the advanced pointer back for the scalar tail.
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail: remaining 0..7 elements.
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
848
unittest
{
    printf("_arrayExpSliceMinass_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep the pre-op values for the check below
            c[] -= 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
886
887 /* ======================================================================== */
888
/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_d(T[] a, T[] b)
in
{
    // Operand must match the destination length and must not overlap it.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                // 8 doubles (64 bytes) per iteration, read-modify-write on a.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                subpd XMM0, XMM4;   // a - b
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
955
unittest
{
    // Fixed copy-paste from the _arrayExpSliceMinass_d unittest: this one
    // must exercise the slice -= slice path (c[] -= b[]), and say so.
    printf("_arraySliceSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep the pre-op values for the check below
            c[] -= b[];     // was "c[] -= 6", which tested the Exp variant

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
993
994 /* ======================================================================== */
995
/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
in
{
    // Result and source must match in length and must not overlap.
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 304% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                // ESI is bumped first; stores below compensate with -64.
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}
1060
unittest
{
    printf("_arraySliceExpMulSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1097
1098 /* ======================================================================== */
1099
/***********************
 * Computes:
 *      a[] = b[] * c[]
 *
 * Parameters arrive as (result, right operand, left operand) — the scalar
 * tail below computes *aptr = *bptr * *cptr.
 */

T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    // All three slices must have equal length and must not overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 329% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                // 8 doubles (64 bytes) per iteration, unaligned 16-byte moves.
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}
1172
unittest
{
    printf("_arraySliceSliceMulSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1209
1210 /* ======================================================================== */
1211
/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 109% faster
        if (sse2() && a.length >= 8)
        {
            // Round the element count down to a multiple of 8, matching the
            // 8-doubles-per-iteration loop below (same scheme as the other
            // routines in this module).  The previous code masked the
            // *address* of aend (cast(uint)aend & ~7), which only 8-byte
            // aligns it; the 64-byte-stride loop could then write up to 56
            // bytes past the end of the array whenever a.length % 8 != 0.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloopa:
                // 8 doubles (64 bytes) per iteration, in place.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                // Write the advanced pointer back for the scalar tail.
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail: remaining 0..7 elements.
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
1267
unittest
{
    printf("_arrayExpSliceMulass_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep the pre-op values for the check below
            c[] *= 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1305
1306 /* ======================================================================== */
1307
/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_d(T[] a, T[] b)
in
{
    // Operand must match the destination length and must not overlap it.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 205% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                // 8 doubles (64 bytes) per iteration, read-modify-write on a.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
1374
unittest
{
    // Fixed copy-paste from the _arrayExpSliceMulass_d unittest: this one
    // must exercise the slice *= slice path (c[] *= b[]), and say so.
    printf("_arraySliceSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep the pre-op values for the check below
            c[] *= b[];     // was "c[] *= 6", which tested the Exp variant

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1412
1413 /* ======================================================================== */
1414
/***********************
 * Computes:
 *      a[] = b[] / value
 */

T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
in
{
    // Result and source must match in length and must not overlap.
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 299% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a multiple of 8 elements; scalar tail
            // handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;      // vector path multiplies by 1/value
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast into both lanes

                align 8;
            startsseloop:
                // ESI is bumped first; stores below compensate with -64.
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar tail uses true division, while the vector path above multiplied
    // by the reciprocal — so tail elements can differ from vector elements
    // in the last bits of precision.
    while (aptr < aend)
    {
        *aptr++ = *bptr++ / value;
        //*aptr++ = *bptr++ * recip;
    }

    return a;
}
1493
unittest
{
    printf("_arraySliceExpDivSliceAssign_d unittest\n");
    // Sweep every dispatch path the version (Unittest) harness provides.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // pass 0: slices 16-byte aligned; pass 1: misaligned by one element.
        // Dividing by 8 (a power of two) keeps reciprocal multiply exact.
        foreach (pass; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + pass];
            a = a[pass .. dim + pass];
            T[] b = new T[dim + pass];
            b = b[pass .. dim + pass];
            T[] c = new T[dim + pass];
            c = c[pass .. dim + pass];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] / 8;

            foreach (i; 0 .. dim)
            {
                //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1531
1532 /* ======================================================================== */
1533
1534 /***********************
1535 * Computes:
1536 * a[] /= value
1537 */
1538
/* In-place scalar division: a[] /= value.
 * Both the SSE2 path and the scalar tail multiply by a precomputed
 * reciprocal, so every element is rounded the same way (though slightly
 * less accurately than a true divide would be — see comment below).
 * Returns the mutated slice a.
 */
T[] _arrayExpSliceDivass_d(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 65% faster
        if (sse2() && a.length >= 8)
        {
            // End of the span handled 8 doubles (64 bytes) per iteration;
            // any remainder falls through to the scalar loop below.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast recip into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;          // resume the scalar loop past the vectorized span
            }
        }
    }

    // Scalar tail (or the whole array when SSE2 is unavailable or the
    // array is too short).
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}
1599
1600
unittest
{
    printf("_arrayExpSliceDivass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int offset = 0; offset < 2; offset++)
        {
            const int dim = 67;
            // Fresh allocations are 16-byte aligned; slicing by `offset`
            // misaligns the arrays on the second pass.
            T[] a = (new T[dim + offset])[offset .. dim + offset];
            T[] b = (new T[dim + offset])[offset .. dim + offset];
            T[] c = (new T[dim + offset])[offset .. dim + offset];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // Snapshot c into a, divide c in place, then compare
            // against a freshly computed quotient.
            a[] = c[];
            c[] /= 8;

            for (int k = 0; k < dim; k++)
            {
                if (c[k] != cast(T)(a[k] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
1638
1639
1640 /* ======================================================================== */
1641
1642 /***********************
1643 * Computes:
1644 * a[] -= b[] * value
1645 */
1646
T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
{
    // a[] -= b[] * value  is the same as  a[] += b[] * (-value),
    // so delegate to the add-assign implementation.
    T negated = -value;
    return _arraySliceExpMulSliceAddass_d(a, negated, b);
}
1651
1652 /***********************
1653 * Computes:
1654 * a[] += b[] * value
1655 */
1656
T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    // Fused multiply-add over the whole slice: a[i] += b[i] * value.
    // Scalar loop only — no vectorized path has been written for this
    // operation.
    for (size_t i = 0; i < a.length; i++)
        a[i] += b[i] * value;

    return a;
}
1675
unittest
{
    printf("_arraySliceExpMulSliceAddass_d unittest\n");

    // Exercise every CPU variant and both alignments, consistent with
    // the other unittests in this module.  (Previously only cpuid == 1
    // and the aligned case were tested.)
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j]; // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            { a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into b, accumulate into c in place, then check
            // each element against an independently computed value.
            b[] = c[];
            c[] += a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                {
                    // Report a definite mismatch (was "?=", which read as
                    // a query rather than a failure).
                    printf("[%d]: %g != %g + %g * 6\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}