comparison druntime/src/compiler/ldc/arraydouble.d @ 1458:e0b2d67cfe7c

Added druntime (this should be removed once it works).
author Robert Clipsham <robert@octarineparrot.com>
date Tue, 02 Jun 2009 17:43:06 +0100
1 /**
2 * Contains SSE2 and MMX versions of certain operations for double.
3 *
4 * Copyright: Copyright Digital Mars 2008 - 2009.
5 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
6 * Authors: Walter Bright, based on code originally written by Burton Radons
7 *
8 * Copyright Digital Mars 2008 - 2009.
9 * Distributed under the Boost Software License, Version 1.0.
10 * (See accompanying file LICENSE_1_0.txt or copy at
11 * http://www.boost.org/LICENSE_1_0.txt)
12 */
13 module rt.arraydouble;
14
15 private import rt.util.cpuid;
16
17 version (unittest)
18 {
19 private import core.stdc.stdio : printf;
20 /* This is so unit tests will test every CPU variant: each cpuid value
21 * enables at most one of the instruction-set predicates below, and cpuid == 0 forces the plain scalar path. */
22 int cpuid;
23 const int CPUID_MAX = 5;
24 bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); }
25 bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
26 bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
27 bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
28 }
29 else
30 {
31 alias rt.util.cpuid.mmx mmx;
32 alias rt.util.cpuid.sse sse;
33 alias rt.util.cpuid.sse2 sse2;
34 alias rt.util.cpuid.amd3dnow amd3dnow;
35 }
36
37 //version = log;
38
39 bool disjoint(T)(T[] a, T[] b)
40 {
41 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
42 }
43
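/* Editorial note: a minimal sketch (not part of the original changeset) showing
 * how disjoint() above behaves on slices of one buffer: slices that merely touch
 * end-to-end are disjoint, slices that share any element are not.
 */
unittest
{
    double[] buf = new double[8];
    assert( disjoint(buf[0 .. 4], buf[4 .. 8]));   // adjacent, no overlap
    assert(!disjoint(buf[0 .. 5], buf[4 .. 8]));   // element 4 is shared
}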
44 /* Performance figures measured by Burton Radons
45 */
46
47 alias double T;
48
49 extern (C):
50
51 /* ======================================================================== */
52
53 /***********************
54 * Computes:
55 * a[] = b[] + c[]
56 */
57
58 T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
59 in
60 {
61 assert(a.length == b.length && b.length == c.length);
62 assert(disjoint(a, b));
63 assert(disjoint(a, c));
64 assert(disjoint(b, c));
65 }
66 body
67 {
68 auto aptr = a.ptr;
69 auto aend = aptr + a.length;
70 auto bptr = b.ptr;
71 auto cptr = c.ptr;
72
73 version (D_InlineAsm_X86)
74 {
75 // SSE2 version is 333% faster
76 if (sse2() && b.length >= 16)
77 {
78 auto n = aptr + (b.length & ~15);
79
80 // Unaligned case
81 asm
82 {
83 mov EAX, bptr; // left operand
84 mov ECX, cptr; // right operand
85 mov ESI, aptr; // destination operand
86 mov EDI, n; // end comparison
87
88 align 8;
89 startsseloopb:
90 movupd XMM0, [EAX];
91 movupd XMM1, [EAX+16];
92 movupd XMM2, [EAX+32];
93 movupd XMM3, [EAX+48];
94 add EAX, 64;
95 movupd XMM4, [ECX];
96 movupd XMM5, [ECX+16];
97 movupd XMM6, [ECX+32];
98 movupd XMM7, [ECX+48];
99 add ESI, 64;
100 addpd XMM0, XMM4;
101 addpd XMM1, XMM5;
102 addpd XMM2, XMM6;
103 addpd XMM3, XMM7;
104 add ECX, 64;
105 movupd [ESI+ 0-64], XMM0;
106 movupd [ESI+16-64], XMM1;
107 movupd [ESI+32-64], XMM2;
108 movupd [ESI+48-64], XMM3;
109 cmp ESI, EDI;
110 jb startsseloopb;
111
112 mov aptr, ESI;
113 mov bptr, EAX;
114 mov cptr, ECX;
115 }
116 }
117 }
118
119 // Handle remainder
120 while (aptr < aend)
121 *aptr++ = *bptr++ + *cptr++;
122
123 return a;
124 }
125
126
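/* Editorial note: a small worked sketch (not part of the original changeset) of
 * the block bound computed above. Each asm iteration moves 64 bytes, i.e. eight
 * doubles, so n is aptr advanced by the largest multiple of 16 elements (here;
 * the later routines use multiples of 8) and the trailing while-loop handles the
 * remainder. For the dim = 67 arrays used in the unittests below:
 */
static assert((67 & ~15) == 64);            // 64 elements go through the SSE2 loop
static assert(64 * double.sizeof == 512);   // i.e. eight 64-byte iterations
static assert(67 - (67 & ~15) == 3);        // 3 elements left for the scalar loop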
127 unittest
128 {
129 printf("_arraySliceSliceAddSliceAssign_d unittest\n");
130 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
131 {
132 version (log) printf(" cpuid %d\n", cpuid);
133
134 for (int j = 0; j < 2; j++)
135 {
136 const int dim = 67;
137 T[] a = new T[dim + j]; // aligned on 16 byte boundary
138 a = a[j .. dim + j]; // misalign for second iteration
139 T[] b = new T[dim + j];
140 b = b[j .. dim + j];
141 T[] c = new T[dim + j];
142 c = c[j .. dim + j];
143
144 for (int i = 0; i < dim; i++)
145 { a[i] = cast(T)i;
146 b[i] = cast(T)(i + 7);
147 c[i] = cast(T)(i * 2);
148 }
149
150 c[] = a[] + b[];
151
152 for (int i = 0; i < dim; i++)
153 {
154 if (c[i] != cast(T)(a[i] + b[i]))
155 {
156 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
157 assert(0);
158 }
159 }
160 }
161 }
162 }
163
164 /* ======================================================================== */
165
166 /***********************
167 * Computes:
168 * a[] = b[] - c[]
169 */
170
171 T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
172 in
173 {
174 assert(a.length == b.length && b.length == c.length);
175 assert(disjoint(a, b));
176 assert(disjoint(a, c));
177 assert(disjoint(b, c));
178 }
179 body
180 {
181 auto aptr = a.ptr;
182 auto aend = aptr + a.length;
183 auto bptr = b.ptr;
184 auto cptr = c.ptr;
185
186 version (D_InlineAsm_X86)
187 {
188 // SSE2 version is 324% faster
189 if (sse2() && b.length >= 8)
190 {
191 auto n = aptr + (b.length & ~7);
192
193 // Unaligned case
194 asm
195 {
196 mov EAX, bptr; // left operand
197 mov ECX, cptr; // right operand
198 mov ESI, aptr; // destination operand
199 mov EDI, n; // end comparison
200
201 align 8;
202 startsseloopb:
203 movupd XMM0, [EAX];
204 movupd XMM1, [EAX+16];
205 movupd XMM2, [EAX+32];
206 movupd XMM3, [EAX+48];
207 add EAX, 64;
208 movupd XMM4, [ECX];
209 movupd XMM5, [ECX+16];
210 movupd XMM6, [ECX+32];
211 movupd XMM7, [ECX+48];
212 add ESI, 64;
213 subpd XMM0, XMM4;
214 subpd XMM1, XMM5;
215 subpd XMM2, XMM6;
216 subpd XMM3, XMM7;
217 add ECX, 64;
218 movupd [ESI+ 0-64], XMM0;
219 movupd [ESI+16-64], XMM1;
220 movupd [ESI+32-64], XMM2;
221 movupd [ESI+48-64], XMM3;
222 cmp ESI, EDI;
223 jb startsseloopb;
224
225 mov aptr, ESI;
226 mov bptr, EAX;
227 mov cptr, ECX;
228 }
229 }
230 }
231
232 // Handle remainder
233 while (aptr < aend)
234 *aptr++ = *bptr++ - *cptr++;
235
236 return a;
237 }
238
239
240 unittest
241 {
242 printf("_arraySliceSliceMinSliceAssign_d unittest\n");
243 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
244 {
245 version (log) printf(" cpuid %d\n", cpuid);
246
247 for (int j = 0; j < 2; j++)
248 {
249 const int dim = 67;
250 T[] a = new T[dim + j]; // aligned on 16 byte boundary
251 a = a[j .. dim + j]; // misalign for second iteration
252 T[] b = new T[dim + j];
253 b = b[j .. dim + j];
254 T[] c = new T[dim + j];
255 c = c[j .. dim + j];
256
257 for (int i = 0; i < dim; i++)
258 { a[i] = cast(T)i;
259 b[i] = cast(T)(i + 7);
260 c[i] = cast(T)(i * 2);
261 }
262
263 c[] = a[] - b[];
264
265 for (int i = 0; i < dim; i++)
266 {
267 if (c[i] != cast(T)(a[i] - b[i]))
268 {
269 printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
270 assert(0);
271 }
272 }
273 }
274 }
275 }
276
277
278 /* ======================================================================== */
279
280 /***********************
281 * Computes:
282 * a[] = b[] + value
283 */
284
285 T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
286 in
287 {
288 assert(a.length == b.length);
289 assert(disjoint(a, b));
290 }
291 body
292 {
293 //printf("_arraySliceExpAddSliceAssign_d()\n");
294 auto aptr = a.ptr;
295 auto aend = aptr + a.length;
296 auto bptr = b.ptr;
297
298 version (D_InlineAsm_X86)
299 {
300 // SSE2 version is 305% faster
301 if (sse2() && a.length >= 8)
302 {
303 auto n = aptr + (a.length & ~7);
304
305 // Unaligned case
306 asm
307 {
308 mov EAX, bptr;
309 mov ESI, aptr;
310 mov EDI, n;
311 movsd XMM4, value;
312 shufpd XMM4, XMM4, 0;
313
314 align 8;
315 startsseloop:
316 add ESI, 64;
317 movupd XMM0, [EAX];
318 movupd XMM1, [EAX+16];
319 movupd XMM2, [EAX+32];
320 movupd XMM3, [EAX+48];
321 add EAX, 64;
322 addpd XMM0, XMM4;
323 addpd XMM1, XMM4;
324 addpd XMM2, XMM4;
325 addpd XMM3, XMM4;
326 movupd [ESI+ 0-64], XMM0;
327 movupd [ESI+16-64], XMM1;
328 movupd [ESI+32-64], XMM2;
329 movupd [ESI+48-64], XMM3;
330 cmp ESI, EDI;
331 jb startsseloop;
332
333 mov aptr, ESI;
334 mov bptr, EAX;
335 }
336 }
337 }
338
339 while (aptr < aend)
340 *aptr++ = *bptr++ + value;
341
342 return a;
343 }
344
345 unittest
346 {
347 printf("_arraySliceExpAddSliceAssign_d unittest\n");
348 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
349 {
350 version (log) printf(" cpuid %d\n", cpuid);
351
352 for (int j = 0; j < 2; j++)
353 {
354 const int dim = 67;
355 T[] a = new T[dim + j]; // aligned on 16 byte boundary
356 a = a[j .. dim + j]; // misalign for second iteration
357 T[] b = new T[dim + j];
358 b = b[j .. dim + j];
359 T[] c = new T[dim + j];
360 c = c[j .. dim + j];
361
362 for (int i = 0; i < dim; i++)
363 { a[i] = cast(T)i;
364 b[i] = cast(T)(i + 7);
365 c[i] = cast(T)(i * 2);
366 }
367
368 c[] = a[] + 6;
369
370 for (int i = 0; i < dim; i++)
371 {
372 if (c[i] != cast(T)(a[i] + 6))
373 {
374 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
375 assert(0);
376 }
377 }
378 }
379 }
380 }
381
382 /* ======================================================================== */
383
384 /***********************
385 * Computes:
386 * a[] += value
387 */
388
389 T[] _arrayExpSliceAddass_d(T[] a, T value)
390 {
391 //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
392 auto aptr = a.ptr;
393 auto aend = aptr + a.length;
394
395 version (D_InlineAsm_X86)
396 {
397 // SSE2 version is 114% faster
398 if (sse2() && a.length >= 8)
399 {
400 auto n = aptr + (a.length & ~7);
401 if (aptr < n)
402
403 // Unaligned case
404 asm
405 {
406 mov ESI, aptr;
407 mov EDI, n;
408 movsd XMM4, value;
409 shufpd XMM4, XMM4, 0;
410
411 align 8;
412 startsseloopa:
413 movupd XMM0, [ESI];
414 movupd XMM1, [ESI+16];
415 movupd XMM2, [ESI+32];
416 movupd XMM3, [ESI+48];
417 add ESI, 64;
418 addpd XMM0, XMM4;
419 addpd XMM1, XMM4;
420 addpd XMM2, XMM4;
421 addpd XMM3, XMM4;
422 movupd [ESI+ 0-64], XMM0;
423 movupd [ESI+16-64], XMM1;
424 movupd [ESI+32-64], XMM2;
425 movupd [ESI+48-64], XMM3;
426 cmp ESI, EDI;
427 jb startsseloopa;
428
429 mov aptr, ESI;
430 }
431 }
432 }
433
434 while (aptr < aend)
435 *aptr++ += value;
436
437 return a;
438 }
439
440 unittest
441 {
442 printf("_arrayExpSliceAddass_d unittest\n");
443 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
444 {
445 version (log) printf(" cpuid %d\n", cpuid);
446
447 for (int j = 0; j < 2; j++)
448 {
449 const int dim = 67;
450 T[] a = new T[dim + j]; // aligned on 16 byte boundary
451 a = a[j .. dim + j]; // misalign for second iteration
452 T[] b = new T[dim + j];
453 b = b[j .. dim + j];
454 T[] c = new T[dim + j];
455 c = c[j .. dim + j];
456
457 for (int i = 0; i < dim; i++)
458 { a[i] = cast(T)i;
459 b[i] = cast(T)(i + 7);
460 c[i] = cast(T)(i * 2);
461 }
462
463 a[] = c[];
464 c[] += 6;
465
466 for (int i = 0; i < dim; i++)
467 {
468 if (c[i] != cast(T)(a[i] + 6))
469 {
470 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
471 assert(0);
472 }
473 }
474 }
475 }
476 }
477
478 /* ======================================================================== */
479
480 /***********************
481 * Computes:
482 * a[] += b[]
483 */
484
485 T[] _arraySliceSliceAddass_d(T[] a, T[] b)
486 in
487 {
488 assert (a.length == b.length);
489 assert (disjoint(a, b));
490 }
491 body
492 {
493 //printf("_arraySliceSliceAddass_d()\n");
494 auto aptr = a.ptr;
495 auto aend = aptr + a.length;
496 auto bptr = b.ptr;
497
498 version (D_InlineAsm_X86)
499 {
500 // SSE2 version is 183% faster
501 if (sse2() && a.length >= 8)
502 {
503 auto n = aptr + (a.length & ~7);
504
505 // Unaligned case
506 asm
507 {
508 mov ECX, bptr; // right operand
509 mov ESI, aptr; // destination operand
510 mov EDI, n; // end comparison
511
512 align 8;
513 startsseloopb:
514 movupd XMM0, [ESI];
515 movupd XMM1, [ESI+16];
516 movupd XMM2, [ESI+32];
517 movupd XMM3, [ESI+48];
518 add ESI, 64;
519 movupd XMM4, [ECX];
520 movupd XMM5, [ECX+16];
521 movupd XMM6, [ECX+32];
522 movupd XMM7, [ECX+48];
523 add ECX, 64;
524 addpd XMM0, XMM4;
525 addpd XMM1, XMM5;
526 addpd XMM2, XMM6;
527 addpd XMM3, XMM7;
528 movupd [ESI+ 0-64], XMM0;
529 movupd [ESI+16-64], XMM1;
530 movupd [ESI+32-64], XMM2;
531 movupd [ESI+48-64], XMM3;
532 cmp ESI, EDI;
533 jb startsseloopb;
534
535 mov aptr, ESI;
536 mov bptr, ECX;
537 }
538 }
539 }
540
541 while (aptr < aend)
542 *aptr++ += *bptr++;
543
544 return a;
545 }
546
547 unittest
548 {
549 printf("_arraySliceSliceAddass_d unittest\n");
550 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
551 {
552 version (log) printf(" cpuid %d\n", cpuid);
553
554 for (int j = 0; j < 2; j++)
555 {
556 const int dim = 67;
557 T[] a = new T[dim + j]; // aligned on 16 byte boundary
558 a = a[j .. dim + j]; // misalign for second iteration
559 T[] b = new T[dim + j];
560 b = b[j .. dim + j];
561 T[] c = new T[dim + j];
562 c = c[j .. dim + j];
563
564 for (int i = 0; i < dim; i++)
565 { a[i] = cast(T)i;
566 b[i] = cast(T)(i + 7);
567 c[i] = cast(T)(i * 2);
568 }
569
570 a[] = c[];
571 c[] += b[];
572
573 for (int i = 0; i < dim; i++)
574 {
575 if (c[i] != cast(T)(a[i] + b[i]))
576 {
577 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
578 assert(0);
579 }
580 }
581 }
582 }
583 }
584
585 /* ======================================================================== */
586
587 /***********************
588 * Computes:
589 * a[] = b[] - value
590 */
591
592 T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
593 in
594 {
595 assert (a.length == b.length);
596 assert (disjoint(a, b));
597 }
598 body
599 {
600 //printf("_arraySliceExpMinSliceAssign_d()\n");
601 auto aptr = a.ptr;
602 auto aend = aptr + a.length;
603 auto bptr = b.ptr;
604
605 version (D_InlineAsm_X86)
606 {
607 // SSE2 version is 305% faster
608 if (sse2() && a.length >= 8)
609 {
610 auto n = aptr + (a.length & ~7);
611
612 // Unaligned case
613 asm
614 {
615 mov EAX, bptr;
616 mov ESI, aptr;
617 mov EDI, n;
618 movsd XMM4, value;
619 shufpd XMM4, XMM4, 0;
620
621 align 8;
622 startsseloop:
623 add ESI, 64;
624 movupd XMM0, [EAX];
625 movupd XMM1, [EAX+16];
626 movupd XMM2, [EAX+32];
627 movupd XMM3, [EAX+48];
628 add EAX, 64;
629 subpd XMM0, XMM4;
630 subpd XMM1, XMM4;
631 subpd XMM2, XMM4;
632 subpd XMM3, XMM4;
633 movupd [ESI+ 0-64], XMM0;
634 movupd [ESI+16-64], XMM1;
635 movupd [ESI+32-64], XMM2;
636 movupd [ESI+48-64], XMM3;
637 cmp ESI, EDI;
638 jb startsseloop;
639
640 mov aptr, ESI;
641 mov bptr, EAX;
642 }
643 }
644 }
645
646 while (aptr < aend)
647 *aptr++ = *bptr++ - value;
648
649 return a;
650 }
651
652 unittest
653 {
654 printf("_arraySliceExpMinSliceAssign_d unittest\n");
655 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
656 {
657 version (log) printf(" cpuid %d\n", cpuid);
658
659 for (int j = 0; j < 2; j++)
660 {
661 const int dim = 67;
662 T[] a = new T[dim + j]; // aligned on 16 byte boundary
663 a = a[j .. dim + j]; // misalign for second iteration
664 T[] b = new T[dim + j];
665 b = b[j .. dim + j];
666 T[] c = new T[dim + j];
667 c = c[j .. dim + j];
668
669 for (int i = 0; i < dim; i++)
670 { a[i] = cast(T)i;
671 b[i] = cast(T)(i + 7);
672 c[i] = cast(T)(i * 2);
673 }
674
675 c[] = a[] - 6;
676
677 for (int i = 0; i < dim; i++)
678 {
679 if (c[i] != cast(T)(a[i] - 6))
680 {
681 printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
682 assert(0);
683 }
684 }
685 }
686 }
687 }
688
689 /* ======================================================================== */
690
691 /***********************
692 * Computes:
693 * a[] = value - b[]
694 */
695
696 T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
697 in
698 {
699 assert (a.length == b.length);
700 assert (disjoint(a, b));
701 }
702 body
703 {
704 //printf("_arrayExpSliceMinSliceAssign_d()\n");
705 auto aptr = a.ptr;
706 auto aend = aptr + a.length;
707 auto bptr = b.ptr;
708
709 version (D_InlineAsm_X86)
710 {
711 // SSE2 version is 66% faster
712 if (sse2() && a.length >= 8)
713 {
714 auto n = aptr + (a.length & ~7);
715
716 // Unaligned case
717 asm
718 {
719 mov EAX, bptr;
720 mov ESI, aptr;
721 mov EDI, n;
722 movsd XMM4, value;
723 shufpd XMM4, XMM4, 0;
724
725 align 8;
726 startsseloop:
727 add ESI, 64;
728 movapd XMM5, XMM4;
729 movapd XMM6, XMM4;
730 movupd XMM0, [EAX];
731 movupd XMM1, [EAX+16];
732 movupd XMM2, [EAX+32];
733 movupd XMM3, [EAX+48];
734 add EAX, 64;
735 subpd XMM5, XMM0;
736 subpd XMM6, XMM1;
737 movupd [ESI+ 0-64], XMM5;
738 movupd [ESI+16-64], XMM6;
739 movapd XMM5, XMM4;
740 movapd XMM6, XMM4;
741 subpd XMM5, XMM2;
742 subpd XMM6, XMM3;
743 movupd [ESI+32-64], XMM5;
744 movupd [ESI+48-64], XMM6;
745 cmp ESI, EDI;
746 jb startsseloop;
747
748 mov aptr, ESI;
749 mov bptr, EAX;
750 }
751 }
752 }
753
754 while (aptr < aend)
755 *aptr++ = value - *bptr++;
756
757 return a;
758 }
759
760 unittest
761 {
762 printf("_arrayExpSliceMinSliceAssign_d unittest\n");
763 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
764 {
765 version (log) printf(" cpuid %d\n", cpuid);
766
767 for (int j = 0; j < 2; j++)
768 {
769 const int dim = 67;
770 T[] a = new T[dim + j]; // aligned on 16 byte boundary
771 a = a[j .. dim + j]; // misalign for second iteration
772 T[] b = new T[dim + j];
773 b = b[j .. dim + j];
774 T[] c = new T[dim + j];
775 c = c[j .. dim + j];
776
777 for (int i = 0; i < dim; i++)
778 { a[i] = cast(T)i;
779 b[i] = cast(T)(i + 7);
780 c[i] = cast(T)(i * 2);
781 }
782
783 c[] = 6 - a[];
784
785 for (int i = 0; i < dim; i++)
786 {
787 if (c[i] != cast(T)(6 - a[i]))
788 {
789 printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
790 assert(0);
791 }
792 }
793 }
794 }
795 }
796
797 /* ======================================================================== */
798
799 /***********************
800 * Computes:
801 * a[] -= value
802 */
803
804 T[] _arrayExpSliceMinass_d(T[] a, T value)
805 {
806 //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
807 auto aptr = a.ptr;
808 auto aend = aptr + a.length;
809
810 version (D_InlineAsm_X86)
811 {
812 // SSE2 version is 115% faster
813 if (sse2() && a.length >= 8)
814 {
815 auto n = aptr + (a.length & ~7);
816 if (aptr < n)
817
818 // Unaligned case
819 asm
820 {
821 mov ESI, aptr;
822 mov EDI, n;
823 movsd XMM4, value;
824 shufpd XMM4, XMM4, 0;
825
826 align 8;
827 startsseloopa:
828 movupd XMM0, [ESI];
829 movupd XMM1, [ESI+16];
830 movupd XMM2, [ESI+32];
831 movupd XMM3, [ESI+48];
832 add ESI, 64;
833 subpd XMM0, XMM4;
834 subpd XMM1, XMM4;
835 subpd XMM2, XMM4;
836 subpd XMM3, XMM4;
837 movupd [ESI+ 0-64], XMM0;
838 movupd [ESI+16-64], XMM1;
839 movupd [ESI+32-64], XMM2;
840 movupd [ESI+48-64], XMM3;
841 cmp ESI, EDI;
842 jb startsseloopa;
843
844 mov aptr, ESI;
845 }
846 }
847 }
848
849 while (aptr < aend)
850 *aptr++ -= value;
851
852 return a;
853 }
854
855 unittest
856 {
857 printf("_arrayExpSliceMinass_d unittest\n");
858 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
859 {
860 version (log) printf(" cpuid %d\n", cpuid);
861
862 for (int j = 0; j < 2; j++)
863 {
864 const int dim = 67;
865 T[] a = new T[dim + j]; // aligned on 16 byte boundary
866 a = a[j .. dim + j]; // misalign for second iteration
867 T[] b = new T[dim + j];
868 b = b[j .. dim + j];
869 T[] c = new T[dim + j];
870 c = c[j .. dim + j];
871
872 for (int i = 0; i < dim; i++)
873 { a[i] = cast(T)i;
874 b[i] = cast(T)(i + 7);
875 c[i] = cast(T)(i * 2);
876 }
877
878 a[] = c[];
879 c[] -= 6;
880
881 for (int i = 0; i < dim; i++)
882 {
883 if (c[i] != cast(T)(a[i] - 6))
884 {
885 printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
886 assert(0);
887 }
888 }
889 }
890 }
891 }
892
893 /* ======================================================================== */
894
895 /***********************
896 * Computes:
897 * a[] -= b[]
898 */
899
900 T[] _arraySliceSliceMinass_d(T[] a, T[] b)
901 in
902 {
903 assert (a.length == b.length);
904 assert (disjoint(a, b));
905 }
906 body
907 {
908 //printf("_arraySliceSliceMinass_d()\n");
909 auto aptr = a.ptr;
910 auto aend = aptr + a.length;
911 auto bptr = b.ptr;
912
913 version (D_InlineAsm_X86)
914 {
915 // SSE2 version is 183% faster
916 if (sse2() && a.length >= 8)
917 {
918 auto n = aptr + (a.length & ~7);
919
920 // Unaligned case
921 asm
922 {
923 mov ECX, bptr; // right operand
924 mov ESI, aptr; // destination operand
925 mov EDI, n; // end comparison
926
927 align 8;
928 startsseloopb:
929 movupd XMM0, [ESI];
930 movupd XMM1, [ESI+16];
931 movupd XMM2, [ESI+32];
932 movupd XMM3, [ESI+48];
933 add ESI, 64;
934 movupd XMM4, [ECX];
935 movupd XMM5, [ECX+16];
936 movupd XMM6, [ECX+32];
937 movupd XMM7, [ECX+48];
938 add ECX, 64;
939 subpd XMM0, XMM4;
940 subpd XMM1, XMM5;
941 subpd XMM2, XMM6;
942 subpd XMM3, XMM7;
943 movupd [ESI+ 0-64], XMM0;
944 movupd [ESI+16-64], XMM1;
945 movupd [ESI+32-64], XMM2;
946 movupd [ESI+48-64], XMM3;
947 cmp ESI, EDI;
948 jb startsseloopb;
949
950 mov aptr, ESI;
951 mov bptr, ECX;
952 }
953 }
954 }
955
956 while (aptr < aend)
957 *aptr++ -= *bptr++;
958
959 return a;
960 }
961
962 unittest
963 {
964 printf("_arraySliceSliceMinass_d unittest\n");
965 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
966 {
967 version (log) printf(" cpuid %d\n", cpuid);
968
969 for (int j = 0; j < 2; j++)
970 {
971 const int dim = 67;
972 T[] a = new T[dim + j]; // aligned on 16 byte boundary
973 a = a[j .. dim + j]; // misalign for second iteration
974 T[] b = new T[dim + j];
975 b = b[j .. dim + j];
976 T[] c = new T[dim + j];
977 c = c[j .. dim + j];
978
979 for (int i = 0; i < dim; i++)
980 { a[i] = cast(T)i;
981 b[i] = cast(T)(i + 7);
982 c[i] = cast(T)(i * 2);
983 }
984
985 a[] = c[];
986 c[] -= b[];
987
988 for (int i = 0; i < dim; i++)
989 {
990 if (c[i] != cast(T)(a[i] - b[i]))
991 {
992 printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
993 assert(0);
994 }
995 }
996 }
997 }
998 }
999
1000 /* ======================================================================== */
1001
1002 /***********************
1003 * Computes:
1004 * a[] = b[] * value
1005 */
1006
1007 T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
1008 in
1009 {
1010 assert(a.length == b.length);
1011 assert(disjoint(a, b));
1012 }
1013 body
1014 {
1015 //printf("_arraySliceExpMulSliceAssign_d()\n");
1016 auto aptr = a.ptr;
1017 auto aend = aptr + a.length;
1018 auto bptr = b.ptr;
1019
1020 version (D_InlineAsm_X86)
1021 {
1022 // SSE2 version is 304% faster
1023 if (sse2() && a.length >= 8)
1024 {
1025 auto n = aptr + (a.length & ~7);
1026
1027 // Unaligned case
1028 asm
1029 {
1030 mov EAX, bptr;
1031 mov ESI, aptr;
1032 mov EDI, n;
1033 movsd XMM4, value;
1034 shufpd XMM4, XMM4, 0;
1035
1036 align 8;
1037 startsseloop:
1038 add ESI, 64;
1039 movupd XMM0, [EAX];
1040 movupd XMM1, [EAX+16];
1041 movupd XMM2, [EAX+32];
1042 movupd XMM3, [EAX+48];
1043 add EAX, 64;
1044 mulpd XMM0, XMM4;
1045 mulpd XMM1, XMM4;
1046 mulpd XMM2, XMM4;
1047 mulpd XMM3, XMM4;
1048 movupd [ESI+ 0-64], XMM0;
1049 movupd [ESI+16-64], XMM1;
1050 movupd [ESI+32-64], XMM2;
1051 movupd [ESI+48-64], XMM3;
1052 cmp ESI, EDI;
1053 jb startsseloop;
1054
1055 mov aptr, ESI;
1056 mov bptr, EAX;
1057 }
1058 }
1059 }
1060
1061 while (aptr < aend)
1062 *aptr++ = *bptr++ * value;
1063
1064 return a;
1065 }
1066
1067 unittest
1068 {
1069 printf("_arraySliceExpMulSliceAssign_d unittest\n");
1070 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1071 {
1072 version (log) printf(" cpuid %d\n", cpuid);
1073
1074 for (int j = 0; j < 2; j++)
1075 {
1076 const int dim = 67;
1077 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1078 a = a[j .. dim + j]; // misalign for second iteration
1079 T[] b = new T[dim + j];
1080 b = b[j .. dim + j];
1081 T[] c = new T[dim + j];
1082 c = c[j .. dim + j];
1083
1084 for (int i = 0; i < dim; i++)
1085 { a[i] = cast(T)i;
1086 b[i] = cast(T)(i + 7);
1087 c[i] = cast(T)(i * 2);
1088 }
1089
1090 c[] = a[] * 6;
1091
1092 for (int i = 0; i < dim; i++)
1093 {
1094 if (c[i] != cast(T)(a[i] * 6))
1095 {
1096 printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1097 assert(0);
1098 }
1099 }
1100 }
1101 }
1102 }
1103
1104 /* ======================================================================== */
1105
1106 /***********************
1107 * Computes:
1108 * a[] = b[] * c[]
1109 */
1110
1111 T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
1112 in
1113 {
1114 assert(a.length == b.length && b.length == c.length);
1115 assert(disjoint(a, b));
1116 assert(disjoint(a, c));
1117 assert(disjoint(b, c));
1118 }
1119 body
1120 {
1121 //printf("_arraySliceSliceMulSliceAssign_d()\n");
1122 auto aptr = a.ptr;
1123 auto aend = aptr + a.length;
1124 auto bptr = b.ptr;
1125 auto cptr = c.ptr;
1126
1127 version (D_InlineAsm_X86)
1128 {
1129 // SSE2 version is 329% faster
1130 if (sse2() && a.length >= 8)
1131 {
1132 auto n = aptr + (a.length & ~7);
1133
1134 // Unaligned case
1135 asm
1136 {
1137 mov EAX, bptr; // left operand
1138 mov ECX, cptr; // right operand
1139 mov ESI, aptr; // destination operand
1140 mov EDI, n; // end comparison
1141
1142 align 8;
1143 startsseloopb:
1144 movupd XMM0, [EAX];
1145 movupd XMM1, [EAX+16];
1146 movupd XMM2, [EAX+32];
1147 movupd XMM3, [EAX+48];
1148 add ESI, 64;
1149 movupd XMM4, [ECX];
1150 movupd XMM5, [ECX+16];
1151 movupd XMM6, [ECX+32];
1152 movupd XMM7, [ECX+48];
1153 add EAX, 64;
1154 mulpd XMM0, XMM4;
1155 mulpd XMM1, XMM5;
1156 mulpd XMM2, XMM6;
1157 mulpd XMM3, XMM7;
1158 add ECX, 64;
1159 movupd [ESI+ 0-64], XMM0;
1160 movupd [ESI+16-64], XMM1;
1161 movupd [ESI+32-64], XMM2;
1162 movupd [ESI+48-64], XMM3;
1163 cmp ESI, EDI;
1164 jb startsseloopb;
1165
1166 mov aptr, ESI;
1167 mov bptr, EAX;
1168 mov cptr, ECX;
1169 }
1170 }
1171 }
1172
1173 while (aptr < aend)
1174 *aptr++ = *bptr++ * *cptr++;
1175
1176 return a;
1177 }
1178
1179 unittest
1180 {
1181 printf("_arraySliceSliceMulSliceAssign_d unittest\n");
1182 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1183 {
1184 version (log) printf(" cpuid %d\n", cpuid);
1185
1186 for (int j = 0; j < 2; j++)
1187 {
1188 const int dim = 67;
1189 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1190 a = a[j .. dim + j]; // misalign for second iteration
1191 T[] b = new T[dim + j];
1192 b = b[j .. dim + j];
1193 T[] c = new T[dim + j];
1194 c = c[j .. dim + j];
1195
1196 for (int i = 0; i < dim; i++)
1197 { a[i] = cast(T)i;
1198 b[i] = cast(T)(i + 7);
1199 c[i] = cast(T)(i * 2);
1200 }
1201
1202 c[] = a[] * b[];
1203
1204 for (int i = 0; i < dim; i++)
1205 {
1206 if (c[i] != cast(T)(a[i] * b[i]))
1207 {
1208 printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
1209 assert(0);
1210 }
1211 }
1212 }
1213 }
1214 }
1215
1216 /* ======================================================================== */
1217
1218 /***********************
1219 * Computes:
1220 * a[] *= value
1221 */
1222
1223 T[] _arrayExpSliceMulass_d(T[] a, T value)
1224 {
1225 //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1226 auto aptr = a.ptr;
1227 auto aend = aptr + a.length;
1228
1229 version (D_InlineAsm_X86)
1230 {
1231 // SSE2 version is 109% faster
1232 if (sse2() && a.length >= 8)
1233 {
1234 auto n = aptr + (a.length & ~7);
1235 if (aptr < n)
1236
1237 // Unaligned case
1238 asm
1239 {
1240 mov ESI, aptr;
1241 mov EDI, n;
1242 movsd XMM4, value;
1243 shufpd XMM4, XMM4, 0;
1244
1245 align 8;
1246 startsseloopa:
1247 movupd XMM0, [ESI];
1248 movupd XMM1, [ESI+16];
1249 movupd XMM2, [ESI+32];
1250 movupd XMM3, [ESI+48];
1251 add ESI, 64;
1252 mulpd XMM0, XMM4;
1253 mulpd XMM1, XMM4;
1254 mulpd XMM2, XMM4;
1255 mulpd XMM3, XMM4;
1256 movupd [ESI+ 0-64], XMM0;
1257 movupd [ESI+16-64], XMM1;
1258 movupd [ESI+32-64], XMM2;
1259 movupd [ESI+48-64], XMM3;
1260 cmp ESI, EDI;
1261 jb startsseloopa;
1262
1263 mov aptr, ESI;
1264 }
1265 }
1266 }
1267
1268 while (aptr < aend)
1269 *aptr++ *= value;
1270
1271 return a;
1272 }
1273
1274 unittest
1275 {
1276 printf("_arrayExpSliceMulass_d unittest\n");
1277 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1278 {
1279 version (log) printf(" cpuid %d\n", cpuid);
1280
1281 for (int j = 0; j < 2; j++)
1282 {
1283 const int dim = 67;
1284 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1285 a = a[j .. dim + j]; // misalign for second iteration
1286 T[] b = new T[dim + j];
1287 b = b[j .. dim + j];
1288 T[] c = new T[dim + j];
1289 c = c[j .. dim + j];
1290
1291 for (int i = 0; i < dim; i++)
1292 { a[i] = cast(T)i;
1293 b[i] = cast(T)(i + 7);
1294 c[i] = cast(T)(i * 2);
1295 }
1296
1297 a[] = c[];
1298 c[] *= 6;
1299
1300 for (int i = 0; i < dim; i++)
1301 {
1302 if (c[i] != cast(T)(a[i] * 6))
1303 {
1304 printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1305 assert(0);
1306 }
1307 }
1308 }
1309 }
1310 }
1311
1312 /* ======================================================================== */
1313
1314 /***********************
1315 * Computes:
1316 * a[] *= b[]
1317 */
1318
1319 T[] _arraySliceSliceMulass_d(T[] a, T[] b)
1320 in
1321 {
1322 assert (a.length == b.length);
1323 assert (disjoint(a, b));
1324 }
1325 body
1326 {
1327 //printf("_arraySliceSliceMulass_d()\n");
1328 auto aptr = a.ptr;
1329 auto aend = aptr + a.length;
1330 auto bptr = b.ptr;
1331
1332 version (D_InlineAsm_X86)
1333 {
1334 // SSE2 version is 205% faster
1335 if (sse2() && a.length >= 8)
1336 {
1337 auto n = aptr + (a.length & ~7);
1338
1339 // Unaligned case
1340 asm
1341 {
1342 mov ECX, bptr; // right operand
1343 mov ESI, aptr; // destination operand
1344 mov EDI, n; // end comparison
1345
1346 align 8;
1347 startsseloopb:
1348 movupd XMM0, [ESI];
1349 movupd XMM1, [ESI+16];
1350 movupd XMM2, [ESI+32];
1351 movupd XMM3, [ESI+48];
1352 add ESI, 64;
1353 movupd XMM4, [ECX];
1354 movupd XMM5, [ECX+16];
1355 movupd XMM6, [ECX+32];
1356 movupd XMM7, [ECX+48];
1357 add ECX, 64;
1358 mulpd XMM0, XMM4;
1359 mulpd XMM1, XMM5;
1360 mulpd XMM2, XMM6;
1361 mulpd XMM3, XMM7;
1362 movupd [ESI+ 0-64], XMM0;
1363 movupd [ESI+16-64], XMM1;
1364 movupd [ESI+32-64], XMM2;
1365 movupd [ESI+48-64], XMM3;
1366 cmp ESI, EDI;
1367 jb startsseloopb;
1368
1369 mov aptr, ESI;
1370 mov bptr, ECX;
1371 }
1372 }
1373 }
1374
1375 while (aptr < aend)
1376 *aptr++ *= *bptr++;
1377
1378 return a;
1379 }
1380
1381 unittest
1382 {
1383 printf("_arraySliceSliceMulass_d unittest\n");
1384 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1385 {
1386 version (log) printf(" cpuid %d\n", cpuid);
1387
1388 for (int j = 0; j < 2; j++)
1389 {
1390 const int dim = 67;
1391 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1392 a = a[j .. dim + j]; // misalign for second iteration
1393 T[] b = new T[dim + j];
1394 b = b[j .. dim + j];
1395 T[] c = new T[dim + j];
1396 c = c[j .. dim + j];
1397
1398 for (int i = 0; i < dim; i++)
1399 { a[i] = cast(T)i;
1400 b[i] = cast(T)(i + 7);
1401 c[i] = cast(T)(i * 2);
1402 }
1403
1404 a[] = c[];
1405 c[] *= b[];
1406
1407 for (int i = 0; i < dim; i++)
1408 {
1409 if (c[i] != cast(T)(a[i] * b[i]))
1410 {
1411 printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
1412 assert(0);
1413 }
1414 }
1415 }
1416 }
1417 }
1418
1419 /* ======================================================================== */
1420
1421 /***********************
1422 * Computes:
1423 * a[] = b[] / value
1424 */
1425
1426 T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
1427 in
1428 {
1429 assert(a.length == b.length);
1430 assert(disjoint(a, b));
1431 }
1432 body
1433 {
1434 //printf("_arraySliceExpDivSliceAssign_d()\n");
1435 auto aptr = a.ptr;
1436 auto aend = aptr + a.length;
1437 auto bptr = b.ptr;
1438
1439 /* Multiplying by the reciprocal is faster, but does
1440 * not produce as accurate an answer (see the editorial note after this function).
1441 */
1442 T recip = cast(T)1 / value;
1443
1444 version (D_InlineAsm_X86)
1445 {
1446 // SSE2 version is 299% faster
1447 if (sse2() && a.length >= 8)
1448 {
1449 auto n = aptr + (a.length & ~7);
1450
1451 // Unaligned case
1452 asm
1453 {
1454 mov EAX, bptr;
1455 mov ESI, aptr;
1456 mov EDI, n;
1457 movsd XMM4, recip;
1458 //movsd XMM4, value
1459 //rcpsd XMM4, XMM4
1460 shufpd XMM4, XMM4, 0;
1461
1462 align 8;
1463 startsseloop:
1464 add ESI, 64;
1465 movupd XMM0, [EAX];
1466 movupd XMM1, [EAX+16];
1467 movupd XMM2, [EAX+32];
1468 movupd XMM3, [EAX+48];
1469 add EAX, 64;
1470 mulpd XMM0, XMM4;
1471 mulpd XMM1, XMM4;
1472 mulpd XMM2, XMM4;
1473 mulpd XMM3, XMM4;
1474 //divpd XMM0, XMM4;
1475 //divpd XMM1, XMM4;
1476 //divpd XMM2, XMM4;
1477 //divpd XMM3, XMM4;
1478 movupd [ESI+ 0-64], XMM0;
1479 movupd [ESI+16-64], XMM1;
1480 movupd [ESI+32-64], XMM2;
1481 movupd [ESI+48-64], XMM3;
1482 cmp ESI, EDI;
1483 jb startsseloop;
1484
1485 mov aptr, ESI;
1486 mov bptr, EAX;
1487 }
1488 }
1489 }
1490
1491 while (aptr < aend)
1492 {
1493 *aptr++ = *bptr++ / value;
1494 //*aptr++ = *bptr++ * recip;
1495 }
1496
1497 return a;
1498 }
1499
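/* Editorial note: a minimal sketch (not part of the original changeset) of the
 * accuracy caveat above. The reciprocal is rounded once and the multiply rounds
 * again, so b[i] * recip can differ from the correctly rounded b[i] / value by
 * one unit in the last place; for many inputs the two still agree exactly.
 */
unittest
{
    double value = 49.0;
    double recip = 1.0 / value;   // rounded to the nearest double
    double x     = 1000.0;
    double byDiv = x / value;     // single correctly rounded operation
    double byMul = x * recip;     // two roundings; may be 1 ulp away from byDiv
    printf("x/value = %a, x*recip = %a\n", byDiv, byMul);  // %a shows exact bits
}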
1500 unittest
1501 {
1502 printf("_arraySliceExpDivSliceAssign_d unittest\n");
1503 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1504 {
1505 version (log) printf(" cpuid %d\n", cpuid);
1506
1507 for (int j = 0; j < 2; j++)
1508 {
1509 const int dim = 67;
1510 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1511 a = a[j .. dim + j]; // misalign for second iteration
1512 T[] b = new T[dim + j];
1513 b = b[j .. dim + j];
1514 T[] c = new T[dim + j];
1515 c = c[j .. dim + j];
1516
1517 for (int i = 0; i < dim; i++)
1518 { a[i] = cast(T)i;
1519 b[i] = cast(T)(i + 7);
1520 c[i] = cast(T)(i * 2);
1521 }
1522
1523 c[] = a[] / 8;
1524
1525 for (int i = 0; i < dim; i++)
1526 {
1527 //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
1528 if (c[i] != cast(T)(a[i] / 8))
1529 {
1530 printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
1531 assert(0);
1532 }
1533 }
1534 }
1535 }
1536 }
1537
1538 /* ======================================================================== */
1539
1540 /***********************
1541 * Computes:
1542 * a[] /= value
1543 */
1544
1545 T[] _arrayExpSliceDivass_d(T[] a, T value)
1546 {
1547 //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1548 auto aptr = a.ptr;
1549 auto aend = aptr + a.length;
1550
1551 /* Multiplying by the reciprocal is faster, but does
1552 * not produce as accurate an answer.
1553 */
1554 T recip = cast(T)1 / value;
1555
1556 version (D_InlineAsm_X86)
1557 {
1558 // SSE2 version is 65% faster
1559 if (sse2() && a.length >= 8)
1560 {
1561 auto n = aptr + (a.length & ~7);
1562
1563 // Unaligned case
1564 asm
1565 {
1566 mov ESI, aptr;
1567 mov EDI, n;
1568 movsd XMM4, recip;
1569 //movsd XMM4, value
1570 //rcpsd XMM4, XMM4
1571 shufpd XMM4, XMM4, 0;
1572
1573 align 8;
1574 startsseloopa:
1575 movupd XMM0, [ESI];
1576 movupd XMM1, [ESI+16];
1577 movupd XMM2, [ESI+32];
1578 movupd XMM3, [ESI+48];
1579 add ESI, 64;
1580 mulpd XMM0, XMM4;
1581 mulpd XMM1, XMM4;
1582 mulpd XMM2, XMM4;
1583 mulpd XMM3, XMM4;
1584 //divpd XMM0, XMM4;
1585 //divpd XMM1, XMM4;
1586 //divpd XMM2, XMM4;
1587 //divpd XMM3, XMM4;
1588 movupd [ESI+ 0-64], XMM0;
1589 movupd [ESI+16-64], XMM1;
1590 movupd [ESI+32-64], XMM2;
1591 movupd [ESI+48-64], XMM3;
1592 cmp ESI, EDI;
1593 jb startsseloopa;
1594
1595 mov aptr, ESI;
1596 }
1597 }
1598 }
1599
1600 while (aptr < aend)
1601 *aptr++ *= recip;
1602
1603 return a;
1604 }
1605
1606
1607 unittest
1608 {
1609 printf("_arrayExpSliceDivass_d unittest\n");
1610 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1611 {
1612 version (log) printf(" cpuid %d\n", cpuid);
1613
1614 for (int j = 0; j < 2; j++)
1615 {
1616 const int dim = 67;
1617 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1618 a = a[j .. dim + j]; // misalign for second iteration
1619 T[] b = new T[dim + j];
1620 b = b[j .. dim + j];
1621 T[] c = new T[dim + j];
1622 c = c[j .. dim + j];
1623
1624 for (int i = 0; i < dim; i++)
1625 { a[i] = cast(T)i;
1626 b[i] = cast(T)(i + 7);
1627 c[i] = cast(T)(i * 2);
1628 }
1629
1630 a[] = c[];
1631 c[] /= 8;
1632
1633 for (int i = 0; i < dim; i++)
1634 {
1635 if (c[i] != cast(T)(a[i] / 8))
1636 {
1637 printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
1638 assert(0);
1639 }
1640 }
1641 }
1642 }
1643 }
1644
1645
1646 /* ======================================================================== */
1647
1648 /***********************
1649 * Computes:
1650 * a[] -= b[] * value
1651 */
1652
1653 T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
1654 {
1655 return _arraySliceExpMulSliceAddass_d(a, -value, b);
1656 }
1657
1658 /***********************
1659 * Computes:
1660 * a[] += b[] * value
1661 */
1662
1663 T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
1664 in
1665 {
1666 assert(a.length == b.length);
1667 assert(disjoint(a, b));
1668 }
1669 body
1670 {
1671 auto aptr = a.ptr;
1672 auto aend = aptr + a.length;
1673 auto bptr = b.ptr;
1674
1675 // Handle remainder
1676 while (aptr < aend)
1677 *aptr++ += *bptr++ * value;
1678
1679 return a;
1680 }
1681
1682 unittest
1683 {
1684 printf("_arraySliceExpMulSliceAddass_d unittest\n");
1685
1686 cpuid = 1;
1687 {
1688 version (log) printf(" cpuid %d\n", cpuid);
1689
1690 for (int j = 0; j < 2; j++)
1691 {
1692 const int dim = 67;
1693 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1694 a = a[j .. dim + j]; // misalign for second iteration
1695 T[] b = new T[dim + j];
1696 b = b[j .. dim + j];
1697 T[] c = new T[dim + j];
1698 c = c[j .. dim + j];
1699
1700 for (int i = 0; i < dim; i++)
1701 { a[i] = cast(T)i;
1702 b[i] = cast(T)(i + 7);
1703 c[i] = cast(T)(i * 2);
1704 }
1705
1706 b[] = c[];
1707 c[] += a[] * 6;
1708
1709 for (int i = 0; i < dim; i++)
1710 {
1711 //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
1712 if (c[i] != cast(T)(b[i] + a[i] * 6))
1713 {
1714 printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
1715 assert(0);
1716 }
1717 }
1718 }
1719 }
1720 }