comparison druntime/src/compiler/dmd/arrayfloat.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Tue, 11 Nov 2008 01:52:37 +0100
parents
children
comparison
equal deleted inserted replaced
758:f04dde6e882c 759:d3eb054172f9
1 /***************************
2 * D programming language http://www.digitalmars.com/d/
3 * Runtime support for float array operations.
4 * Based on code originally written by Burton Radons.
5 * Placed in public domain.
6 */
7
8 module rt.arrayfloat;
9
10 private import util.cpuid;
11
version (Unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    // cpuid selects which single capability reports true for this pass:
    // 0 = plain scalar path, 1 = MMX, 2 = SSE, 3 = SSE2, 4 = 3DNow!.
    // The real hardware query must also succeed, so unsupported variants
    // are silently skipped on machines that lack them.
    int cpuid;
    const int CPUID_MAX = 5;
    bool mmx() { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse() { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2() { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    // Normal builds: forward directly to the real CPU capability queries.
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}
30
31 //version = log;
32
/**
 * Returns true when slices a and b share no memory:
 * one slice must end at or before the other begins.
 */
bool disjoint(T)(T[] a, T[] b)
{
    auto aEnd = a.ptr + a.length;
    auto bEnd = b.ptr + b.length;
    return aEnd <= b.ptr || bEnd <= a.ptr;
}
37
alias float T;  // element type every routine in this module operates on

extern (C):    // unmangled C linkage for the runtime helper functions below
41
42 /* ======================================================================== */
43
44 /***********************
45 * Computes:
46 * a[] = b[] + c[]
47 */
48
T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    // All three slices must be the same length and must not overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 834% faster
        if (sse() && b.length >= 16)
        {
            version (log) printf("\tsse unaligned\n");
            // Vector loop handles 16 floats (4 XMM regs x 4 lanes) per
            // iteration; n marks the end of the vectorizable prefix.
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ESI, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM5;
                addps XMM2, XMM6;
                addps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Publish the advanced pointers so the scalar tail loop
                // resumes where the vector loop stopped.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)
        {
            version (log) printf("\tamd3dnow\n");
            // Vector loop handles 8 floats (4 MMX regs x 2 lanes) per iteration.
            auto n = aptr + (b.length & ~7);

            asm
            {
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand

                align 4;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfadd MM0, [ECX];
                pfadd MM1, [ECX+8];
                pfadd MM2, [ECX+16];
                pfadd MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms; // leave MMX state so subsequent FPU code works
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}
159
160
unittest
{
    printf("_arraySliceSliceAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned slices; pass 1 misaligns by one float.
        foreach (off; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + off])[off .. dim + off];
            T[] b = (new T[dim + off])[off .. dim + off];
            T[] c = (new T[dim + off])[off .. dim + off];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
197
198 /* ======================================================================== */
199
200 /***********************
201 * Computes:
202 * a[] = b[] - c[]
203 */
204
T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    // All three slices must be the same length and must not overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 834% faster
        if (sse() && b.length >= 16)
        {
            // Vector loop handles 16 floats per iteration; n marks the end
            // of the vectorizable prefix.
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ESI, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM5;
                subps XMM2, XMM6;
                subps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)
        {
            auto n = aptr + (b.length & ~7);

            asm
            {
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand

                align 4;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsub MM0, [ECX];
                pfsub MM1, [ECX+8];
                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms; // leave MMX state so subsequent FPU code works
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}
311
312
unittest
{
    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    // Fixed format string: was "%gd", a malformed specifier
                    // that printed a stray 'd' after the second value.
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
349
350 /* ======================================================================== */
351
352 /***********************
353 * Computes:
354 * a[] = b[] + value
355 */
356
T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 665% faster
        if (sse() && a.length >= 16)
        {
            // Vector loop handles 16 floats per iteration.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM4;
                addps XMM2, XMM4;
                addps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Replicate value's bit pattern into both 32-bit halves of a
            // 64-bit operand for the MMX register.
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfadd MM0, MM4;
                pfadd MM1, MM4;
                pfadd MM2, MM4;
                pfadd MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}
456
unittest
{
    printf("_arraySliceExpAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned slices; pass 1 misaligns by one float.
        foreach (off; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + off])[off .. dim + off];
            T[] b = (new T[dim + off])[off .. dim + off];
            T[] c = (new T[dim + off])[off .. dim + off];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
493
494 /* ======================================================================== */
495
496 /***********************
497 * Computes:
498 * a[] += value
499 */
500
T[] _arrayExpSliceAddass_f(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 302% faster
        if (sse() && a.length >= 16)
        {
            // align pointer
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ += value;
            n = cast(T*)((cast(uint)aend) & ~15);
            if (aptr < n)

            // Aligned case
            // NOTE(review): the loop below advances 64 bytes per iteration
            // but n is only 16-byte aligned, so when (n - aptr) is not a
            // multiple of 64 the final iteration stores up to 48 bytes past
            // n (and potentially past aend) — confirm this cannot happen or
            // that the overrun is benign.
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM4;
                addps XMM2, XMM4;
                addps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Replicate value's bit pattern into both halves of a 64-bit operand.
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfadd MM0, MM4;
                pfadd MM1, MM4;
                pfadd MM2, MM4;
                pfadd MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
592
unittest
{
    printf("_arrayExpSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned slices; pass 1 misaligns by one float.
        foreach (off; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + off])[off .. dim + off];
            T[] b = (new T[dim + off])[off .. dim + off];
            T[] c = (new T[dim + off])[off .. dim + off];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Keep the pre-increment values in a, then add in place.
            a[] = c[];
            c[] += 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
630
631 /* ======================================================================== */
632
633 /***********************
634 * Computes:
635 * a[] += b[]
636 */
637
T[] _arraySliceSliceAddass_f(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 468% faster
        if (sse() && a.length >= 16)
        {
            // Vector loop handles 16 floats per iteration.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM5;
                addps XMM2, XMM6;
                addps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfadd MM0, [ECX];
                pfadd MM1, [ECX+8];
                pfadd MM2, [ECX+16];
                pfadd MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
735
unittest
{
    printf("_arraySliceSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned slices; pass 1 misaligns by one float.
        foreach (off; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + off])[off .. dim + off];
            T[] b = (new T[dim + off])[off .. dim + off];
            T[] c = (new T[dim + off])[off .. dim + off];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Keep the pre-increment values in a, then add in place.
            a[] = c[];
            c[] += b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
773
774 /* ======================================================================== */
775
776 /***********************
777 * Computes:
778 * a[] = b[] - value
779 */
780
T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 622% faster
        if (sse() && a.length >= 16)
        {
            // Vector loop handles 16 floats per iteration.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM4;
                subps XMM2, XMM4;
                subps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            T[2] w;

            w[0] = w[1] = value; // replicate value into a 64-bit MMX operand

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsub MM0, MM4;
                pfsub MM1, MM4;
                pfsub MM2, MM4;
                pfsub MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}
881
unittest
{
    printf("_arraySliceExpMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned slices; pass 1 misaligns by one float.
        foreach (off; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + off])[off .. dim + off];
            T[] b = (new T[dim + off])[off .. dim + off];
            T[] c = (new T[dim + off])[off .. dim + off];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
918
919 /* ======================================================================== */
920
921 /***********************
922 * Computes:
923 * a[] = value - b[]
924 */
925
T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 690% faster
        if (sse() && a.length >= 16)
        {
            // Vector loop handles 16 floats per iteration.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloop:
                add ESI, 64;
                // Subtraction is reversed (value - b[]): copy the broadcast
                // value and subtract the loaded elements from the copies.
                movaps XMM5, XMM4;
                movaps XMM6, XMM4;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                subps XMM5, XMM0;
                subps XMM6, XMM1;
                movups [ESI+ 0-64], XMM5;
                movups [ESI+16-64], XMM6;
                movaps XMM5, XMM4;
                movaps XMM6, XMM4;
                subps XMM5, XMM2;
                subps XMM6, XMM3;
                movups [ESI+32-64], XMM5;
                movups [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Replicate value's bit pattern into both halves of a 64-bit operand.
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsubr MM0, MM4; // reversed subtract for value - b[]
                pfsubr MM1, MM4;
                pfsubr MM2, MM4;
                pfsubr MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}
1029
unittest
{
    printf("_arrayExpSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned slices; pass 1 misaligns by one float.
        foreach (off; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + off])[off .. dim + off];
            T[] b = (new T[dim + off])[off .. dim + off];
            T[] c = (new T[dim + off])[off .. dim + off];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1066
1067 /* ======================================================================== */
1068
1069 /***********************
1070 * Computes:
1071 * a[] -= value
1072 */
1073
T[] _arrayExpSliceMinass_f(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 304% faster
        if (sse() && a.length >= 16)
        {
            // align pointer
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ -= value;
            n = cast(T*)((cast(uint)aend) & ~15);
            if (aptr < n)

            // Aligned case
            // NOTE(review): the loop below advances 64 bytes per iteration
            // but n is only 16-byte aligned, so when (n - aptr) is not a
            // multiple of 64 the final iteration stores up to 48 bytes past
            // n (and potentially past aend) — confirm this cannot happen or
            // that the overrun is benign.
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM4;
                subps XMM2, XMM4;
                subps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Replicate value's bit pattern into both halves of a 64-bit operand.
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfsub MM0, MM4;
                pfsub MM1, MM4;
                pfsub MM2, MM4;
                pfsub MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
1165
unittest
{
    // Fixed log message: was "_arrayExpSliceminass_f" (lowercase 'm'),
    // inconsistent with the function name and every other unittest banner.
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1203
1204 /* ======================================================================== */
1205
1206 /***********************
1207 * Computes:
1208 * a[] -= b[]
1209 */
1210
T[] _arraySliceSliceMinass_f(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 468% faster
        if (sse() && a.length >= 16)
        {
            // Vector loop handles 16 floats per iteration.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM5;
                subps XMM2, XMM6;
                subps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfsub MM0, [ECX];
                pfsub MM1, [ECX+8];
                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
1308
unittest
{
    // Fixed banner: previously printed "_arrayExpSliceMinass_f", the name
    // of a different function.
    printf("_arraySliceSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            // Fixed: previously did "c[] -= 6", which exercises the
            // slice-expression path instead of _arraySliceSliceMinass_f;
            // the slice -= slice operation was never actually tested.
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1346
1347 /* ======================================================================== */
1348
1349 /***********************
1350 * Computes:
1351 * a[] = b[] * value
1352 */
1353
T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 607% faster
        if (sse() && a.length >= 16)
        {
            // Vector loop handles 16 floats per iteration.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Replicate value's bit pattern into both halves of a 64-bit operand.
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}
1453
unittest
{
    printf("_arraySliceExpMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned slices; pass 1 misaligns by one float.
        foreach (off; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + off])[off .. dim + off];
            T[] b = (new T[dim + off])[off .. dim + off];
            T[] c = (new T[dim + off])[off .. dim + off];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1490
1491 /* ======================================================================== */
1492
1493 /***********************
1494 * Computes:
1495 * a[] = b[] * c[]
1496 */
1497
T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    // All three slices must be the same length and must not overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 833% faster
        if (sse() && a.length >= 16)
        {
            // Vector loop handles 16 floats per iteration.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM6;
                mulps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov EAX, dword ptr [bptr]; // left operand
                mov ECX, dword ptr [cptr]; // right operand

                align 4;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, [ECX];
                pfmul MM1, [ECX+8];
                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}
1604
unittest
{
    printf("_arraySliceSliceMulSliceAssign_f unittest\n");
    // Cover every CPU dispatch variant of the Unittest build.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0: 16-byte-aligned slices; pass 1: shifted by one element
        // to hit the unaligned paths.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1641
1642 /* ======================================================================== */
1643
1644 /***********************
1645 * Computes:
1646 * a[] *= value
1647 */
1648
T[] _arrayExpSliceMulass_f(T[] a, T value)
{
    // In-place a[i] *= value over the whole slice.
    //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element

    version (D_InlineAsm_X86)
    {
        // SSE version is 303% faster
        if (sse() && a.length >= 16)
        {
            // align pointer: scalar-process elements until aptr sits on a
            // 16-byte boundary so the loop below may use movaps.
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= value;
            // BUGFIX: the asm loop advances 64 bytes per iteration, so its
            // end pointer must be a whole number of 64-byte chunks past the
            // aligned aptr.  The previous computation,
            //     n = cast(T*)((cast(uint)aend) & ~15);
            // only quantized to 16 bytes, letting the final iteration store
            // up to 48 bytes past the array whenever the aligned span was
            // not a multiple of 64 bytes.
            n = cast(T*)(cast(uint)aptr + ((cast(uint)aend - cast(uint)aptr) & ~63));
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;   // broadcast value into all 4 lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 8 floats (32 bytes) per iteration; multiple of 8 elements,
            // so the 32-byte stride cannot overrun n.
            auto n = aptr + (a.length & ~7);

            // Duplicate the 32-bit float pattern into both halves of a
            // 64-bit quantity for the MM4 multiplier.
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms; // exit MMX state before returning to FPU code
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Scalar tail: remainder elements, or everything when no vector
    // path was taken.
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
1740
unittest
{
    printf("_arrayExpSliceMulass_f unittest\n");
    // Cover every CPU dispatch variant of the Unittest build.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0: aligned slices; pass 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into a, then scale c in place.
            a[] = c[];
            c[] *= 6;

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1778
1779 /* ======================================================================== */
1780
1781 /***********************
1782 * Computes:
1783 * a[] *= b[]
1784 */
1785
T[] _arraySliceSliceMulass_f(T[] a, T[] b)
in
{
    // Equal lengths, no overlap: the vector loops read and write in
    // multi-element chunks.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    // In-place element-wise a[i] *= b[i].
    //printf("_arraySliceSliceMulass_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 525% faster
        if (sse() && a.length >= 16)
        {
            // Largest multiple of 16 elements (64 bytes); scalar tail
            // below finishes the rest.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                // 16 floats per iteration, 4 per XMM register; movups
                // tolerates unaligned pointers.
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM6;
                mulps XMM3, XMM7;
                // ESI was advanced early, hence the -64 bias on the stores.
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 3DNow! path: 8 floats (32 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n]; // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, [ECX];
                pfmul MM1, [ECX+8];
                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start;

                emms; // exit MMX state before returning to FPU code
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
            }
        }
    }

    // Scalar tail: remainder elements, or everything when no vector
    // path was taken.
    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
1883
unittest
{
    // BUGFIX: this unittest was a copy-paste of the _arrayExpSliceMulass_f
    // test: it printed the wrong name and exercised "c[] *= 6" instead of
    // the slice-slice operation "c[] *= b[]" implemented by
    // _arraySliceSliceMulass_f, leaving that function untested.
    printf("_arraySliceSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0: aligned slices; pass 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into a, then multiply c by b element-wise.
            a[] = c[];
            c[] *= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1921
1922 /* ======================================================================== */
1923
1924 /***********************
1925 * Computes:
1926 * a[] = b[] / value
1927 */
1928
T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    // Element-wise a[i] = b[i] / value, implemented as multiplication
    // by the reciprocal (see note below).
    //printf("_arraySliceExpDivSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last destination element
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE version is 587% faster
        if (sse() && a.length >= 16)
        {
            // Largest multiple of 16 elements (64 bytes); scalar tail
            // below finishes the rest.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, recip;
                //movss XMM4, value
                //rcpss XMM4, XMM4
                shufps XMM4, XMM4, 0;   // broadcast recip into all 4 lanes

                align 8;
            startsseloop:
                // ESI is advanced early, hence the -64 bias on the stores.
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                //divps XMM0, XMM4;
                //divps XMM1, XMM4;
                //divps XMM2, XMM4;
                //divps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Publish the advanced pointers for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 72% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 3DNow! path: 8 floats (32 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            // Duplicate recip into both halves of a 64-bit MMX operand.
            T[2] w = void;

            w[0] = recip;
            w[1] = recip;

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                align 8;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms; // exit MMX state before returning to FPU code
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
            }
        }
    }

    // Scalar tail: remainder elements, or everything when no vector
    // path was taken.
    while (aptr < aend)
        *aptr++ = *bptr++ * recip;

    return a;
}
2041
unittest
{
    printf("_arraySliceExpDivSliceAssign_f unittest\n");
    // Cover every CPU dispatch variant of the Unittest build.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0: aligned slices; pass 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Divisor 8 is a power of two, so reciprocal multiplication
            // is exact and the comparison below can be strict.
            c[] = a[] / 8;

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
2078
2079 /* ======================================================================== */
2080
2081 /***********************
2082 * Computes:
2083 * a[] /= value
2084 */
2085
T[] _arrayExpSliceDivass_f(T[] a, T value)
{
    // In-place a[i] /= value, implemented as multiplication by the
    // reciprocal (see note below).
    //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE version is 245% faster
        if (sse() && a.length >= 16)
        {
            // align pointer: scalar-process elements until aptr sits on a
            // 16-byte boundary so the loop below may use movaps.
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= recip;
            // BUGFIX: the asm loop advances 64 bytes per iteration, so its
            // end pointer must be a whole number of 64-byte chunks past the
            // aligned aptr.  The previous computation,
            //     n = cast(T*)((cast(uint)aend) & ~15);
            // only quantized to 16 bytes, letting the final iteration store
            // up to 48 bytes past the array whenever the aligned span was
            // not a multiple of 64 bytes.
            n = cast(T*)(cast(uint)aptr + ((cast(uint)aend - cast(uint)aptr) & ~63));
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, recip;
                //movss XMM4, value
                //rcpss XMM4, XMM4
                shufps XMM4, XMM4, 0;   // broadcast recip into all 4 lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                //divps XMM0, XMM4;
                //divps XMM1, XMM4;
                //divps XMM2, XMM4;
                //divps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 8 floats (32 bytes) per iteration; multiple of 8 elements,
            // so the 32-byte stride cannot overrun n.
            auto n = aptr + (a.length & ~7);

            // Duplicate recip into both halves of a 64-bit MMX operand.
            T[2] w = void;

            w[0] = w[1] = recip;

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [w];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms; // exit MMX state before returning to FPU code
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Scalar tail: remainder elements, or everything when no vector
    // path was taken.
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}
2189
unittest
{
    printf("_arrayExpSliceDivass_f unittest\n");
    // Cover every CPU dispatch variant of the Unittest build.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0: aligned slices; pass 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into a, then divide c in place.  Divisor 8 is a
            // power of two, so the reciprocal path is exact.
            a[] = c[];
            c[] /= 8;

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
2227
2228
2229 /* ======================================================================== */
2230
2231 /***********************
2232 * Computes:
2233 * a[] -= b[] * value
2234 */
2235
T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
{
    // a[] -= b[] * value is equivalent to a[] += b[] * (-value);
    // forward to the add-assign implementation with the sign flipped.
    return _arraySliceExpMulSliceAddass_f(a, -value, b);
}
2240
2241 /***********************
2242 * Computes:
2243 * a[] += b[] * value
2244 */
2245
T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    // Fused multiply-add over the whole slice: a[i] += b[i] * value.
    // No vectorized fast path here yet; a plain scalar loop does it all.
    auto len = a.length;
    auto dst = a.ptr;
    auto src = b.ptr;

    for (size_t i = 0; i < len; i++)
        dst[i] += src[i] * value;

    return a;
}
2264
unittest
{
    printf("_arraySliceExpMulSliceAddass_f unittest\n");

    // Only one CPU variant is exercised here (the implementation has no
    // CPU-specific paths), and only the aligned slice layout.
    cpuid = 1;
    {
        version (log) printf(" cpuid %d\n", cpuid);

        foreach (int j; 0 .. 1)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into b, then accumulate a[] * 6 into c.
            b[] = c[];
            c[] += a[] * 6;

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                {
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}