comparison druntime/src/compiler/dmd/arrayfloat.d @ 1458:e0b2d67cfe7c

Added druntime (this should be removed once it works).
author Robert Clipsham <robert@octarineparrot.com>
date Tue, 02 Jun 2009 17:43:06 +0100
parents
children
comparison
equal deleted inserted replaced
1456:7b218ec1044f 1458:e0b2d67cfe7c
1 /**
2 * Contains SSE2 and MMX versions of certain operations for float.
3 *
4 * Copyright: Copyright Digital Mars 2008 - 2009.
5 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
6 * Authors: Walter Bright, based on code originally written by Burton Radons
7 *
8 * Copyright Digital Mars 2008 - 2009.
9 * Distributed under the Boost Software License, Version 1.0.
10 * (See accompanying file LICENSE_1_0.txt or copy at
11 * http://www.boost.org/LICENSE_1_0.txt)
12 */
13 module rt.arrayfloat;
14
15 private import rt.util.cpuid;
16
17 version (unittest)
18 {
19 private import core.stdc.stdio : printf;
20 /* This is so unit tests will test every CPU variant
21 */
22 int cpuid; // variant under test: 0 = plain scalar loop, 1 = MMX, 2 = SSE, 3 = SSE2, 4 = 3DNow!
23 const int CPUID_MAX = 5; // unit tests sweep cpuid over [0, CPUID_MAX)
24 bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); } // only report a feature when it is both selected and actually present
25 bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
26 bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
27 bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
28 }
29 else
30 {
31 alias rt.util.cpuid.mmx mmx; // production build: query the real CPU feature flags directly
32 alias rt.util.cpuid.sse sse;
33 alias rt.util.cpuid.sse2 sse2;
34 alias rt.util.cpuid.amd3dnow amd3dnow;
35 }
36
37 //version = log;
38
/* True when slices a and b share no memory (one ends at or before the
 * other begins). Used in the `in` contracts below: the vector loops
 * assume non-overlapping operands.
 */
39 bool disjoint(T)(T[] a, T[] b)
40 {
41 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
42 }
43
44 alias float T; // every routine in this module operates on float elements
45
46 extern (C): // these are runtime entry points the compiler emits calls to; no D name mangling
47
48 /* ======================================================================== */
49
50 /***********************
51 * Computes:
52 * a[] = b[] + c[]
53 */
54
/* a[] = b[] + c[]. Note the parameter order (a, c, b) is how the compiler
 * pushes the operands; the remainder loop below shows the actual mapping.
 * Returns a.
 */
55 T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
56 in
57 {
58 assert(a.length == b.length && b.length == c.length);
59 assert(disjoint(a, b));
60 assert(disjoint(a, c));
61 assert(disjoint(b, c));
62 }
63 body
64 {
65 //printf("_arraySliceSliceAddSliceAssign_f()\n");
66 auto aptr = a.ptr;
67 auto aend = aptr + a.length;
68 auto bptr = b.ptr;
69 auto cptr = c.ptr;
70
71 version (D_InlineAsm_X86)
72 {
73 // SSE version is 834% faster
74 if (sse() && b.length >= 16)
75 {
76 version (log) printf("\tsse unaligned\n");
77 auto n = aptr + (b.length & ~15); // multiple of 16 floats = 64 bytes, matches one loop iteration exactly
78
79 // Unaligned case
80 asm
81 {
82 mov EAX, bptr; // left operand
83 mov ECX, cptr; // right operand
84 mov ESI, aptr; // destination operand
85 mov EDI, n; // end comparison
86
87 align 8;
88 startsseloopb:
89 movups XMM0, [EAX];
90 movups XMM1, [EAX+16];
91 movups XMM2, [EAX+32];
92 movups XMM3, [EAX+48];
93 add EAX, 64;
94 movups XMM4, [ECX];
95 movups XMM5, [ECX+16];
96 movups XMM6, [ECX+32];
97 movups XMM7, [ECX+48];
98 add ESI, 64; // advance dest early; stores below compensate with -64 offsets
99 addps XMM0, XMM4;
100 addps XMM1, XMM5;
101 addps XMM2, XMM6;
102 addps XMM3, XMM7;
103 add ECX, 64;
104 movups [ESI+ 0-64], XMM0;
105 movups [ESI+16-64], XMM1;
106 movups [ESI+32-64], XMM2;
107 movups [ESI+48-64], XMM3;
108 cmp ESI, EDI;
109 jb startsseloopb;
110
111 mov aptr, ESI; // publish advanced pointers for the scalar tail loop
112 mov bptr, EAX;
113 mov cptr, ECX;
114 }
115 }
116 else
117 // 3DNow! version is only 13% faster
118 if (amd3dnow() && b.length >= 8)
119 {
120 version (log) printf("\tamd3dnow\n");
121 auto n = aptr + (b.length & ~7); // multiple of 8 floats = 32 bytes per iteration
122
123 asm
124 {
125 mov ESI, aptr; // destination operand
126 mov EDI, n; // end comparison
127 mov EAX, bptr; // left operand
128 mov ECX, cptr; // right operand
129
130 align 4;
131 start3dnow:
132 movq MM0, [EAX];
133 movq MM1, [EAX+8];
134 movq MM2, [EAX+16];
135 movq MM3, [EAX+24];
136 pfadd MM0, [ECX];
137 pfadd MM1, [ECX+8];
138 pfadd MM2, [ECX+16];
139 pfadd MM3, [ECX+24];
140 movq [ESI], MM0;
141 movq [ESI+8], MM1;
142 movq [ESI+16], MM2;
143 movq [ESI+24], MM3;
144 add ECX, 32;
145 add ESI, 32;
146 add EAX, 32;
147 cmp ESI, EDI;
148 jb start3dnow;
149
150 emms; // clear MMX state so subsequent x87 FPU code works
151 mov aptr, ESI;
152 mov bptr, EAX;
153 mov cptr, ECX;
154 }
155 }
156 }
157
158 // Handle remainder
159 version (log) if (aptr < aend) printf("\tbase\n");
160 while (aptr < aend)
161 *aptr++ = *bptr++ + *cptr++; // scalar tail for elements the vector loop skipped
162
163 return a;
164 }
165
166
167 unittest
168 {
169 printf("_arraySliceSliceAddSliceAssign_f unittest\n");
170 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) // exercise every CPU-variant branch
171 {
172 version (log) printf(" cpuid %d\n", cpuid);
173
174 for (int j = 0; j < 2; j++)
175 {
176 const int dim = 67;
177 T[] a = new T[dim + j]; // aligned on 16 byte boundary
178 a = a[j .. dim + j]; // misalign for second iteration
179 T[] b = new T[dim + j];
180 b = b[j .. dim + j];
181 T[] c = new T[dim + j];
182 c = c[j .. dim + j];
183
184 for (int i = 0; i < dim; i++)
185 { a[i] = cast(T)i;
186 b[i] = cast(T)(i + 7);
187 c[i] = cast(T)(i * 2);
188 }
189
190 c[] = a[] + b[]; // lowers to _arraySliceSliceAddSliceAssign_f
191
192 for (int i = 0; i < dim; i++)
193 {
194 if (c[i] != cast(T)(a[i] + b[i]))
195 {
196 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
197 assert(0);
198 }
199 }
200 }
201 }
202 }
203
204 /* ======================================================================== */
205
206 /***********************
207 * Computes:
208 * a[] = b[] - c[]
209 */
210
/* a[] = b[] - c[]. Parameter order (a, c, b) mirrors the compiler's
 * argument push order; see the scalar tail for the operand mapping.
 * Returns a.
 */
211 T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
212 in
213 {
214 assert(a.length == b.length && b.length == c.length);
215 assert(disjoint(a, b));
216 assert(disjoint(a, c));
217 assert(disjoint(b, c));
218 }
219 body
220 {
221 auto aptr = a.ptr;
222 auto aend = aptr + a.length;
223 auto bptr = b.ptr;
224 auto cptr = c.ptr;
225
226 version (D_InlineAsm_X86)
227 {
228 // SSE version is 834% faster
229 if (sse() && b.length >= 16)
230 {
231 auto n = aptr + (b.length & ~15); // multiple of 16 floats = one 64-byte iteration
232
233 // Unaligned case
234 asm
235 {
236 mov EAX, bptr; // left operand
237 mov ECX, cptr; // right operand
238 mov ESI, aptr; // destination operand
239 mov EDI, n; // end comparison
240
241 align 8;
242 startsseloopb:
243 movups XMM0, [EAX];
244 movups XMM1, [EAX+16];
245 movups XMM2, [EAX+32];
246 movups XMM3, [EAX+48];
247 add EAX, 64;
248 movups XMM4, [ECX];
249 movups XMM5, [ECX+16];
250 movups XMM6, [ECX+32];
251 movups XMM7, [ECX+48];
252 add ESI, 64; // dest advanced early; stores use -64 offsets
253 subps XMM0, XMM4;
254 subps XMM1, XMM5;
255 subps XMM2, XMM6;
256 subps XMM3, XMM7;
257 add ECX, 64;
258 movups [ESI+ 0-64], XMM0;
259 movups [ESI+16-64], XMM1;
260 movups [ESI+32-64], XMM2;
261 movups [ESI+48-64], XMM3;
262 cmp ESI, EDI;
263 jb startsseloopb;
264
265 mov aptr, ESI; // publish pointers for the scalar tail
266 mov bptr, EAX;
267 mov cptr, ECX;
268 }
269 }
270 else
271 // 3DNow! version is only 13% faster
272 if (amd3dnow() && b.length >= 8)
273 {
274 auto n = aptr + (b.length & ~7); // multiple of 8 floats = 32 bytes per iteration
275
276 asm
277 {
278 mov ESI, aptr; // destination operand
279 mov EDI, n; // end comparison
280 mov EAX, bptr; // left operand
281 mov ECX, cptr; // right operand
282
283 align 4;
284 start3dnow:
285 movq MM0, [EAX];
286 movq MM1, [EAX+8];
287 movq MM2, [EAX+16];
288 movq MM3, [EAX+24];
289 pfsub MM0, [ECX];
290 pfsub MM1, [ECX+8];
291 pfsub MM2, [ECX+16];
292 pfsub MM3, [ECX+24];
293 movq [ESI], MM0;
294 movq [ESI+8], MM1;
295 movq [ESI+16], MM2;
296 movq [ESI+24], MM3;
297 add ECX, 32;
298 add ESI, 32;
299 add EAX, 32;
300 cmp ESI, EDI;
301 jb start3dnow;
302
303 emms; // clear MMX state before returning to FPU code
304 mov aptr, ESI;
305 mov bptr, EAX;
306 mov cptr, ECX;
307 }
308 }
309 }
310
311 // Handle remainder
312 while (aptr < aend)
313 *aptr++ = *bptr++ - *cptr++; // scalar tail
314
315 return a;
316 }
317
318
unittest
{
    // Verifies a[] = b[] - c[] (_arraySliceSliceMinSliceAssign_f) across
    // every CPU variant and both aligned/misaligned slices.
    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[]; // lowers to _arraySliceSliceMinSliceAssign_f

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    // fixed: format string previously read "%gd" for the second operand
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
355
356 /* ======================================================================== */
357
358 /***********************
359 * Computes:
360 * a[] = b[] + value
361 */
362
/* a[] = b[] + value. Returns a. */
363 T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
364 in
365 {
366 assert(a.length == b.length);
367 assert(disjoint(a, b));
368 }
369 body
370 {
371 //printf("_arraySliceExpAddSliceAssign_f()\n");
372 auto aptr = a.ptr;
373 auto aend = aptr + a.length;
374 auto bptr = b.ptr;
375
376 version (D_InlineAsm_X86)
377 {
378 // SSE version is 665% faster
379 if (sse() && a.length >= 16)
380 {
381 auto n = aptr + (a.length & ~15); // multiple of 16 floats = one 64-byte iteration
382
383 // Unaligned case
384 asm
385 {
386 mov EAX, bptr;
387 mov ESI, aptr;
388 mov EDI, n;
389 movss XMM4, value;
390 shufps XMM4, XMM4, 0; // broadcast value to all four lanes of XMM4
391
392 align 8;
393 startsseloop:
394 add ESI, 64; // dest advanced early; stores below use -64 offsets
395 movups XMM0, [EAX];
396 movups XMM1, [EAX+16];
397 movups XMM2, [EAX+32];
398 movups XMM3, [EAX+48];
399 add EAX, 64;
400 addps XMM0, XMM4;
401 addps XMM1, XMM4;
402 addps XMM2, XMM4;
403 addps XMM3, XMM4;
404 movups [ESI+ 0-64], XMM0;
405 movups [ESI+16-64], XMM1;
406 movups [ESI+32-64], XMM2;
407 movups [ESI+48-64], XMM3;
408 cmp ESI, EDI;
409 jb startsseloop;
410
411 mov aptr, ESI; // publish pointers for the scalar tail
412 mov bptr, EAX;
413 }
414 }
415 else
416 // 3DNow! version is 69% faster
417 if (amd3dnow() && a.length >= 8)
418 {
419 auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration
420
421 ulong w = *cast(uint *) &value;
422 ulong v = w | (w << 32L); // replicate float bit pattern into both MM4 halves
423
424 asm
425 {
426 mov ESI, aptr;
427 mov EDI, n;
428 mov EAX, bptr;
429 movq MM4, qword ptr [v];
430
431 align 8;
432 start3dnow:
433 movq MM0, [EAX];
434 movq MM1, [EAX+8];
435 movq MM2, [EAX+16];
436 movq MM3, [EAX+24];
437 pfadd MM0, MM4;
438 pfadd MM1, MM4;
439 pfadd MM2, MM4;
440 pfadd MM3, MM4;
441 movq [ESI], MM0;
442 movq [ESI+8], MM1;
443 movq [ESI+16], MM2;
444 movq [ESI+24], MM3;
445 add ESI, 32;
446 add EAX, 32;
447 cmp ESI, EDI;
448 jb start3dnow;
449
450 emms; // clear MMX state before returning to FPU code
451 mov aptr, ESI;
452 mov bptr, EAX;
453 }
454 }
455 }
456
457 while (aptr < aend)
458 *aptr++ = *bptr++ + value; // scalar tail
459
460 return a;
461 }
462
463 unittest
464 {
465 printf("_arraySliceExpAddSliceAssign_f unittest\n");
466 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) // exercise every CPU-variant branch
467 {
468 version (log) printf(" cpuid %d\n", cpuid);
469
470 for (int j = 0; j < 2; j++)
471 {
472 const int dim = 67;
473 T[] a = new T[dim + j]; // aligned on 16 byte boundary
474 a = a[j .. dim + j]; // misalign for second iteration
475 T[] b = new T[dim + j];
476 b = b[j .. dim + j];
477 T[] c = new T[dim + j];
478 c = c[j .. dim + j];
479
480 for (int i = 0; i < dim; i++)
481 { a[i] = cast(T)i;
482 b[i] = cast(T)(i + 7);
483 c[i] = cast(T)(i * 2);
484 }
485
486 c[] = a[] + 6; // lowers to _arraySliceExpAddSliceAssign_f
487
488 for (int i = 0; i < dim; i++)
489 {
490 if (c[i] != cast(T)(a[i] + 6))
491 {
492 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
493 assert(0);
494 }
495 }
496 }
497 }
498 }
499
500 /* ======================================================================== */
501
502 /***********************
503 * Computes:
504 * a[] += value
505 */
506
/* a[] += value. Returns a.
 *
 * Fix: the aligned SSE loop consumes 64 bytes (16 floats) per iteration,
 * but the old end pointer `n = aend & ~15` was only rounded to a 16-byte
 * boundary. Whenever the aligned span was not a multiple of 64 bytes the
 * final iteration stored up to 48 bytes past `n` — and therefore past the
 * end of the array. `n` is now a multiple of 16 elements from the aligned
 * `aptr`, so the loop terminates exactly and the scalar tail handles the
 * rest.
 */
T[] _arrayExpSliceAddass_f(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 302% faster
        if (sse() && a.length >= 16)
        {
            // align pointer to a 16-byte boundary (at most 3 scalar elements;
            // safe because a.length >= 16)
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ += value;
            // end pointer: a multiple of 16 elements (64 bytes) past the
            // aligned aptr, so the 64-byte loop below cannot overrun
            n = aptr + ((aend - aptr) & ~15);
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM4;
                addps XMM2, XMM4;
                addps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L); // replicate float bits into both MM4 halves

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfadd MM0, MM4;
                pfadd MM1, MM4;
                pfadd MM2, MM4;
                pfadd MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms; // clear MMX state before returning to FPU code
                mov dword ptr [aptr], ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value; // scalar tail

    return a;
}
598
599 unittest
600 {
601 printf("_arrayExpSliceAddass_f unittest\n");
602 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) // exercise every CPU-variant branch
603 {
604 version (log) printf(" cpuid %d\n", cpuid);
605
606 for (int j = 0; j < 2; j++)
607 {
608 const int dim = 67;
609 T[] a = new T[dim + j]; // aligned on 16 byte boundary
610 a = a[j .. dim + j]; // misalign for second iteration
611 T[] b = new T[dim + j];
612 b = b[j .. dim + j];
613 T[] c = new T[dim + j];
614 c = c[j .. dim + j];
615
616 for (int i = 0; i < dim; i++)
617 { a[i] = cast(T)i;
618 b[i] = cast(T)(i + 7);
619 c[i] = cast(T)(i * 2);
620 }
621
622 a[] = c[]; // snapshot c so the in-place result can be checked
623 c[] += 6; // lowers to _arrayExpSliceAddass_f
624
625 for (int i = 0; i < dim; i++)
626 {
627 if (c[i] != cast(T)(a[i] + 6))
628 {
629 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
630 assert(0);
631 }
632 }
633 }
634 }
635 }
636
637 /* ======================================================================== */
638
639 /***********************
640 * Computes:
641 * a[] += b[]
642 */
643
/* a[] += b[]. Returns a. */
644 T[] _arraySliceSliceAddass_f(T[] a, T[] b)
645 in
646 {
647 assert (a.length == b.length);
648 assert (disjoint(a, b));
649 }
650 body
651 {
652 //printf("_arraySliceSliceAddass_f()\n");
653 auto aptr = a.ptr;
654 auto aend = aptr + a.length;
655 auto bptr = b.ptr;
656
657 version (D_InlineAsm_X86)
658 {
659 // SSE version is 468% faster
660 if (sse() && a.length >= 16)
661 {
662 auto n = aptr + (a.length & ~15); // multiple of 16 floats = one 64-byte iteration
663
664 // Unaligned case
665 asm
666 {
667 mov ECX, bptr; // right operand
668 mov ESI, aptr; // destination operand
669 mov EDI, n; // end comparison
670
671 align 8;
672 startsseloopb:
673 movups XMM0, [ESI];
674 movups XMM1, [ESI+16];
675 movups XMM2, [ESI+32];
676 movups XMM3, [ESI+48];
677 add ESI, 64; // dest advanced early; stores below use -64 offsets
678 movups XMM4, [ECX];
679 movups XMM5, [ECX+16];
680 movups XMM6, [ECX+32];
681 movups XMM7, [ECX+48];
682 add ECX, 64;
683 addps XMM0, XMM4;
684 addps XMM1, XMM5;
685 addps XMM2, XMM6;
686 addps XMM3, XMM7;
687 movups [ESI+ 0-64], XMM0;
688 movups [ESI+16-64], XMM1;
689 movups [ESI+32-64], XMM2;
690 movups [ESI+48-64], XMM3;
691 cmp ESI, EDI;
692 jb startsseloopb;
693
694 mov aptr, ESI; // publish pointers for the scalar tail
695 mov bptr, ECX;
696 }
697 }
698 else
699 // 3DNow! version is 57% faster
700 if (amd3dnow() && a.length >= 8)
701 {
702 auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration
703
704 asm
705 {
706 mov ESI, dword ptr [aptr]; // destination operand
707 mov EDI, dword ptr [n]; // end comparison
708 mov ECX, dword ptr [bptr]; // right operand
709
710 align 4;
711 start3dnow:
712 movq MM0, [ESI];
713 movq MM1, [ESI+8];
714 movq MM2, [ESI+16];
715 movq MM3, [ESI+24];
716 pfadd MM0, [ECX];
717 pfadd MM1, [ECX+8];
718 pfadd MM2, [ECX+16];
719 pfadd MM3, [ECX+24];
720 movq [ESI], MM0;
721 movq [ESI+8], MM1;
722 movq [ESI+16], MM2;
723 movq [ESI+24], MM3;
724 add ESI, 32;
725 add ECX, 32;
726 cmp ESI, EDI;
727 jb start3dnow;
728
729 emms; // clear MMX state before returning to FPU code
730 mov dword ptr [aptr], ESI;
731 mov dword ptr [bptr], ECX;
732 }
733 }
734 }
735
736 while (aptr < aend)
737 *aptr++ += *bptr++; // scalar tail
738
739 return a;
740 }
741
742 unittest
743 {
744 printf("_arraySliceSliceAddass_f unittest\n");
745 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) // exercise every CPU-variant branch
746 {
747 version (log) printf(" cpuid %d\n", cpuid);
748
749 for (int j = 0; j < 2; j++)
750 {
751 const int dim = 67;
752 T[] a = new T[dim + j]; // aligned on 16 byte boundary
753 a = a[j .. dim + j]; // misalign for second iteration
754 T[] b = new T[dim + j];
755 b = b[j .. dim + j];
756 T[] c = new T[dim + j];
757 c = c[j .. dim + j];
758
759 for (int i = 0; i < dim; i++)
760 { a[i] = cast(T)i;
761 b[i] = cast(T)(i + 7);
762 c[i] = cast(T)(i * 2);
763 }
764
765 a[] = c[]; // snapshot c so the in-place result can be checked
766 c[] += b[]; // lowers to _arraySliceSliceAddass_f
767
768 for (int i = 0; i < dim; i++)
769 {
770 if (c[i] != cast(T)(a[i] + b[i]))
771 {
772 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
773 assert(0);
774 }
775 }
776 }
777 }
778 }
779
780 /* ======================================================================== */
781
782 /***********************
783 * Computes:
784 * a[] = b[] - value
785 */
786
/* a[] = b[] - value. Returns a. */
787 T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
788 in
789 {
790 assert (a.length == b.length);
791 assert (disjoint(a, b));
792 }
793 body
794 {
795 //printf("_arraySliceExpMinSliceAssign_f()\n");
796 auto aptr = a.ptr;
797 auto aend = aptr + a.length;
798 auto bptr = b.ptr;
799
800 version (D_InlineAsm_X86)
801 {
802 // SSE version is 622% faster
803 if (sse() && a.length >= 16)
804 {
805 auto n = aptr + (a.length & ~15); // multiple of 16 floats = one 64-byte iteration
806
807 // Unaligned case
808 asm
809 {
810 mov EAX, bptr;
811 mov ESI, aptr;
812 mov EDI, n;
813 movss XMM4, value;
814 shufps XMM4, XMM4, 0; // broadcast value to all four lanes
815
816 align 8;
817 startsseloop:
818 add ESI, 64; // dest advanced early; stores below use -64 offsets
819 movups XMM0, [EAX];
820 movups XMM1, [EAX+16];
821 movups XMM2, [EAX+32];
822 movups XMM3, [EAX+48];
823 add EAX, 64;
824 subps XMM0, XMM4;
825 subps XMM1, XMM4;
826 subps XMM2, XMM4;
827 subps XMM3, XMM4;
828 movups [ESI+ 0-64], XMM0;
829 movups [ESI+16-64], XMM1;
830 movups [ESI+32-64], XMM2;
831 movups [ESI+48-64], XMM3;
832 cmp ESI, EDI;
833 jb startsseloop;
834
835 mov aptr, ESI; // publish pointers for the scalar tail
836 mov bptr, EAX;
837 }
838 }
839 else
840 // 3DNow! version is 67% faster
841 if (amd3dnow() && a.length >= 8)
842 {
843 auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration
844
845 T[2] w;
846
847 w[0] = w[1] = value; // pair of values for the 64-bit MM4 register
848
849 asm
850 {
851 mov ESI, dword ptr [aptr];
852 mov EDI, dword ptr [n];
853 mov EAX, dword ptr [bptr];
854 movq MM4, qword ptr [w];
855
856 align 8;
857 start3dnow:
858 movq MM0, [EAX];
859 movq MM1, [EAX+8];
860 movq MM2, [EAX+16];
861 movq MM3, [EAX+24];
862 pfsub MM0, MM4;
863 pfsub MM1, MM4;
864 pfsub MM2, MM4;
865 pfsub MM3, MM4;
866 movq [ESI], MM0;
867 movq [ESI+8], MM1;
868 movq [ESI+16], MM2;
869 movq [ESI+24], MM3;
870 add ESI, 32;
871 add EAX, 32;
872 cmp ESI, EDI;
873 jb start3dnow;
874
875 emms; // clear MMX state before returning to FPU code
876 mov dword ptr [aptr], ESI;
877 mov dword ptr [bptr], EAX;
878 }
879 }
880 }
881
882 while (aptr < aend)
883 *aptr++ = *bptr++ - value; // scalar tail
884
885 return a;
886 }
887
888 unittest
889 {
890 printf("_arraySliceExpMinSliceAssign_f unittest\n");
891 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) // exercise every CPU-variant branch
892 {
893 version (log) printf(" cpuid %d\n", cpuid);
894
895 for (int j = 0; j < 2; j++)
896 {
897 const int dim = 67;
898 T[] a = new T[dim + j]; // aligned on 16 byte boundary
899 a = a[j .. dim + j]; // misalign for second iteration
900 T[] b = new T[dim + j];
901 b = b[j .. dim + j];
902 T[] c = new T[dim + j];
903 c = c[j .. dim + j];
904
905 for (int i = 0; i < dim; i++)
906 { a[i] = cast(T)i;
907 b[i] = cast(T)(i + 7);
908 c[i] = cast(T)(i * 2);
909 }
910
911 c[] = a[] - 6; // lowers to _arraySliceExpMinSliceAssign_f
912
913 for (int i = 0; i < dim; i++)
914 {
915 if (c[i] != cast(T)(a[i] - 6))
916 {
917 printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
918 assert(0);
919 }
920 }
921 }
922 }
923 }
924
925 /* ======================================================================== */
926
927 /***********************
928 * Computes:
929 * a[] = value - b[]
930 */
931
/* a[] = value - b[] (reversed subtraction). Returns a. */
932 T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
933 in
934 {
935 assert (a.length == b.length);
936 assert (disjoint(a, b));
937 }
938 body
939 {
940 //printf("_arrayExpSliceMinSliceAssign_f()\n");
941 auto aptr = a.ptr;
942 auto aend = aptr + a.length;
943 auto bptr = b.ptr;
944
945 version (D_InlineAsm_X86)
946 {
947 // SSE version is 690% faster
948 if (sse() && a.length >= 16)
949 {
950 auto n = aptr + (a.length & ~15); // multiple of 16 floats = one 64-byte iteration
951
952 // Unaligned case
953 asm
954 {
955 mov EAX, bptr;
956 mov ESI, aptr;
957 mov EDI, n;
958 movss XMM4, value;
959 shufps XMM4, XMM4, 0; // broadcast value to all four lanes
960
961 align 8;
962 startsseloop:
963 add ESI, 64; // dest advanced early; stores below use -64 offsets
964 movaps XMM5, XMM4; // copy the broadcast value: subps destroys its destination
965 movaps XMM6, XMM4;
966 movups XMM0, [EAX];
967 movups XMM1, [EAX+16];
968 movups XMM2, [EAX+32];
969 movups XMM3, [EAX+48];
970 add EAX, 64;
971 subps XMM5, XMM0; // value - b[], hence value in the destination register
972 subps XMM6, XMM1;
973 movups [ESI+ 0-64], XMM5;
974 movups [ESI+16-64], XMM6;
975 movaps XMM5, XMM4;
976 movaps XMM6, XMM4;
977 subps XMM5, XMM2;
978 subps XMM6, XMM3;
979 movups [ESI+32-64], XMM5;
980 movups [ESI+48-64], XMM6;
981 cmp ESI, EDI;
982 jb startsseloop;
983
984 mov aptr, ESI; // publish pointers for the scalar tail
985 mov bptr, EAX;
986 }
987 }
988 else
989 // 3DNow! version is 67% faster
990 if (amd3dnow() && a.length >= 8)
991 {
992 auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration
993
994 ulong w = *cast(uint *) &value;
995 ulong v = w | (w << 32L); // replicate float bits into both MM4 halves
996
997 asm
998 {
999 mov ESI, aptr;
1000 mov EDI, n;
1001 mov EAX, bptr;
1002 movq MM4, qword ptr [v];
1003
1004 align 8;
1005 start3dnow:
1006 movq MM0, [EAX];
1007 movq MM1, [EAX+8];
1008 movq MM2, [EAX+16];
1009 movq MM3, [EAX+24];
1010 pfsubr MM0, MM4; // pfsubr computes MM4 - MM0, i.e. value - b[]
1011 pfsubr MM1, MM4;
1012 pfsubr MM2, MM4;
1013 pfsubr MM3, MM4;
1014 movq [ESI], MM0;
1015 movq [ESI+8], MM1;
1016 movq [ESI+16], MM2;
1017 movq [ESI+24], MM3;
1018 add ESI, 32;
1019 add EAX, 32;
1020 cmp ESI, EDI;
1021 jb start3dnow;
1022
1023 emms; // clear MMX state before returning to FPU code
1024 mov aptr, ESI;
1025 mov bptr, EAX;
1026 }
1027 }
1028 }
1029
1030 while (aptr < aend)
1031 *aptr++ = value - *bptr++; // scalar tail
1032
1033 return a;
1034 }
1035
1036 unittest
1037 {
1038 printf("_arrayExpSliceMinSliceAssign_f unittest\n");
1039 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) // exercise every CPU-variant branch
1040 {
1041 version (log) printf(" cpuid %d\n", cpuid);
1042
1043 for (int j = 0; j < 2; j++)
1044 {
1045 const int dim = 67;
1046 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1047 a = a[j .. dim + j]; // misalign for second iteration
1048 T[] b = new T[dim + j];
1049 b = b[j .. dim + j];
1050 T[] c = new T[dim + j];
1051 c = c[j .. dim + j];
1052
1053 for (int i = 0; i < dim; i++)
1054 { a[i] = cast(T)i;
1055 b[i] = cast(T)(i + 7);
1056 c[i] = cast(T)(i * 2);
1057 }
1058
1059 c[] = 6 - a[]; // lowers to _arrayExpSliceMinSliceAssign_f
1060
1061 for (int i = 0; i < dim; i++)
1062 {
1063 if (c[i] != cast(T)(6 - a[i]))
1064 {
1065 printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
1066 assert(0);
1067 }
1068 }
1069 }
1070 }
1071 }
1072
1073 /* ======================================================================== */
1074
1075 /***********************
1076 * Computes:
1077 * a[] -= value
1078 */
1079
/* a[] -= value. Returns a.
 *
 * Fix: same overrun as _arrayExpSliceAddass_f — the aligned SSE loop
 * consumes 64 bytes per iteration but the old end pointer
 * `n = aend & ~15` was only 16-byte granular, letting the final
 * iteration store up to 48 bytes past the end of the array. `n` is now
 * a multiple of 16 elements past the aligned `aptr`, so the loop
 * terminates exactly; the scalar tail handles the remainder.
 */
T[] _arrayExpSliceMinass_f(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 304% faster
        if (sse() && a.length >= 16)
        {
            // align pointer to a 16-byte boundary (at most 3 scalar elements;
            // safe because a.length >= 16)
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ -= value;
            // end pointer: a multiple of 16 elements (64 bytes) past the
            // aligned aptr, so the 64-byte loop below cannot overrun
            n = aptr + ((aend - aptr) & ~15);
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value to all four lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM4;
                subps XMM2, XMM4;
                subps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L); // replicate float bits into both MM4 halves

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfsub MM0, MM4;
                pfsub MM1, MM4;
                pfsub MM2, MM4;
                pfsub MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms; // clear MMX state before returning to FPU code
                mov dword ptr [aptr], ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value; // scalar tail

    return a;
}
1171
unittest
{
    // Verifies a[] -= value (_arrayExpSliceMinass_f) across every CPU
    // variant and both aligned/misaligned slices.
    // fixed: log message previously read "_arrayExpSliceminass_f" (mis-cased)
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[]; // snapshot c so the in-place result can be checked
            c[] -= 6;  // lowers to _arrayExpSliceMinass_f

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1209
1210 /* ======================================================================== */
1211
1212 /***********************
1213 * Computes:
1214 * a[] -= b[]
1215 */
1216
/* a[] -= b[]. Returns a. */
1217 T[] _arraySliceSliceMinass_f(T[] a, T[] b)
1218 in
1219 {
1220 assert (a.length == b.length);
1221 assert (disjoint(a, b));
1222 }
1223 body
1224 {
1225 //printf("_arraySliceSliceMinass_f()\n");
1226 auto aptr = a.ptr;
1227 auto aend = aptr + a.length;
1228 auto bptr = b.ptr;
1229
1230 version (D_InlineAsm_X86)
1231 {
1232 // SSE version is 468% faster
1233 if (sse() && a.length >= 16)
1234 {
1235 auto n = aptr + (a.length & ~15); // multiple of 16 floats = one 64-byte iteration
1236
1237 // Unaligned case
1238 asm
1239 {
1240 mov ECX, bptr; // right operand
1241 mov ESI, aptr; // destination operand
1242 mov EDI, n; // end comparison
1243
1244 align 8;
1245 startsseloopb:
1246 movups XMM0, [ESI];
1247 movups XMM1, [ESI+16];
1248 movups XMM2, [ESI+32];
1249 movups XMM3, [ESI+48];
1250 add ESI, 64; // dest advanced early; stores below use -64 offsets
1251 movups XMM4, [ECX];
1252 movups XMM5, [ECX+16];
1253 movups XMM6, [ECX+32];
1254 movups XMM7, [ECX+48];
1255 add ECX, 64;
1256 subps XMM0, XMM4;
1257 subps XMM1, XMM5;
1258 subps XMM2, XMM6;
1259 subps XMM3, XMM7;
1260 movups [ESI+ 0-64], XMM0;
1261 movups [ESI+16-64], XMM1;
1262 movups [ESI+32-64], XMM2;
1263 movups [ESI+48-64], XMM3;
1264 cmp ESI, EDI;
1265 jb startsseloopb;
1266
1267 mov aptr, ESI; // publish pointers for the scalar tail
1268 mov bptr, ECX;
1269 }
1270 }
1271 else
1272 // 3DNow! version is 57% faster
1273 if (amd3dnow() && a.length >= 8)
1274 {
1275 auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration
1276
1277 asm
1278 {
1279 mov ESI, dword ptr [aptr]; // destination operand
1280 mov EDI, dword ptr [n]; // end comparison
1281 mov ECX, dword ptr [bptr]; // right operand
1282
1283 align 4;
1284 start:
1285 movq MM0, [ESI];
1286 movq MM1, [ESI+8];
1287 movq MM2, [ESI+16];
1288 movq MM3, [ESI+24];
1289 pfsub MM0, [ECX];
1290 pfsub MM1, [ECX+8];
1291 pfsub MM2, [ECX+16];
1292 pfsub MM3, [ECX+24];
1293 movq [ESI], MM0;
1294 movq [ESI+8], MM1;
1295 movq [ESI+16], MM2;
1296 movq [ESI+24], MM3;
1297 add ESI, 32;
1298 add ECX, 32;
1299 cmp ESI, EDI;
1300 jb start;
1301
1302 emms; // clear MMX state before returning to FPU code
1303 mov aptr, ESI;
1304 mov bptr, ECX;
1305 }
1306 }
1307 }
1308
1309 while (aptr < aend)
1310 *aptr++ -= *bptr++; // scalar tail
1311
1312 return a;
1313 }
1314
unittest
{
    // Verifies a[] -= b[] (_arraySliceSliceMinass_f).
    // fixed: this test previously printed "_arrayExpSliceMinass_f" and
    // performed `c[] -= 6` (a scalar op), so the slice-slice routine it
    // claims to cover was never executed. It now drives c[] -= b[].
    printf("_arraySliceSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];   // snapshot c so the in-place result can be checked
            c[] -= b[];  // lowers to _arraySliceSliceMinass_f

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1352
1353 /* ======================================================================== */
1354
1355 /***********************
1356 * Computes:
1357 * a[] = b[] * value
1358 */
1359
/* a[] = b[] * value. Returns a. */
1360 T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
1361 in
1362 {
1363 assert(a.length == b.length);
1364 assert(disjoint(a, b));
1365 }
1366 body
1367 {
1368 //printf("_arraySliceExpMulSliceAssign_f()\n");
1369 auto aptr = a.ptr;
1370 auto aend = aptr + a.length;
1371 auto bptr = b.ptr;
1372
1373 version (D_InlineAsm_X86)
1374 {
1375 // SSE version is 607% faster
1376 if (sse() && a.length >= 16)
1377 {
1378 auto n = aptr + (a.length & ~15); // multiple of 16 floats = one 64-byte iteration
1379
1380 // Unaligned case
1381 asm
1382 {
1383 mov EAX, bptr;
1384 mov ESI, aptr;
1385 mov EDI, n;
1386 movss XMM4, value;
1387 shufps XMM4, XMM4, 0; // broadcast value to all four lanes
1388
1389 align 8;
1390 startsseloop:
1391 add ESI, 64; // dest advanced early; stores below use -64 offsets
1392 movups XMM0, [EAX];
1393 movups XMM1, [EAX+16];
1394 movups XMM2, [EAX+32];
1395 movups XMM3, [EAX+48];
1396 add EAX, 64;
1397 mulps XMM0, XMM4;
1398 mulps XMM1, XMM4;
1399 mulps XMM2, XMM4;
1400 mulps XMM3, XMM4;
1401 movups [ESI+ 0-64], XMM0;
1402 movups [ESI+16-64], XMM1;
1403 movups [ESI+32-64], XMM2;
1404 movups [ESI+48-64], XMM3;
1405 cmp ESI, EDI;
1406 jb startsseloop;
1407
1408 mov aptr, ESI; // publish pointers for the scalar tail
1409 mov bptr, EAX;
1410 }
1411 }
1412 else
1413 // 3DNow! version is 69% faster
1414 if (amd3dnow() && a.length >= 8)
1415 {
1416 auto n = aptr + (a.length & ~7); // multiple of 8 floats = 32 bytes per iteration
1417
1418 ulong w = *cast(uint *) &value;
1419 ulong v = w | (w << 32L); // replicate float bits into both MM4 halves
1420
1421 asm
1422 {
1423 mov ESI, dword ptr [aptr];
1424 mov EDI, dword ptr [n];
1425 mov EAX, dword ptr [bptr];
1426 movq MM4, qword ptr [v];
1427
1428 align 8;
1429 start:
1430 movq MM0, [EAX];
1431 movq MM1, [EAX+8];
1432 movq MM2, [EAX+16];
1433 movq MM3, [EAX+24];
1434 pfmul MM0, MM4;
1435 pfmul MM1, MM4;
1436 pfmul MM2, MM4;
1437 pfmul MM3, MM4;
1438 movq [ESI], MM0;
1439 movq [ESI+8], MM1;
1440 movq [ESI+16], MM2;
1441 movq [ESI+24], MM3;
1442 add ESI, 32;
1443 add EAX, 32;
1444 cmp ESI, EDI;
1445 jb start;
1446
1447 emms; // clear MMX state before returning to FPU code
1448 mov aptr, ESI;
1449 mov bptr, EAX;
1450 }
1451 }
1452 }
1453
1454 while (aptr < aend)
1455 *aptr++ = *bptr++ * value; // scalar tail
1456
1457 return a;
1458 }
1459
1460 unittest
1461 {
1462 printf("_arraySliceExpMulSliceAssign_f unittest\n");
1463 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) // exercise every CPU-variant branch
1464 {
1465 version (log) printf(" cpuid %d\n", cpuid);
1466
1467 for (int j = 0; j < 2; j++)
1468 {
1469 const int dim = 67;
1470 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1471 a = a[j .. dim + j]; // misalign for second iteration
1472 T[] b = new T[dim + j];
1473 b = b[j .. dim + j];
1474 T[] c = new T[dim + j];
1475 c = c[j .. dim + j];
1476
1477 for (int i = 0; i < dim; i++)
1478 { a[i] = cast(T)i;
1479 b[i] = cast(T)(i + 7);
1480 c[i] = cast(T)(i * 2);
1481 }
1482
1483 c[] = a[] * 6; // lowers to _arraySliceExpMulSliceAssign_f
1484
1485 for (int i = 0; i < dim; i++)
1486 {
1487 if (c[i] != cast(T)(a[i] * 6))
1488 {
1489 printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1490 assert(0);
1491 }
1492 }
1493 }
1494 }
1495 }
1496
1497 /* ======================================================================== */
1498
1499 /***********************
1500 * Computes:
1501 * a[] = b[] * c[]
1502 */
1503
T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_f()\n");
    // Element-wise multiply: a[i] = b[i] * c[i] for every i.
    // The vector paths below advance the pointers past the elements they
    // handled; the scalar loop at the bottom finishes whatever remains.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 833% faster
        if (sse() && a.length >= 16)
        {
            // Each asm iteration consumes 16 floats (64 bytes); masking the
            // length with ~15 makes the span an exact number of iterations.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM6;
                mulps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && a.length >= 8)
        {
            // Each asm iteration consumes 8 floats (32 bytes).
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov EAX, dword ptr [bptr]; // left operand
                mov ECX, dword ptr [cptr]; // right operand

                align 4;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, [ECX];
                pfmul MM1, [ECX+8];
                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms; // leave MMX state so subsequent x87 FPU code works
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar remainder (also the whole job when no vector path was taken).
    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}
1610
unittest
{
    printf("_arraySliceSliceMulSliceAssign_f unittest\n");

    // Exercise every CPU-dispatch variant the host supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // Pass 0 is aligned; pass 1 misaligns the slices by one element.
        for (int misalign = 0; misalign < 2; misalign++)
        {
            const int dim = 67;
            T[] a = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] b = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] c = (new T[dim + misalign])[misalign .. dim + misalign];

            foreach (int i, ref T x; a)
                x = cast(T)i;
            foreach (int i, ref T x; b)
                x = cast(T)(i + 7);
            foreach (int i, ref T x; c)
                x = cast(T)(i * 2);

            c[] = a[] * b[];

            foreach (int i, T x; c)
            {
                if (x != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, x, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1647
1648 /* ======================================================================== */
1649
1650 /***********************
1651 * Computes:
1652 * a[] *= value
1653 */
1654
T[] _arrayExpSliceMulass_f(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    // In-place scalar multiply: a[i] *= value for every i.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 303% faster
        if (sse() && a.length >= 16)
        {
            // align pointer to a 16-byte boundary so movaps can be used
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= value;
            n = cast(T*)((cast(uint)aend) & ~15);
            // BUGFIX: n above is only 16-byte granular, but the unrolled
            // loop below consumes 64 bytes (16 floats) per iteration and
            // checks the bound just once per iteration.  Clip n down to a
            // whole multiple of 16 floats past aptr so the final iteration
            // cannot read and write past the end of the array; the scalar
            // loop at the bottom picks up the extra elements.
            n = aptr + ((n - aptr) & ~15);
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0; // broadcast value into all 4 lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 8 floats (32 bytes) per asm iteration; the masked length
            // guarantees a whole number of iterations.
            auto n = aptr + (a.length & ~7);

            // Replicate value into both halves of a 64-bit MMX operand.
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms; // leave MMX state so subsequent x87 FPU code works
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Scalar remainder (also the whole job when no vector path was taken).
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
1746
unittest
{
    printf("_arrayExpSliceMulass_f unittest\n");

    // Exercise every CPU-dispatch variant the host supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // Pass 0 is aligned; pass 1 misaligns the slices by one element.
        for (int misalign = 0; misalign < 2; misalign++)
        {
            const int dim = 67;
            T[] a = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] b = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] c = (new T[dim + misalign])[misalign .. dim + misalign];

            foreach (int i, ref T x; a)
                x = cast(T)i;
            foreach (int i, ref T x; b)
                x = cast(T)(i + 7);
            foreach (int i, ref T x; c)
                x = cast(T)(i * 2);

            a[] = c[];          // snapshot of c before the operation
            c[] *= 6;

            foreach (int i, T x; c)
            {
                if (x != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, x, a[i]);
                    assert(0);
                }
            }
        }
    }
}
1784
1785 /* ======================================================================== */
1786
1787 /***********************
1788 * Computes:
1789 * a[] *= b[]
1790 */
1791
T[] _arraySliceSliceMulass_f(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_f()\n");
    // In-place element-wise multiply: a[i] *= b[i] for every i.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 525% faster
        if (sse() && a.length >= 16)
        {
            // Each asm iteration consumes 16 floats (64 bytes); masking the
            // length with ~15 makes the span an exact number of iterations.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM6;
                mulps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            // Each asm iteration consumes 8 floats (32 bytes).
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, [ECX];
                pfmul MM1, [ECX+8];
                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start;

                emms; // leave MMX state so subsequent x87 FPU code works
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
            }
        }
    }

    // Scalar remainder (also the whole job when no vector path was taken).
    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
1889
unittest
{
    // BUGFIX: this block was a verbatim copy of the _arrayExpSliceMulass_f
    // unittest (label and all) and performed c[] *= 6, so the slice-by-slice
    // operation _arraySliceSliceMulass_f was never exercised.  It now runs
    // c[] *= b[] and verifies each element against a saved copy of c.
    printf("_arraySliceSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];          // snapshot of c before the operation
            c[] *= b[];         // exercises _arraySliceSliceMulass_f

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1927
1928 /* ======================================================================== */
1929
1930 /***********************
1931 * Computes:
1932 * a[] = b[] / value
1933 */
1934
T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_f()\n");
    // Note the argument order: computes a[i] = b[i] / value for every i.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE version is 587% faster
        if (sse() && a.length >= 16)
        {
            // Each asm iteration consumes 16 floats (64 bytes); masking the
            // length with ~15 makes the span an exact number of iterations.
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, recip;
                //movss XMM4, value
                //rcpss XMM4, XMM4
                shufps XMM4, XMM4, 0; // broadcast recip into all 4 lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                //divps XMM0, XMM4;
                //divps XMM1, XMM4;
                //divps XMM2, XMM4;
                //divps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 72% faster
        if (amd3dnow() && a.length >= 8)
        {
            // Each asm iteration consumes 8 floats (32 bytes).
            auto n = aptr + (a.length & ~7);

            // Pair of reciprocals forming the 64-bit MMX multiplier operand.
            T[2] w = void;

            w[0] = recip;
            w[1] = recip;

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                align 8;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms; // leave MMX state so subsequent x87 FPU code works
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
            }
        }
    }

    // Scalar remainder; uses the same reciprocal so results match the
    // vector paths bit-for-bit.
    while (aptr < aend)
        *aptr++ = *bptr++ * recip;

    return a;
}
2047
unittest
{
    printf("_arraySliceExpDivSliceAssign_f unittest\n");

    // Exercise every CPU-dispatch variant the host supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // Pass 0 is aligned; pass 1 misaligns the slices by one element.
        for (int misalign = 0; misalign < 2; misalign++)
        {
            const int dim = 67;
            T[] a = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] b = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] c = (new T[dim + misalign])[misalign .. dim + misalign];

            foreach (int i, ref T x; a)
                x = cast(T)i;
            foreach (int i, ref T x; b)
                x = cast(T)(i + 7);
            foreach (int i, ref T x; c)
                x = cast(T)(i * 2);

            c[] = a[] / 8;

            foreach (int i, T x; c)
            {
                if (x != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, x, a[i]);
                    assert(0);
                }
            }
        }
    }
}
2084
2085 /* ======================================================================== */
2086
2087 /***********************
2088 * Computes:
2089 * a[] /= value
2090 */
2091
T[] _arrayExpSliceDivass_f(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    // In-place scalar divide: a[i] /= value for every i.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE version is 245% faster
        if (sse() && a.length >= 16)
        {
            // align pointer to a 16-byte boundary so movaps can be used
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= recip;
            n = cast(T*)((cast(uint)aend) & ~15);
            // BUGFIX: n above is only 16-byte granular, but the unrolled
            // loop below consumes 64 bytes (16 floats) per iteration and
            // checks the bound just once per iteration.  Clip n down to a
            // whole multiple of 16 floats past aptr so the final iteration
            // cannot read and write past the end of the array; the scalar
            // loop at the bottom picks up the extra elements.
            n = aptr + ((n - aptr) & ~15);
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, recip;
                //movss XMM4, value
                //rcpss XMM4, XMM4
                shufps XMM4, XMM4, 0; // broadcast recip into all 4 lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                //divps XMM0, XMM4;
                //divps XMM1, XMM4;
                //divps XMM2, XMM4;
                //divps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 8 floats (32 bytes) per asm iteration; the masked length
            // guarantees a whole number of iterations.
            auto n = aptr + (a.length & ~7);

            // Pair of reciprocals forming the 64-bit MMX multiplier operand.
            T[2] w = void;

            w[0] = w[1] = recip;

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [w];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms; // leave MMX state so subsequent x87 FPU code works
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Scalar remainder; uses the same reciprocal so results match the
    // vector paths bit-for-bit.
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}
2195
unittest
{
    printf("_arrayExpSliceDivass_f unittest\n");

    // Exercise every CPU-dispatch variant the host supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // Pass 0 is aligned; pass 1 misaligns the slices by one element.
        for (int misalign = 0; misalign < 2; misalign++)
        {
            const int dim = 67;
            T[] a = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] b = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] c = (new T[dim + misalign])[misalign .. dim + misalign];

            foreach (int i, ref T x; a)
                x = cast(T)i;
            foreach (int i, ref T x; b)
                x = cast(T)(i + 7);
            foreach (int i, ref T x; c)
                x = cast(T)(i * 2);

            a[] = c[];          // snapshot of c before the operation
            c[] /= 8;

            foreach (int i, T x; c)
            {
                if (x != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, x, a[i]);
                    assert(0);
                }
            }
        }
    }
}
2233
2234
2235 /* ======================================================================== */
2236
2237 /***********************
2238 * Computes:
2239 * a[] -= b[] * value
2240 */
2241
T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
{
    // a[] -= b[] * value is the same operation as a[] += b[] * (-value),
    // so delegate to the add-assign implementation with the sign flipped.
    T negated = -value;
    return _arraySliceExpMulSliceAddass_f(a, negated, b);
}
2246
2247 /***********************
2248 * Computes:
2249 * a[] += b[] * value
2250 */
2251
T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    // Fused multiply-add into a: a[i] += b[i] * value for every i.
    // No vectorized path has been written for this operation yet, so the
    // scalar loop below processes the whole array.
    auto bptr = b.ptr;
    foreach (ref e; a)
        e += *bptr++ * value;

    return a;
}
2270
unittest
{
    printf("_arraySliceExpMulSliceAddass_f unittest\n");

    // Only the cpuid == 1 variant is exercised here (the operation has a
    // single scalar implementation anyway).
    cpuid = 1;
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // Single aligned pass; the loop ran exactly once in the original too.
        for (int misalign = 0; misalign < 1; misalign++)
        {
            const int dim = 67;
            T[] a = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] b = (new T[dim + misalign])[misalign .. dim + misalign];
            T[] c = (new T[dim + misalign])[misalign .. dim + misalign];

            foreach (int i, ref T x; a)
                x = cast(T)i;
            foreach (int i, ref T x; b)
                x = cast(T)(i + 7);
            foreach (int i, ref T x; c)
                x = cast(T)(i * 2);

            b[] = c[];          // snapshot of c before the operation
            c[] += a[] * 6;

            foreach (int i, T x; c)
            {
                //printf("[%d]: %g ?= %g + %g * 6\n", i, x, b[i], a[i]);
                if (x != cast(T)(b[i] + a[i] * 6))
                {
                    printf("[%d]: %g ?= %g + %g * 6\n", i, x, b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}