druntime/src/compiler/dmd/arrayshort.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Tue, 11 Nov 2008 01:52:37 +0100
/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for short array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for wchar, short,
 * and ushort ('u', 's' and 't' suffixes).
 */

module rt.arrayshort;

private import util.cpuid;

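/* Editor's illustration (not part of the original module): the compiler
 * lowers slice arithmetic on short[] onto the matching suffixed hook below,
 * so the statement in this unittest becomes, in effect, a call to
 * _arraySliceExpAddSliceAssign_s(a, 5, b).
 */
unittest
{
    short[] a = new short[4];
    short[] b = new short[4];
    b[0] = 1; b[1] = 2; b[2] = 3; b[3] = 4;
    a[] = b[] + 5;
    assert(a[0] == 6 && a[3] == 9);
}
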
version (Unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
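
/* Editor's illustration (not in the original): the disjoint contract allows
 * slices that abut but rejects slices that overlap.
 */
unittest
{
    short[] buf = new short[8];
    assert(disjoint(buf[0 .. 4], buf[4 .. 8]));  // adjacent: ok
    assert(!disjoint(buf[0 .. 5], buf[4 .. 8])); // share buf[4]: overlap
}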

alias short T;

extern (C):

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3343% faster
        if (sse2() && a.length >= 16)
        {
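            // SIMD handles the largest multiple of 16 elements; the scalar
            // loop at the end of the function picks up the remainder.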
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);
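            // l now holds two copies of value in its 16-bit halves; the
            // movd/pshufd pair below replicates that dword across XMM2,
            // giving eight shorts equal to value for the paddw instructions.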

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3343% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3777% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2068% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_u(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_t(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_s(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 832% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 826% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_u(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_t(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2085% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1022% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3695% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3049% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4995% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;

                    align 4;
                startaddsse2u:
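                    // XMM2/XMM3 are re-broadcast on every pass because the
                    // psubw below overwrites them with the result row.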
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;
                    movd XMM3, l;
                    pshufd XMM3, XMM3, 0;
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqu [ESI -32], XMM2;
                    movdqu [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;

                    align 4;
                startaddsse2a:
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;
                    movd XMM3, l;
                    pshufd XMM3, XMM3, 0;
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqa [ESI -32], XMM2;
                    movdqa [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 4562% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM2, [EAX];
                movq MM3, [EAX+8];
                movq MM0, MM4;
                movq MM1, MM4;
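                // psubw subtracts in place (dest -= src), so the broadcast
                // value in MM4 is first copied into MM0/MM1 and b[] is then
                // subtracted from it to form value - b[].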
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4129% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2018% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_u(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_t(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 835% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 835% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_u(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_t(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2121% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1116% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] -= a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3733% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= l << 16;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3733% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * value);

    return a;
}
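
/* Editor's illustration (not in the original module): pmullw keeps only the
 * low 16 bits of each product, which is exactly the wrap-around behaviour of
 * the scalar fallback's cast(T)(*bptr++ * value), so both paths agree even
 * when the product overflows a short.
 */
unittest
{
    short[] x = new short[16];
    short[] y = new short[16];
    y[] = cast(short) 20000;
    x[] = y[] * cast(short) 6;          // 120000 truncated to 16 bits
    assert(x[0] == cast(short) 120000); // == -11072
}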

unittest
{
    printf("_arraySliceExpMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2515% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2515% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_u(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_t(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2044% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= l << 16;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 2056% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_u(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_t(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2519% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1712% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= c[];

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}