druntime/src/compiler/dmd/arrayint.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Tue, 11 Nov 2008 01:52:37 +0100
/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for int array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains MMX versions of certain operations for dchar, int,
 * and uint ('w', 'i' and 'k' suffixes).
 */
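
/* These functions are not called directly by user code; the compiler
 * lowers array operation expressions onto them. An illustrative sketch
 * (assuming the usual lowering by element-type suffix):
 *
 *      int[] a, b;
 *      a[] = b[] + 5;      // calls _arraySliceExpAddSliceAssign_i(a, 5, b)
 *
 *      uint[] c, d;
 *      c[] -= d[];         // calls _arraySliceSliceMinass_k(c, d)
 */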

module rt.arrayint;

private import util.cpuid;

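/* Forcing cpuid lets the unit tests exercise one implementation per pass:
 * with cpuid == 3, for example, only the sse2() wrapper below can return
 * true, and with cpuid == 0 every wrapper is false, so the plain x86
 * fallback code runs.
 */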
version (Unittest)
{
    private import core.stdc.stdio : printf;    // assumed location of printf in this runtime snapshot

    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
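
/* disjoint() is true when the two slices share no memory, which the
 * in-contracts below rely on. For example (illustrative only):
 *
 *      int[8] buf;
 *      assert(disjoint(buf[0 .. 4], buf[4 .. 8]));     // adjacent, no overlap
 *      assert(!disjoint(buf[0 .. 5], buf[4 .. 8]));    // buf[4] is shared
 */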

alias int T;

extern (C):

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_i(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_i(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 380% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

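            // movdqa faults on a misaligned address, so if either slice is
            // not on a 16-byte boundary take the movdqu (unaligned) loop.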
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 298% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

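            // Replicate the 32-bit operand into both halves of a 64-bit
            // value so a single movq loads MM2 with {value, value}.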
            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM2, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                paddd MM0, MM2;
                paddd MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov EDX, value;

                align 4;
            start386:
                add ESI, 8;
                mov EBX, [EAX];
                mov ECX, [EAX+4];
                add EAX, 8;
                add EBX, EDX;
                add ECX, EDX;
                mov [ESI -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}
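
/* The structure above recurs throughout this module: a vector loop covers
 * length & ~7 (SSE2) or length & ~3 (MMX) elements and the scalar loop
 * finishes the remainder. For the dim = 67 arrays in the unit tests, the
 * SSE2 path processes 64 elements and the scalar tail the last 3.
 */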

unittest
{
    printf("_arraySliceExpAddSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
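
/* In the test above, j == 0 exercises the aligned (movdqa) path because the
 * fresh allocation starts on a 16-byte boundary; j == 1 slices one int into
 * the block, shifting the pointers by 4 bytes so the unaligned (movdqu)
 * path runs instead. Every unittest in this module repeats this pattern.
 */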


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1710% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 995% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                paddd MM0, MM2;
                paddd MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

normal:
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_w(T[] a, T value)
{
    return _arrayExpSliceAddass_i(a, value);
}

T[] _arrayExpSliceAddass_k(T[] a, T value)
{
    return _arrayExpSliceAddass_i(a, value);
}

T[] _arrayExpSliceAddass_i(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 83% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 81% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movq MM2, l;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                paddd MM0, MM2;
                paddd MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EDX, value;

                align 4;
            start386:
                mov EBX, [ESI];
                mov ECX, [ESI+4];
                add ESI, 8;
                add EBX, EDX;
                add ECX, EDX;
                mov [ESI -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_w(T[] a, T[] b)
{
    return _arraySliceSliceAddass_i(a, b);
}

T[] _arraySliceSliceAddass_k(T[] a, T[] b)
{
    return _arraySliceSliceAddass_i(a, b);
}

T[] _arraySliceSliceAddass_i(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 695% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 471% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                paddd MM0, MM2;
                paddd MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

normal:
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 400% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 315% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM2, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                psubd MM0, MM2;
                psubd MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov EDX, value;

                align 4;
            start386:
                add ESI, 8;
                mov EBX, [EAX];
                mov ECX, [EAX+4];
                add EAX, 8;
                sub EBX, EDX;
                sub ECX, EDX;
                mov [ESI -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */
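
/* Implementation note: psubd computes dst - src and has no reversed form,
 * so the asm loops below splat value into XMM4/MM4 once and copy it into
 * the destination registers on each iteration before subtracting b[].
 */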

T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_i(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_i(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1812% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM2, [EAX];
                    movdqu XMM3, [EAX+16];
                    movdqa XMM0, XMM4;
                    movdqa XMM1, XMM4;
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM2, [EAX];
                    movdqa XMM3, [EAX+16];
                    movdqa XMM0, XMM4;
                    movdqa XMM1, XMM4;
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1077% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM2, [EAX];
                movq MM3, [EAX+8];
                movq MM0, MM4;
                movq MM1, MM4;
                add EAX, 16;
                psubd MM0, MM2;
                psubd MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1721% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 1002% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                psubd MM0, MM2;
                psubd MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_w(T[] a, T value)
{
    return _arrayExpSliceMinass_i(a, value);
}

T[] _arrayExpSliceMinass_k(T[] a, T value)
{
    return _arrayExpSliceMinass_i(a, value);
}

T[] _arrayExpSliceMinass_i(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 81% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 81% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movq MM2, l;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                psubd MM0, MM2;
                psubd MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EDX, value;

                align 4;
            start386:
                mov EBX, [ESI];
                mov ECX, [ESI+4];
                add ESI, 8;
                sub EBX, EDX;
                sub ECX, EDX;
                mov [ESI -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_w(T[] a, T[] b)
{
    return _arraySliceSliceMinass_i(a, b);
}

T[] _arraySliceSliceMinass_k(T[] a, T[] b)
{
    return _arraySliceSliceMinass_i(a, b);
}

T[] _arraySliceSliceMinass_i(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 731% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 441% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                psubd MM0, MM2;
                psubd MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] -= a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

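    /* The SIMD paths below are disabled: pmuludq multiplies only the low
     * 32-bit lane of each 64-bit half and widens the product to 64 bits,
     * so it cannot express a packed 32-bit multiply directly.
     */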
    version (none) // multiplying a pair is not supported by MMX
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 1380% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                uint l = value;

                if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2u:
                        add ESI, 32;
                        movdqu XMM0, [EAX];
                        movdqu XMM1, [EAX+16];
                        add EAX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqu [ESI -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2a:
                        add ESI, 32;
                        movdqa XMM0, [EAX];
                        movdqa XMM1, [EAX+16];
                        add EAX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqa [ESI -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
            }
            else
            {
                // MMX version is 1380% faster
                if (mmx() && a.length >= 4)
                {
                    auto n = aptr + (a.length & ~3);

                    ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movq MM2, l;

                        align 4;
                    startmmx:
                        add ESI, 16;
                        movq MM0, [EAX];
                        movq MM1, [EAX+8];
                        add EAX, 16;
                        pmuludq MM0, MM2;   // only multiplies low 32 bits
                        pmuludq MM1, MM2;
                        movq [ESI -16], MM0;
                        movq [ESI+8-16], MM1;
                        cmp ESI, EDI;
                        jb startmmx;

                        emms;
                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 1407% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        mov ECX, cptr;

                        align 4;
                    startsse2u:
                        add ESI, 32;
                        movdqu XMM0, [EAX];
                        movdqu XMM2, [ECX];
                        movdqu XMM1, [EAX+16];
                        movdqu XMM3, [ECX+16];
                        add EAX, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqu [ESI -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, EAX;
                        mov cptr, ECX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        mov ECX, cptr;

                        align 4;
                    startsse2a:
                        add ESI, 32;
                        movdqa XMM0, [EAX];
                        movdqa XMM2, [ECX];
                        movdqa XMM1, [EAX+16];
                        movdqa XMM3, [ECX+16];
                        add EAX, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqa [ESI -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, EAX;
                        mov cptr, ECX;
                    }
                }
            }
            else
            // MMX version is 1029% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startmmx:
                    add ESI, 16;
                    movq MM0, [EAX];
                    movq MM2, [ECX];
                    movq MM1, [EAX+8];
                    movq MM3, [ECX+8];
                    add EAX, 16;
                    add ECX, 16;
                    pmuludq MM0, MM2;
                    pmuludq MM1, MM3;
                    movq [ESI -16], MM0;
                    movq [ESI+8-16], MM1;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_w(T[] a, T value)
{
    return _arrayExpSliceMulass_i(a, value);
}

T[] _arrayExpSliceMulass_k(T[] a, T value)
{
    return _arrayExpSliceMulass_i(a, value);
}

T[] _arrayExpSliceMulass_i(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 400% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                uint l = value;

                if (((cast(uint) aptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2u:
                        movdqu XMM0, [ESI];
                        movdqu XMM1, [ESI+16];
                        add ESI, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqu [ESI -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2a:
                        movdqa XMM0, [ESI];
                        movdqa XMM1, [ESI+16];
                        add ESI, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqa [ESI -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                    }
                }
            }
            else
            // MMX version is 402% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movq MM2, l;

                    align 4;
                startmmx:
                    movq MM0, [ESI];
                    movq MM1, [ESI+8];
                    add ESI, 16;
                    pmuludq MM0, MM2;
                    pmuludq MM1, MM2;
                    movq [ESI -16], MM0;
                    movq [ESI+8-16], MM1;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_w(T[] a, T[] b)
{
    return _arraySliceSliceMulass_i(a, b);
}

T[] _arraySliceSliceMulass_k(T[] a, T[] b)
{
    return _arraySliceSliceMulass_i(a, b);
}

T[] _arraySliceSliceMulass_i(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 873% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov ECX, bptr;

                        align 4;
                    startsse2u:
                        movdqu XMM0, [ESI];
                        movdqu XMM2, [ECX];
                        movdqu XMM1, [ESI+16];
                        movdqu XMM3, [ECX+16];
                        add ESI, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqu [ESI -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, ECX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov ECX, bptr;

                        align 4;
                    startsse2a:
                        movdqa XMM0, [ESI];
                        movdqa XMM2, [ECX];
                        movdqa XMM1, [ESI+16];
                        movdqa XMM3, [ECX+16];
                        add ESI, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqa [ESI -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, ECX;
                    }
                }
            }
/+ BUG: comment out this section until we figure out what is going
   wrong with the invalid pshufd instructions.

            else
            // MMX version is 573% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startmmx:
                    movq MM0, [ESI];
                    movq MM2, [ECX];
                    movq MM1, [ESI+8];
                    movq MM3, [ECX+8];
                    pxor MM4, MM4;
                    pxor MM5, MM5;
                    punpckldq MM4, MM0;
                    punpckldq MM5, MM2;
                    add ESI, 16;
                    add ECX, 16;
                    pmuludq MM4, MM5;
                    pshufd MM4, MM4, 8;     // ?
                    movq [ESI -16], MM4;
                    pxor MM4, MM4;
                    pxor MM5, MM5;
                    punpckldq MM4, MM1;
                    punpckldq MM5, MM3;
                    pmuludq MM4, MM5;
                    pshufd MM4, MM4, 8;     // ?
                    movq [ESI+8-16], MM4;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
+/
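
        /* pshufd is an SSE2 instruction defined only for XMM registers, so
         * the commented-out loop's use of it on MM registers is invalid,
         * which is why the block above is disabled.
         */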
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= c[];

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}