comparison druntime/src/compiler/ldc/arrayint.d @ 1458:e0b2d67cfe7c

Added druntime (this should be removed once it works).
author Robert Clipsham <robert@octarineparrot.com>
date Tue, 02 Jun 2009 17:43:06 +0100
1456:7b218ec1044f 1458:e0b2d67cfe7c
1 /**
2 * Contains SSE2 and MMX versions of certain operations for dchar, int, and uint ('w',
3 * 'i' and 'k' suffixes).
4 *
5 * Copyright: Copyright Digital Mars 2008 - 2009.
6 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
7 * Authors: Walter Bright, based on code originally written by Burton Radons
8 *
9 * Copyright Digital Mars 2008 - 2009.
10 * Distributed under the Boost Software License, Version 1.0.
11 * (See accompanying file LICENSE_1_0.txt or copy at
12 * http://www.boost.org/LICENSE_1_0.txt)
13 */
14 module rt.arrayint;
15
16 private import rt.util.cpuid;
17
18 version (unittest)
19 {
20 private import core.stdc.stdio : printf;
21 /* This is so unit tests will test every CPU variant
22 */
23 int cpuid;
24 const int CPUID_MAX = 4;
25 bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); }
26 bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
27 bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
28 bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
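// Note: the unittest loops run cpuid from 0 to CPUID_MAX-1, so the
// cpuid == 4 (amd3dnow) variant is never exercised; harmless here, since
// this module has no 3DNow! code paths.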
29 }
30 else
31 {
32 alias rt.util.cpuid.mmx mmx;
33 alias rt.util.cpuid.sse sse;
34 alias rt.util.cpuid.sse2 sse2;
35 alias rt.util.cpuid.amd3dnow amd3dnow;
36 }
37
38 //version = log;
39
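/* True if slices a and b do not overlap in memory. The vector loops
 * below read and write in 16- and 32-byte blocks, so overlapping
 * operands would produce wrong results; e.g. for some int[8] x,
 * disjoint(x[0 .. 4], x[2 .. 6]) is false.
 */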
40 bool disjoint(T)(T[] a, T[] b)
41 {
42 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
43 }
44
45 alias int T;
46
47 extern (C):
48
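/* These are the runtime entry points the compiler calls for vector
 * expressions on 32-bit element types. A minimal usage sketch, assuming
 * the usual front-end lowering:
 *
 *      int[] a = new int[100];
 *      int[] b = new int[100];
 *      a[] = b[] + 5;   // lowered to _arraySliceExpAddSliceAssign_i(a, 5, b)
 *      a[] += b[];      // lowered to _arraySliceSliceAddass_i(a, b)
 *
 * The '_w' (dchar) and '_k' (uint) entry points simply forward to the
 * '_i' (int) implementations, since 32-bit wrap-around arithmetic is
 * identical for all three.
 */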
49 /* ======================================================================== */
50
51 /***********************
52 * Computes:
53 * a[] = b[] + value
54 */
55
56 T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
57 {
58 return _arraySliceExpAddSliceAssign_i(a, value, b);
59 }
60
61 T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
62 {
63 return _arraySliceExpAddSliceAssign_i(a, value, b);
64 }
65
66 T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
67 in
68 {
69 assert(a.length == b.length);
70 assert(disjoint(a, b));
71 }
72 body
73 {
74 //printf("_arraySliceExpAddSliceAssign_i()\n");
75 auto aptr = a.ptr;
76 auto aend = aptr + a.length;
77 auto bptr = b.ptr;
78
79 version (D_InlineAsm_X86)
80 {
81 // SSE2 aligned version is 380% faster
82 if (sse2() && a.length >= 8)
83 {
84 auto n = aptr + (a.length & ~7);
85
86 uint l = value;
87
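// ORing the two addresses and testing the low 4 bits checks 16-byte
// alignment of both pointers at once. In either branch, movd + pshufd
// broadcasts 'value' into all four 32-bit lanes of XMM2.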
88 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
89 {
90 asm // unaligned case
91 {
92 mov ESI, aptr;
93 mov EDI, n;
94 mov EAX, bptr;
95 movd XMM2, l;
96 pshufd XMM2, XMM2, 0;
97
98 align 4;
99 startaddsse2u:
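// ESI is advanced at the top of the loop; the stores use negative
// displacements, so each iteration needs only one pointer update.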
100 add ESI, 32;
101 movdqu XMM0, [EAX];
102 movdqu XMM1, [EAX+16];
103 add EAX, 32;
104 paddd XMM0, XMM2;
105 paddd XMM1, XMM2;
106 movdqu [ESI -32], XMM0;
107 movdqu [ESI+16-32], XMM1;
108 cmp ESI, EDI;
109 jb startaddsse2u;
110
111 mov aptr, ESI;
112 mov bptr, EAX;
113 }
114 }
115 else
116 {
117 asm // aligned case
118 {
119 mov ESI, aptr;
120 mov EDI, n;
121 mov EAX, bptr;
122 movd XMM2, l;
123 pshufd XMM2, XMM2, 0;
124
125 align 4;
126 startaddsse2a:
127 add ESI, 32;
128 movdqa XMM0, [EAX];
129 movdqa XMM1, [EAX+16];
130 add EAX, 32;
131 paddd XMM0, XMM2;
132 paddd XMM1, XMM2;
133 movdqa [ESI -32], XMM0;
134 movdqa [ESI+16-32], XMM1;
135 cmp ESI, EDI;
136 jb startaddsse2a;
137
138 mov aptr, ESI;
139 mov bptr, EAX;
140 }
141 }
142 }
143 else
144 // MMX version is 298% faster
145 if (mmx() && a.length >= 4)
146 {
147 auto n = aptr + (a.length & ~3);
148
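// Replicate the 32-bit value into both halves of a 64-bit quantity,
// so MM2 holds two copies of it for the packed add.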
149 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
150
151 asm
152 {
153 mov ESI, aptr;
154 mov EDI, n;
155 mov EAX, bptr;
156 movq MM2, l;
157
158 align 4;
159 startmmx:
160 add ESI, 16;
161 movq MM0, [EAX];
162 movq MM1, [EAX+8];
163 add EAX, 16;
164 paddd MM0, MM2;
165 paddd MM1, MM2;
166 movq [ESI -16], MM0;
167 movq [ESI+8-16], MM1;
168 cmp ESI, EDI;
169 jb startmmx;
170
171 emms;
172 mov aptr, ESI;
173 mov bptr, EAX;
174 }
175 }
176 else
177 if (a.length >= 2)
178 {
179 auto n = aptr + (a.length & ~1);
180
181 asm
182 {
183 mov ESI, aptr;
184 mov EDI, n;
185 mov EAX, bptr;
186 mov EDX, value;
187
188 align 4;
189 start386:
190 add ESI, 8;
191 mov EBX, [EAX];
192 mov ECX, [EAX+4];
193 add EAX, 8;
194 add EBX, EDX;
195 add ECX, EDX;
196 mov [ESI -8], EBX;
197 mov [ESI+4-8], ECX;
198 cmp ESI, EDI;
199 jb start386;
200
201 mov aptr, ESI;
202 mov bptr, EAX;
203 }
204 }
205 }
206
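// Scalar tail: finishes any elements left over by the vector loops, and
// serves as the complete fallback when no asm path ran (no SSE2/MMX,
// short arrays, or compilers without D_InlineAsm_X86).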
207 while (aptr < aend)
208 *aptr++ = *bptr++ + value;
209
210 return a;
211 }
212
213 unittest
214 {
215 printf("_arraySliceExpAddSliceAssign_i unittest\n");
216
217 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
218 {
219 version (log) printf(" cpuid %d\n", cpuid);
220
221 for (int j = 0; j < 2; j++)
222 {
223 const int dim = 67;
224 T[] a = new T[dim + j]; // aligned on 16 byte boundary
225 a = a[j .. dim + j]; // misalign for second iteration
226 T[] b = new T[dim + j];
227 b = b[j .. dim + j];
228 T[] c = new T[dim + j];
229 c = c[j .. dim + j];
230
231 for (int i = 0; i < dim; i++)
232 { a[i] = cast(T)i;
233 b[i] = cast(T)(i + 7);
234 c[i] = cast(T)(i * 2);
235 }
236
237 c[] = a[] + 6;
238
239 for (int i = 0; i < dim; i++)
240 {
241 if (c[i] != cast(T)(a[i] + 6))
242 {
243 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
244 assert(0);
245 }
246 }
247 }
248 }
249 }
250
251
252 /* ======================================================================== */
253
254 /***********************
255 * Computes:
256 * a[] = b[] + c[]
257 */
258
259 T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
260 {
261 return _arraySliceSliceAddSliceAssign_i(a, c, b);
262 }
263
264 T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
265 {
266 return _arraySliceSliceAddSliceAssign_i(a, c, b);
267 }
268
269 T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
270 in
271 {
272 assert(a.length == b.length && b.length == c.length);
273 assert(disjoint(a, b));
274 assert(disjoint(a, c));
275 assert(disjoint(b, c));
276 }
277 body
278 {
279 //printf("_arraySliceSliceAddSliceAssign_i()\n");
280 auto aptr = a.ptr;
281 auto aend = aptr + a.length;
282 auto bptr = b.ptr;
283 auto cptr = c.ptr;
284
285 version (D_InlineAsm_X86)
286 {
287 // SSE2 aligned version is 1710% faster
288 if (sse2() && a.length >= 8)
289 {
290 auto n = aptr + (a.length & ~7);
291
292 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
293 {
294 asm // unaligned case
295 {
296 mov ESI, aptr;
297 mov EDI, n;
298 mov EAX, bptr;
299 mov ECX, cptr;
300
301 align 4;
302 startsse2u:
303 add ESI, 32;
304 movdqu XMM0, [EAX];
305 movdqu XMM2, [ECX];
306 movdqu XMM1, [EAX+16];
307 movdqu XMM3, [ECX+16];
308 add EAX, 32;
309 add ECX, 32;
310 paddd XMM0, XMM2;
311 paddd XMM1, XMM3;
312 movdqu [ESI -32], XMM0;
313 movdqu [ESI+16-32], XMM1;
314 cmp ESI, EDI;
315 jb startsse2u;
316
317 mov aptr, ESI;
318 mov bptr, EAX;
319 mov cptr, ECX;
320 }
321 }
322 else
323 {
324 asm // aligned case
325 {
326 mov ESI, aptr;
327 mov EDI, n;
328 mov EAX, bptr;
329 mov ECX, cptr;
330
331 align 4;
332 startsse2a:
333 add ESI, 32;
334 movdqa XMM0, [EAX];
335 movdqa XMM2, [ECX];
336 movdqa XMM1, [EAX+16];
337 movdqa XMM3, [ECX+16];
338 add EAX, 32;
339 add ECX, 32;
340 paddd XMM0, XMM2;
341 paddd XMM1, XMM3;
342 movdqa [ESI -32], XMM0;
343 movdqa [ESI+16-32], XMM1;
344 cmp ESI, EDI;
345 jb startsse2a;
346
347 mov aptr, ESI;
348 mov bptr, EAX;
349 mov cptr, ECX;
350 }
351 }
352 }
353 else
354 // MMX version is 995% faster
355 if (mmx() && a.length >= 4)
356 {
357 auto n = aptr + (a.length & ~3);
358
359 asm
360 {
361 mov ESI, aptr;
362 mov EDI, n;
363 mov EAX, bptr;
364 mov ECX, cptr;
365
366 align 4;
367 startmmx:
368 add ESI, 16;
369 movq MM0, [EAX];
370 movq MM2, [ECX];
371 movq MM1, [EAX+8];
372 movq MM3, [ECX+8];
373 add EAX, 16;
374 add ECX, 16;
375 paddd MM0, MM2;
376 paddd MM1, MM3;
377 movq [ESI -16], MM0;
378 movq [ESI+8-16], MM1;
379 cmp ESI, EDI;
380 jb startmmx;
381
382 emms;
383 mov aptr, ESI;
384 mov bptr, EAX;
385 mov cptr, ECX;
386 }
387 }
388 }
389
390 normal:
391 while (aptr < aend)
392 *aptr++ = *bptr++ + *cptr++;
393
394 return a;
395 }
396
397 unittest
398 {
399 printf("_arraySliceSliceAddSliceAssign_i unittest\n");
400
401 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
402 {
403 version (log) printf(" cpuid %d\n", cpuid);
404
405 for (int j = 0; j < 2; j++)
406 {
407 const int dim = 67;
408 T[] a = new T[dim + j]; // aligned on 16 byte boundary
409 a = a[j .. dim + j]; // misalign for second iteration
410 T[] b = new T[dim + j];
411 b = b[j .. dim + j];
412 T[] c = new T[dim + j];
413 c = c[j .. dim + j];
414
415 for (int i = 0; i < dim; i++)
416 { a[i] = cast(T)i;
417 b[i] = cast(T)(i + 7);
418 c[i] = cast(T)(i * 2);
419 }
420
421 c[] = a[] + b[];
422
423 for (int i = 0; i < dim; i++)
424 {
425 if (c[i] != cast(T)(a[i] + b[i]))
426 {
427 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
428 assert(0);
429 }
430 }
431 }
432 }
433 }
434
435
436 /* ======================================================================== */
437
438 /***********************
439 * Computes:
440 * a[] += value
441 */
442
443 T[] _arrayExpSliceAddass_w(T[] a, T value)
444 {
445 return _arrayExpSliceAddass_i(a, value);
446 }
447
448 T[] _arrayExpSliceAddass_k(T[] a, T value)
449 {
450 return _arrayExpSliceAddass_i(a, value);
451 }
452
453 T[] _arrayExpSliceAddass_i(T[] a, T value)
454 {
455 //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
456 auto aptr = a.ptr;
457 auto aend = aptr + a.length;
458
459 version (D_InlineAsm_X86)
460 {
461 // SSE2 aligned version is 83% faster
462 if (sse2() && a.length >= 8)
463 {
464 auto n = aptr + (a.length & ~7);
465
466 uint l = value;
467
468 if (((cast(uint) aptr) & 15) != 0)
469 {
470 asm // unaligned case
471 {
472 mov ESI, aptr;
473 mov EDI, n;
474 movd XMM2, l;
475 pshufd XMM2, XMM2, 0;
476
477 align 4;
478 startaddsse2u:
479 movdqu XMM0, [ESI];
480 movdqu XMM1, [ESI+16];
481 add ESI, 32;
482 paddd XMM0, XMM2;
483 paddd XMM1, XMM2;
484 movdqu [ESI -32], XMM0;
485 movdqu [ESI+16-32], XMM1;
486 cmp ESI, EDI;
487 jb startaddsse2u;
488
489 mov aptr, ESI;
490 }
491 }
492 else
493 {
494 asm // aligned case
495 {
496 mov ESI, aptr;
497 mov EDI, n;
498 movd XMM2, l;
499 pshufd XMM2, XMM2, 0;
500
501 align 4;
502 startaddsse2a:
503 movdqa XMM0, [ESI];
504 movdqa XMM1, [ESI+16];
505 add ESI, 32;
506 paddd XMM0, XMM2;
507 paddd XMM1, XMM2;
508 movdqa [ESI -32], XMM0;
509 movdqa [ESI+16-32], XMM1;
510 cmp ESI, EDI;
511 jb startaddsse2a;
512
513 mov aptr, ESI;
514 }
515 }
516 }
517 else
518 // MMX version is 81% faster
519 if (mmx() && a.length >= 4)
520 {
521 auto n = aptr + (a.length & ~3);
522
523 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
524
525 asm
526 {
527 mov ESI, aptr;
528 mov EDI, n;
529 movq MM2, l;
530
531 align 4;
532 startmmx:
533 movq MM0, [ESI];
534 movq MM1, [ESI+8];
535 add ESI, 16;
536 paddd MM0, MM2;
537 paddd MM1, MM2;
538 movq [ESI -16], MM0;
539 movq [ESI+8-16], MM1;
540 cmp ESI, EDI;
541 jb startmmx;
542
543 emms;
544 mov aptr, ESI;
545 }
546 }
547 else
548 if (a.length >= 2)
549 {
550 auto n = aptr + (a.length & ~1);
551
552 asm
553 {
554 mov ESI, aptr;
555 mov EDI, n;
556 mov EDX, value;
557
558 align 4;
559 start386:
560 mov EBX, [ESI];
561 mov ECX, [ESI+4];
562 add ESI, 8;
563 add EBX, EDX;
564 add ECX, EDX;
565 mov [ESI -8], EBX;
566 mov [ESI+4-8], ECX;
567 cmp ESI, EDI;
568 jb start386;
569
570 mov aptr, ESI;
571 }
572 }
573 }
574
575 while (aptr < aend)
576 *aptr++ += value;
577
578 return a;
579 }
580
581 unittest
582 {
583 printf("_arrayExpSliceAddass_i unittest\n");
584
585 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
586 {
587 version (log) printf(" cpuid %d\n", cpuid);
588
589 for (int j = 0; j < 2; j++)
590 {
591 const int dim = 67;
592 T[] a = new T[dim + j]; // aligned on 16 byte boundary
593 a = a[j .. dim + j]; // misalign for second iteration
594 T[] b = new T[dim + j];
595 b = b[j .. dim + j];
596 T[] c = new T[dim + j];
597 c = c[j .. dim + j];
598
599 for (int i = 0; i < dim; i++)
600 { a[i] = cast(T)i;
601 b[i] = cast(T)(i + 7);
602 c[i] = cast(T)(i * 2);
603 }
604
605 a[] = c[];
606 a[] += 6;
607
608 for (int i = 0; i < dim; i++)
609 {
610 if (a[i] != cast(T)(c[i] + 6))
611 {
612 printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
613 assert(0);
614 }
615 }
616 }
617 }
618 }
619
620
621 /* ======================================================================== */
622
623 /***********************
624 * Computes:
625 * a[] += b[]
626 */
627
628 T[] _arraySliceSliceAddass_w(T[] a, T[] b)
629 {
630 return _arraySliceSliceAddass_i(a, b);
631 }
632
633 T[] _arraySliceSliceAddass_k(T[] a, T[] b)
634 {
635 return _arraySliceSliceAddass_i(a, b);
636 }
637
638 T[] _arraySliceSliceAddass_i(T[] a, T[] b)
639 in
640 {
641 assert (a.length == b.length);
642 assert (disjoint(a, b));
643 }
644 body
645 {
646 //printf("_arraySliceSliceAddass_i()\n");
647 auto aptr = a.ptr;
648 auto aend = aptr + a.length;
649 auto bptr = b.ptr;
650
651 version (D_InlineAsm_X86)
652 {
653 // SSE2 aligned version is 695% faster
654 if (sse2() && a.length >= 8)
655 {
656 auto n = aptr + (a.length & ~7);
657
658 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
659 {
660 asm // unaligned case
661 {
662 mov ESI, aptr;
663 mov EDI, n;
664 mov ECX, bptr;
665
666 align 4;
667 startsse2u:
668 movdqu XMM0, [ESI];
669 movdqu XMM2, [ECX];
670 movdqu XMM1, [ESI+16];
671 movdqu XMM3, [ECX+16];
672 add ESI, 32;
673 add ECX, 32;
674 paddd XMM0, XMM2;
675 paddd XMM1, XMM3;
676 movdqu [ESI -32], XMM0;
677 movdqu [ESI+16-32], XMM1;
678 cmp ESI, EDI;
679 jb startsse2u;
680
681 mov aptr, ESI;
682 mov bptr, ECX;
683 }
684 }
685 else
686 {
687 asm // aligned case
688 {
689 mov ESI, aptr;
690 mov EDI, n;
691 mov ECX, bptr;
692
693 align 4;
694 startsse2a:
695 movdqa XMM0, [ESI];
696 movdqa XMM2, [ECX];
697 movdqa XMM1, [ESI+16];
698 movdqa XMM3, [ECX+16];
699 add ESI, 32;
700 add ECX, 32;
701 paddd XMM0, XMM2;
702 paddd XMM1, XMM3;
703 movdqa [ESI -32], XMM0;
704 movdqa [ESI+16-32], XMM1;
705 cmp ESI, EDI;
706 jb startsse2a;
707
708 mov aptr, ESI;
709 mov bptr, ECX;
710 }
711 }
712 }
713 else
714 // MMX version is 471% faster
715 if (mmx() && a.length >= 4)
716 {
717 auto n = aptr + (a.length & ~3);
718
719 asm
720 {
721 mov ESI, aptr;
722 mov EDI, n;
723 mov ECX, bptr;
724
725 align 4;
726 startmmx:
727 movq MM0, [ESI];
728 movq MM2, [ECX];
729 movq MM1, [ESI+8];
730 movq MM3, [ECX+8];
731 add ESI, 16;
732 add ECX, 16;
733 paddd MM0, MM2;
734 paddd MM1, MM3;
735 movq [ESI -16], MM0;
736 movq [ESI+8-16], MM1;
737 cmp ESI, EDI;
738 jb startmmx;
739
740 emms;
741 mov aptr, ESI;
742 mov bptr, ECX;
743 }
744 }
745 }
746
747 normal:
748 while (aptr < aend)
749 *aptr++ += *bptr++;
750
751 return a;
752 }
753
754 unittest
755 {
756 printf("_arraySliceSliceAddass_i unittest\n");
757
758 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
759 {
760 version (log) printf(" cpuid %d\n", cpuid);
761
762 for (int j = 0; j < 2; j++)
763 {
764 const int dim = 67;
765 T[] a = new T[dim + j]; // aligned on 16 byte boundary
766 a = a[j .. dim + j]; // misalign for second iteration
767 T[] b = new T[dim + j];
768 b = b[j .. dim + j];
769 T[] c = new T[dim + j];
770 c = c[j .. dim + j];
771
772 for (int i = 0; i < dim; i++)
773 { a[i] = cast(T)i;
774 b[i] = cast(T)(i + 7);
775 c[i] = cast(T)(i * 2);
776 }
777
778 b[] = c[];
779 c[] += a[];
780
781 for (int i = 0; i < dim; i++)
782 {
783 if (c[i] != cast(T)(b[i] + a[i]))
784 {
785 printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
786 assert(0);
787 }
788 }
789 }
790 }
791 }
792
793
794 /* ======================================================================== */
795
796 /***********************
797 * Computes:
798 * a[] = b[] - value
799 */
800
801 T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
802 {
803 return _arraySliceExpMinSliceAssign_i(a, value, b);
804 }
805
806 T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
807 {
808 return _arraySliceExpMinSliceAssign_i(a, value, b);
809 }
810
811 T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
812 in
813 {
814 assert(a.length == b.length);
815 assert(disjoint(a, b));
816 }
817 body
818 {
819 //printf("_arraySliceExpMinSliceAssign_i()\n");
820 auto aptr = a.ptr;
821 auto aend = aptr + a.length;
822 auto bptr = b.ptr;
823
824 version (D_InlineAsm_X86)
825 {
826 // SSE2 aligned version is 400% faster
827 if (sse2() && a.length >= 8)
828 {
829 auto n = aptr + (a.length & ~7);
830
831 uint l = value;
832
833 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
834 {
835 asm // unaligned case
836 {
837 mov ESI, aptr;
838 mov EDI, n;
839 mov EAX, bptr;
840 movd XMM2, l;
841 pshufd XMM2, XMM2, 0;
842
843 align 4;
844 startaddsse2u:
845 add ESI, 32;
846 movdqu XMM0, [EAX];
847 movdqu XMM1, [EAX+16];
848 add EAX, 32;
849 psubd XMM0, XMM2;
850 psubd XMM1, XMM2;
851 movdqu [ESI -32], XMM0;
852 movdqu [ESI+16-32], XMM1;
853 cmp ESI, EDI;
854 jb startaddsse2u;
855
856 mov aptr, ESI;
857 mov bptr, EAX;
858 }
859 }
860 else
861 {
862 asm // aligned case
863 {
864 mov ESI, aptr;
865 mov EDI, n;
866 mov EAX, bptr;
867 movd XMM2, l;
868 pshufd XMM2, XMM2, 0;
869
870 align 4;
871 startaddsse2a:
872 add ESI, 32;
873 movdqa XMM0, [EAX];
874 movdqa XMM1, [EAX+16];
875 add EAX, 32;
876 psubd XMM0, XMM2;
877 psubd XMM1, XMM2;
878 movdqa [ESI -32], XMM0;
879 movdqa [ESI+16-32], XMM1;
880 cmp ESI, EDI;
881 jb startaddsse2a;
882
883 mov aptr, ESI;
884 mov bptr, EAX;
885 }
886 }
887 }
888 else
889 // MMX version is 315% faster
890 if (mmx() && a.length >= 4)
891 {
892 auto n = aptr + (a.length & ~3);
893
894 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
895
896 asm
897 {
898 mov ESI, aptr;
899 mov EDI, n;
900 mov EAX, bptr;
901 movq MM2, l;
902
903 align 4;
904 startmmx:
905 add ESI, 16;
906 movq MM0, [EAX];
907 movq MM1, [EAX+8];
908 add EAX, 16;
909 psubd MM0, MM2;
910 psubd MM1, MM2;
911 movq [ESI -16], MM0;
912 movq [ESI+8-16], MM1;
913 cmp ESI, EDI;
914 jb startmmx;
915
916 emms;
917 mov aptr, ESI;
918 mov bptr, EAX;
919 }
920 }
921 else
922 if (a.length >= 2)
923 {
924 auto n = aptr + (a.length & ~1);
925
926 asm
927 {
928 mov ESI, aptr;
929 mov EDI, n;
930 mov EAX, bptr;
931 mov EDX, value;
932
933 align 4;
934 start386:
935 add ESI, 8;
936 mov EBX, [EAX];
937 mov ECX, [EAX+4];
938 add EAX, 8;
939 sub EBX, EDX;
940 sub ECX, EDX;
941 mov [ESI -8], EBX;
942 mov [ESI+4-8], ECX;
943 cmp ESI, EDI;
944 jb start386;
945
946 mov aptr, ESI;
947 mov bptr, EAX;
948 }
949 }
950 }
951
952 while (aptr < aend)
953 *aptr++ = *bptr++ - value;
954
955 return a;
956 }
957
958 unittest
959 {
960 printf("_arraySliceExpMinSliceAssign_i unittest\n");
961
962 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
963 {
964 version (log) printf(" cpuid %d\n", cpuid);
965
966 for (int j = 0; j < 2; j++)
967 {
968 const int dim = 67;
969 T[] a = new T[dim + j]; // aligned on 16 byte boundary
970 a = a[j .. dim + j]; // misalign for second iteration
971 T[] b = new T[dim + j];
972 b = b[j .. dim + j];
973 T[] c = new T[dim + j];
974 c = c[j .. dim + j];
975
976 for (int i = 0; i < dim; i++)
977 { a[i] = cast(T)i;
978 b[i] = cast(T)(i + 7);
979 c[i] = cast(T)(i * 2);
980 }
981
982 c[] = a[] - 6;
983
984 for (int i = 0; i < dim; i++)
985 {
986 if (c[i] != cast(T)(a[i] - 6))
987 {
988 printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
989 assert(0);
990 }
991 }
992 }
993 }
994 }
995
996
997 /* ======================================================================== */
998
999 /***********************
1000 * Computes:
1001 * a[] = value - b[]
1002 */
1003
1004 T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
1005 {
1006 return _arrayExpSliceMinSliceAssign_i(a, b, value);
1007 }
1008
1009 T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
1010 {
1011 return _arrayExpSliceMinSliceAssign_i(a, b, value);
1012 }
1013
1014 T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
1015 in
1016 {
1017 assert(a.length == b.length);
1018 assert(disjoint(a, b));
1019 }
1020 body
1021 {
1022 //printf("_arrayExpSliceMinSliceAssign_i()\n");
1023 auto aptr = a.ptr;
1024 auto aend = aptr + a.length;
1025 auto bptr = b.ptr;
1026
1027 version (D_InlineAsm_X86)
1028 {
1029 // SSE2 aligned version is 1812% faster
1030 if (sse2() && a.length >= 8)
1031 {
1032 auto n = aptr + (a.length & ~7);
1033
1034 uint l = value;
1035
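// Subtraction is not commutative, so XMM4 keeps the broadcast value and
// fresh copies are moved into XMM0/XMM1 each iteration to subtract the
// loaded elements from it.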
1036 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1037 {
1038 asm // unaligned case
1039 {
1040 mov ESI, aptr;
1041 mov EDI, n;
1042 mov EAX, bptr;
1043 movd XMM4, l;
1044 pshufd XMM4, XMM4, 0;
1045
1046 align 4;
1047 startaddsse2u:
1048 add ESI, 32;
1049 movdqu XMM2, [EAX];
1050 movdqu XMM3, [EAX+16];
1051 movdqa XMM0, XMM4;
1052 movdqa XMM1, XMM4;
1053 add EAX, 32;
1054 psubd XMM0, XMM2;
1055 psubd XMM1, XMM3;
1056 movdqu [ESI -32], XMM0;
1057 movdqu [ESI+16-32], XMM1;
1058 cmp ESI, EDI;
1059 jb startaddsse2u;
1060
1061 mov aptr, ESI;
1062 mov bptr, EAX;
1063 }
1064 }
1065 else
1066 {
1067 asm // aligned case
1068 {
1069 mov ESI, aptr;
1070 mov EDI, n;
1071 mov EAX, bptr;
1072 movd XMM4, l;
1073 pshufd XMM4, XMM4, 0;
1074
1075 align 4;
1076 startaddsse2a:
1077 add ESI, 32;
1078 movdqa XMM2, [EAX];
1079 movdqa XMM3, [EAX+16];
1080 movdqa XMM0, XMM4;
1081 movdqa XMM1, XMM4;
1082 add EAX, 32;
1083 psubd XMM0, XMM2;
1084 psubd XMM1, XMM3;
1085 movdqa [ESI -32], XMM0;
1086 movdqa [ESI+16-32], XMM1;
1087 cmp ESI, EDI;
1088 jb startaddsse2a;
1089
1090 mov aptr, ESI;
1091 mov bptr, EAX;
1092 }
1093 }
1094 }
1095 else
1096 // MMX version is 1077% faster
1097 if (mmx() && a.length >= 4)
1098 {
1099 auto n = aptr + (a.length & ~3);
1100
1101 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1102
1103 asm
1104 {
1105 mov ESI, aptr;
1106 mov EDI, n;
1107 mov EAX, bptr;
1108 movq MM4, l;
1109
1110 align 4;
1111 startmmx:
1112 add ESI, 16;
1113 movq MM2, [EAX];
1114 movq MM3, [EAX+8];
1115 movq MM0, MM4;
1116 movq MM1, MM4;
1117 add EAX, 16;
1118 psubd MM0, MM2;
1119 psubd MM1, MM3;
1120 movq [ESI -16], MM0;
1121 movq [ESI+8-16], MM1;
1122 cmp ESI, EDI;
1123 jb startmmx;
1124
1125 emms;
1126 mov aptr, ESI;
1127 mov bptr, EAX;
1128 }
1129 }
1130 }
1131
1132 while (aptr < aend)
1133 *aptr++ = value - *bptr++;
1134
1135 return a;
1136 }
1137
1138 unittest
1139 {
1140 printf("_arrayExpSliceMinSliceAssign_i unittest\n");
1141
1142 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1143 {
1144 version (log) printf(" cpuid %d\n", cpuid);
1145
1146 for (int j = 0; j < 2; j++)
1147 {
1148 const int dim = 67;
1149 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1150 a = a[j .. dim + j]; // misalign for second iteration
1151 T[] b = new T[dim + j];
1152 b = b[j .. dim + j];
1153 T[] c = new T[dim + j];
1154 c = c[j .. dim + j];
1155
1156 for (int i = 0; i < dim; i++)
1157 { a[i] = cast(T)i;
1158 b[i] = cast(T)(i + 7);
1159 c[i] = cast(T)(i * 2);
1160 }
1161
1162 c[] = 6 - a[];
1163
1164 for (int i = 0; i < dim; i++)
1165 {
1166 if (c[i] != cast(T)(6 - a[i]))
1167 {
1168 printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
1169 assert(0);
1170 }
1171 }
1172 }
1173 }
1174 }
1175
1176
1177 /* ======================================================================== */
1178
1179 /***********************
1180 * Computes:
1181 * a[] = b[] - c[]
1182 */
1183
1184 T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
1185 {
1186 return _arraySliceSliceMinSliceAssign_i(a, c, b);
1187 }
1188
1189 T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
1190 {
1191 return _arraySliceSliceMinSliceAssign_i(a, c, b);
1192 }
1193
1194 T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
1195 in
1196 {
1197 assert(a.length == b.length && b.length == c.length);
1198 assert(disjoint(a, b));
1199 assert(disjoint(a, c));
1200 assert(disjoint(b, c));
1201 }
1202 body
1203 {
1204 auto aptr = a.ptr;
1205 auto aend = aptr + a.length;
1206 auto bptr = b.ptr;
1207 auto cptr = c.ptr;
1208
1209 version (D_InlineAsm_X86)
1210 {
1211 // SSE2 aligned version is 1721% faster
1212 if (sse2() && a.length >= 8)
1213 {
1214 auto n = aptr + (a.length & ~7);
1215
1216 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1217 {
1218 asm // unaligned case
1219 {
1220 mov ESI, aptr;
1221 mov EDI, n;
1222 mov EAX, bptr;
1223 mov ECX, cptr;
1224
1225 align 4;
1226 startsse2u:
1227 add ESI, 32;
1228 movdqu XMM0, [EAX];
1229 movdqu XMM2, [ECX];
1230 movdqu XMM1, [EAX+16];
1231 movdqu XMM3, [ECX+16];
1232 add EAX, 32;
1233 add ECX, 32;
1234 psubd XMM0, XMM2;
1235 psubd XMM1, XMM3;
1236 movdqu [ESI -32], XMM0;
1237 movdqu [ESI+16-32], XMM1;
1238 cmp ESI, EDI;
1239 jb startsse2u;
1240
1241 mov aptr, ESI;
1242 mov bptr, EAX;
1243 mov cptr, ECX;
1244 }
1245 }
1246 else
1247 {
1248 asm // aligned case
1249 {
1250 mov ESI, aptr;
1251 mov EDI, n;
1252 mov EAX, bptr;
1253 mov ECX, cptr;
1254
1255 align 4;
1256 startsse2a:
1257 add ESI, 32;
1258 movdqa XMM0, [EAX];
1259 movdqa XMM2, [ECX];
1260 movdqa XMM1, [EAX+16];
1261 movdqa XMM3, [ECX+16];
1262 add EAX, 32;
1263 add ECX, 32;
1264 psubd XMM0, XMM2;
1265 psubd XMM1, XMM3;
1266 movdqa [ESI -32], XMM0;
1267 movdqa [ESI+16-32], XMM1;
1268 cmp ESI, EDI;
1269 jb startsse2a;
1270
1271 mov aptr, ESI;
1272 mov bptr, EAX;
1273 mov cptr, ECX;
1274 }
1275 }
1276 }
1277 else
1278 // MMX version is 1002% faster
1279 if (mmx() && a.length >= 4)
1280 {
1281 auto n = aptr + (a.length & ~3);
1282
1283 asm
1284 {
1285 mov ESI, aptr;
1286 mov EDI, n;
1287 mov EAX, bptr;
1288 mov ECX, cptr;
1289
1290 align 4;
1291 startmmx:
1292 add ESI, 16;
1293 movq MM0, [EAX];
1294 movq MM2, [ECX];
1295 movq MM1, [EAX+8];
1296 movq MM3, [ECX+8];
1297 add EAX, 16;
1298 add ECX, 16;
1299 psubd MM0, MM2;
1300 psubd MM1, MM3;
1301 movq [ESI -16], MM0;
1302 movq [ESI+8-16], MM1;
1303 cmp ESI, EDI;
1304 jb startmmx;
1305
1306 emms;
1307 mov aptr, ESI;
1308 mov bptr, EAX;
1309 mov cptr, ECX;
1310 }
1311 }
1312 }
1313
1314 while (aptr < aend)
1315 *aptr++ = *bptr++ - *cptr++;
1316
1317 return a;
1318 }
1319
1320 unittest
1321 {
1322 printf("_arraySliceSliceMinSliceAssign_i unittest\n");
1323
1324 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1325 {
1326 version (log) printf(" cpuid %d\n", cpuid);
1327
1328 for (int j = 0; j < 2; j++)
1329 {
1330 const int dim = 67;
1331 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1332 a = a[j .. dim + j]; // misalign for second iteration
1333 T[] b = new T[dim + j];
1334 b = b[j .. dim + j];
1335 T[] c = new T[dim + j];
1336 c = c[j .. dim + j];
1337
1338 for (int i = 0; i < dim; i++)
1339 { a[i] = cast(T)i;
1340 b[i] = cast(T)(i + 7);
1341 c[i] = cast(T)(i * 2);
1342 }
1343
1344 c[] = a[] - b[];
1345
1346 for (int i = 0; i < dim; i++)
1347 {
1348 if (c[i] != cast(T)(a[i] - b[i]))
1349 {
1350 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1351 assert(0);
1352 }
1353 }
1354 }
1355 }
1356 }
1357
1358
1359 /* ======================================================================== */
1360
1361 /***********************
1362 * Computes:
1363 * a[] -= value
1364 */
1365
1366 T[] _arrayExpSliceMinass_w(T[] a, T value)
1367 {
1368 return _arrayExpSliceMinass_i(a, value);
1369 }
1370
1371 T[] _arrayExpSliceMinass_k(T[] a, T value)
1372 {
1373 return _arrayExpSliceMinass_i(a, value);
1374 }
1375
1376 T[] _arrayExpSliceMinass_i(T[] a, T value)
1377 {
1378 //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1379 auto aptr = a.ptr;
1380 auto aend = aptr + a.length;
1381
1382 version (D_InlineAsm_X86)
1383 {
1384 // SSE2 aligned version is 81% faster
1385 if (sse2() && a.length >= 8)
1386 {
1387 auto n = aptr + (a.length & ~7);
1388
1389 uint l = value;
1390
1391 if (((cast(uint) aptr) & 15) != 0)
1392 {
1393 asm // unaligned case
1394 {
1395 mov ESI, aptr;
1396 mov EDI, n;
1397 movd XMM2, l;
1398 pshufd XMM2, XMM2, 0;
1399
1400 align 4;
1401 startaddsse2u:
1402 movdqu XMM0, [ESI];
1403 movdqu XMM1, [ESI+16];
1404 add ESI, 32;
1405 psubd XMM0, XMM2;
1406 psubd XMM1, XMM2;
1407 movdqu [ESI -32], XMM0;
1408 movdqu [ESI+16-32], XMM1;
1409 cmp ESI, EDI;
1410 jb startaddsse2u;
1411
1412 mov aptr, ESI;
1413 }
1414 }
1415 else
1416 {
1417 asm // aligned case
1418 {
1419 mov ESI, aptr;
1420 mov EDI, n;
1421 movd XMM2, l;
1422 pshufd XMM2, XMM2, 0;
1423
1424 align 4;
1425 startaddsse2a:
1426 movdqa XMM0, [ESI];
1427 movdqa XMM1, [ESI+16];
1428 add ESI, 32;
1429 psubd XMM0, XMM2;
1430 psubd XMM1, XMM2;
1431 movdqa [ESI -32], XMM0;
1432 movdqa [ESI+16-32], XMM1;
1433 cmp ESI, EDI;
1434 jb startaddsse2a;
1435
1436 mov aptr, ESI;
1437 }
1438 }
1439 }
1440 else
1441 // MMX version is 81% faster
1442 if (mmx() && a.length >= 4)
1443 {
1444 auto n = aptr + (a.length & ~3);
1445
1446 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1447
1448 asm
1449 {
1450 mov ESI, aptr;
1451 mov EDI, n;
1452 movq MM2, l;
1453
1454 align 4;
1455 startmmx:
1456 movq MM0, [ESI];
1457 movq MM1, [ESI+8];
1458 add ESI, 16;
1459 psubd MM0, MM2;
1460 psubd MM1, MM2;
1461 movq [ESI -16], MM0;
1462 movq [ESI+8-16], MM1;
1463 cmp ESI, EDI;
1464 jb startmmx;
1465
1466 emms;
1467 mov aptr, ESI;
1468 }
1469 }
1470 else
1471 if (a.length >= 2)
1472 {
1473 auto n = aptr + (a.length & ~1);
1474
1475 asm
1476 {
1477 mov ESI, aptr;
1478 mov EDI, n;
1479 mov EDX, value;
1480
1481 align 4;
1482 start386:
1483 mov EBX, [ESI];
1484 mov ECX, [ESI+4];
1485 add ESI, 8;
1486 sub EBX, EDX;
1487 sub ECX, EDX;
1488 mov [ESI -8], EBX;
1489 mov [ESI+4-8], ECX;
1490 cmp ESI, EDI;
1491 jb start386;
1492
1493 mov aptr, ESI;
1494 }
1495 }
1496 }
1497
1498 while (aptr < aend)
1499 *aptr++ -= value;
1500
1501 return a;
1502 }
1503
1504 unittest
1505 {
1506 printf("_arrayExpSliceMinass_i unittest\n");
1507
1508 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1509 {
1510 version (log) printf(" cpuid %d\n", cpuid);
1511
1512 for (int j = 0; j < 2; j++)
1513 {
1514 const int dim = 67;
1515 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1516 a = a[j .. dim + j]; // misalign for second iteration
1517 T[] b = new T[dim + j];
1518 b = b[j .. dim + j];
1519 T[] c = new T[dim + j];
1520 c = c[j .. dim + j];
1521
1522 for (int i = 0; i < dim; i++)
1523 { a[i] = cast(T)i;
1524 b[i] = cast(T)(i + 7);
1525 c[i] = cast(T)(i * 2);
1526 }
1527
1528 a[] = c[];
1529 a[] -= 6;
1530
1531 for (int i = 0; i < dim; i++)
1532 {
1533 if (a[i] != cast(T)(c[i] - 6))
1534 {
1535 printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
1536 assert(0);
1537 }
1538 }
1539 }
1540 }
1541 }
1542
1543
1544 /* ======================================================================== */
1545
1546 /***********************
1547 * Computes:
1548 * a[] -= b[]
1549 */
1550
1551 T[] _arraySliceSliceMinass_w(T[] a, T[] b)
1552 {
1553 return _arraySliceSliceMinass_i(a, b);
1554 }
1555
1556 T[] _arraySliceSliceMinass_k(T[] a, T[] b)
1557 {
1558 return _arraySliceSliceMinass_i(a, b);
1559 }
1560
1561 T[] _arraySliceSliceMinass_i(T[] a, T[] b)
1562 in
1563 {
1564 assert (a.length == b.length);
1565 assert (disjoint(a, b));
1566 }
1567 body
1568 {
1569 //printf("_arraySliceSliceMinass_i()\n");
1570 auto aptr = a.ptr;
1571 auto aend = aptr + a.length;
1572 auto bptr = b.ptr;
1573
1574 version (D_InlineAsm_X86)
1575 {
1576 // SSE2 aligned version is 731% faster
1577 if (sse2() && a.length >= 8)
1578 {
1579 auto n = aptr + (a.length & ~7);
1580
1581 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1582 {
1583 asm // unaligned case
1584 {
1585 mov ESI, aptr;
1586 mov EDI, n;
1587 mov ECX, bptr;
1588
1589 align 4;
1590 startsse2u:
1591 movdqu XMM0, [ESI];
1592 movdqu XMM2, [ECX];
1593 movdqu XMM1, [ESI+16];
1594 movdqu XMM3, [ECX+16];
1595 add ESI, 32;
1596 add ECX, 32;
1597 psubd XMM0, XMM2;
1598 psubd XMM1, XMM3;
1599 movdqu [ESI -32], XMM0;
1600 movdqu [ESI+16-32], XMM1;
1601 cmp ESI, EDI;
1602 jb startsse2u;
1603
1604 mov aptr, ESI;
1605 mov bptr, ECX;
1606 }
1607 }
1608 else
1609 {
1610 asm // aligned case
1611 {
1612 mov ESI, aptr;
1613 mov EDI, n;
1614 mov ECX, bptr;
1615
1616 align 4;
1617 startsse2a:
1618 movdqa XMM0, [ESI];
1619 movdqa XMM2, [ECX];
1620 movdqa XMM1, [ESI+16];
1621 movdqa XMM3, [ECX+16];
1622 add ESI, 32;
1623 add ECX, 32;
1624 psubd XMM0, XMM2;
1625 psubd XMM1, XMM3;
1626 movdqa [ESI -32], XMM0;
1627 movdqa [ESI+16-32], XMM1;
1628 cmp ESI, EDI;
1629 jb startsse2a;
1630
1631 mov aptr, ESI;
1632 mov bptr, ECX;
1633 }
1634 }
1635 }
1636 else
1637 // MMX version is 441% faster
1638 if (mmx() && a.length >= 4)
1639 {
1640 auto n = aptr + (a.length & ~3);
1641
1642 asm
1643 {
1644 mov ESI, aptr;
1645 mov EDI, n;
1646 mov ECX, bptr;
1647
1648 align 4;
1649 startmmx:
1650 movq MM0, [ESI];
1651 movq MM2, [ECX];
1652 movq MM1, [ESI+8];
1653 movq MM3, [ECX+8];
1654 add ESI, 16;
1655 add ECX, 16;
1656 psubd MM0, MM2;
1657 psubd MM1, MM3;
1658 movq [ESI -16], MM0;
1659 movq [ESI+8-16], MM1;
1660 cmp ESI, EDI;
1661 jb startmmx;
1662
1663 emms;
1664 mov aptr, ESI;
1665 mov bptr, ECX;
1666 }
1667 }
1668 }
1669
1670 while (aptr < aend)
1671 *aptr++ -= *bptr++;
1672
1673 return a;
1674 }
1675
1676 unittest
1677 {
1678 printf("_arraySliceSliceMinass_i unittest\n");
1679
1680 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1681 {
1682 version (log) printf(" cpuid %d\n", cpuid);
1683
1684 for (int j = 0; j < 2; j++)
1685 {
1686 const int dim = 67;
1687 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1688 a = a[j .. dim + j]; // misalign for second iteration
1689 T[] b = new T[dim + j];
1690 b = b[j .. dim + j];
1691 T[] c = new T[dim + j];
1692 c = c[j .. dim + j];
1693
1694 for (int i = 0; i < dim; i++)
1695 { a[i] = cast(T)i;
1696 b[i] = cast(T)(i + 7);
1697 c[i] = cast(T)(i * 2);
1698 }
1699
1700 b[] = c[];
1701 c[] -= a[];
1702
1703 for (int i = 0; i < dim; i++)
1704 {
1705 if (c[i] != cast(T)(b[i] - a[i]))
1706 {
1707 printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
1708 assert(0);
1709 }
1710 }
1711 }
1712 }
1713 }
1714
1715
1716 /* ======================================================================== */
1717
1718 /***********************
1719 * Computes:
1720 * a[] = b[] * value
1721 */
1722
1723 T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
1724 {
1725 return _arraySliceExpMulSliceAssign_i(a, value, b);
1726 }
1727
1728 T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
1729 {
1730 return _arraySliceExpMulSliceAssign_i(a, value, b);
1731 }
1732
1733 T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
1734 in
1735 {
1736 assert(a.length == b.length);
1737 assert(disjoint(a, b));
1738 }
1739 body
1740 {
1741 //printf("_arraySliceExpMulSliceAssign_i()\n");
1742 auto aptr = a.ptr;
1743 auto aend = aptr + a.length;
1744 auto bptr = b.ptr;
1745
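/* The blocks below are disabled because SSE2/MMX have no packed 32-bit
 * low-multiply. An element-wise multiply is still possible with extra
 * shuffling; untested sketch for one XMM register, assuming XMM0 holds
 * four ints and XMM2 the broadcast multiplier:
 *
 *      movdqa    XMM1, XMM0;
 *      movdqa    XMM3, XMM2;
 *      psrldq    XMM1, 4;          // odd lanes into even positions
 *      psrldq    XMM3, 4;
 *      pmuludq   XMM0, XMM2;       // 64-bit products of lanes 0 and 2
 *      pmuludq   XMM1, XMM3;       // 64-bit products of lanes 1 and 3
 *      pshufd    XMM0, XMM0, 8;    // keep the low dword of each product
 *      pshufd    XMM1, XMM1, 8;
 *      punpckldq XMM0, XMM1;       // a0*b0, a1*b1, a2*b2, a3*b3
 *
 * SSE4.1 later added pmulld, which does this in a single instruction.
 */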
1746 version (none) // disabled: MMX/SSE2 have no packed 32-bit multiply; pmuludq only multiplies the even lanes into 64-bit products
1747 {
1748 version (D_InlineAsm_X86)
1749 {
1750 // SSE2 aligned version is 1380% faster
1751 if (sse2() && a.length >= 8)
1752 {
1753 auto n = aptr + (a.length & ~7);
1754
1755 uint l = value;
1756
1757 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1758 {
1759 asm
1760 {
1761 mov ESI, aptr;
1762 mov EDI, n;
1763 mov EAX, bptr;
1764 movd XMM2, l;
1765 pshufd XMM2, XMM2, 0;
1766
1767 align 4;
1768 startsse2u:
1769 add ESI, 32;
1770 movdqu XMM0, [EAX];
1771 movdqu XMM1, [EAX+16];
1772 add EAX, 32;
1773 pmuludq XMM0, XMM2;
1774 pmuludq XMM1, XMM2;
1775 movdqu [ESI -32], XMM0;
1776 movdqu [ESI+16-32], XMM1;
1777 cmp ESI, EDI;
1778 jb startsse2u;
1779
1780 mov aptr, ESI;
1781 mov bptr, EAX;
1782 }
1783 }
1784 else
1785 {
1786 asm
1787 {
1788 mov ESI, aptr;
1789 mov EDI, n;
1790 mov EAX, bptr;
1791 movd XMM2, l;
1792 pshufd XMM2, XMM2, 0;
1793
1794 align 4;
1795 startsse2a:
1796 add ESI, 32;
1797 movdqa XMM0, [EAX];
1798 movdqa XMM1, [EAX+16];
1799 add EAX, 32;
1800 pmuludq XMM0, XMM2;
1801 pmuludq XMM1, XMM2;
1802 movdqa [ESI -32], XMM0;
1803 movdqa [ESI+16-32], XMM1;
1804 cmp ESI, EDI;
1805 jb startsse2a;
1806
1807 mov aptr, ESI;
1808 mov bptr, EAX;
1809 }
1810 }
1811 }
1812 else
1813 {
1814 // MMX version is 1380% faster
1815 if (mmx() && a.length >= 4)
1816 {
1817 auto n = aptr + (a.length & ~3);
1818
1819 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1820
1821 asm
1822 {
1823 mov ESI, aptr;
1824 mov EDI, n;
1825 mov EAX, bptr;
1826 movq MM2, l;
1827
1828 align 4;
1829 startmmx:
1830 add ESI, 16;
1831 movq MM0, [EAX];
1832 movq MM1, [EAX+8];
1833 add EAX, 16;
1834 pmuludq MM0, MM2; // MMX form multiplies only the low dwords into one 64-bit product
1835 pmuludq MM1, MM2;
1836 movq [ESI -16], MM0;
1837 movq [ESI+8-16], MM1;
1838 cmp ESI, EDI;
1839 jb startmmx;
1840
1841 emms;
1842 mov aptr, ESI;
1843 mov bptr, EAX;
1844 }
1845 }
1846 }
1847 }
1848 }
1849
1850 while (aptr < aend)
1851 *aptr++ = *bptr++ * value;
1852
1853 return a;
1854 }
1855
1856 unittest
1857 {
1858 printf("_arraySliceExpMulSliceAssign_i unittest\n");
1859
1860 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1861 {
1862 version (log) printf(" cpuid %d\n", cpuid);
1863
1864 for (int j = 0; j < 2; j++)
1865 {
1866 const int dim = 67;
1867 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1868 a = a[j .. dim + j]; // misalign for second iteration
1869 T[] b = new T[dim + j];
1870 b = b[j .. dim + j];
1871 T[] c = new T[dim + j];
1872 c = c[j .. dim + j];
1873
1874 for (int i = 0; i < dim; i++)
1875 { a[i] = cast(T)i;
1876 b[i] = cast(T)(i + 7);
1877 c[i] = cast(T)(i * 2);
1878 }
1879
1880 c[] = a[] * 6;
1881
1882 for (int i = 0; i < dim; i++)
1883 {
1884 //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
1885 if (c[i] != cast(T)(a[i] * 6))
1886 {
1887 printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
1888 assert(0);
1889 }
1890 }
1891 }
1892 }
1893 }
1894
1895
1896 /* ======================================================================== */
1897
1898 /***********************
1899 * Computes:
1900 * a[] = b[] * c[]
1901 */
1902
1903 T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
1904 {
1905 return _arraySliceSliceMulSliceAssign_i(a, c, b);
1906 }
1907
1908 T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
1909 {
1910 return _arraySliceSliceMulSliceAssign_i(a, c, b);
1911 }
1912
1913 T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
1914 in
1915 {
1916 assert(a.length == b.length && b.length == c.length);
1917 assert(disjoint(a, b));
1918 assert(disjoint(a, c));
1919 assert(disjoint(b, c));
1920 }
1921 body
1922 {
1923 //printf("_arraySliceSliceMulSliceAssign_i()\n");
1924 auto aptr = a.ptr;
1925 auto aend = aptr + a.length;
1926 auto bptr = b.ptr;
1927 auto cptr = c.ptr;
1928
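// Disabled for the same reason as _arraySliceExpMulSliceAssign_i above:
// pmuludq cannot perform an element-wise 32-bit multiply.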
1929 version (none)
1930 {
1931 version (D_InlineAsm_X86)
1932 {
1933 // SSE2 aligned version is 1407% faster
1934 if (sse2() && a.length >= 8)
1935 {
1936 auto n = aptr + (a.length & ~7);
1937
1938 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1939 {
1940 asm
1941 {
1942 mov ESI, aptr;
1943 mov EDI, n;
1944 mov EAX, bptr;
1945 mov ECX, cptr;
1946
1947 align 4;
1948 startsse2u:
1949 add ESI, 32;
1950 movdqu XMM0, [EAX];
1951 movdqu XMM2, [ECX];
1952 movdqu XMM1, [EAX+16];
1953 movdqu XMM3, [ECX+16];
1954 add EAX, 32;
1955 add ECX, 32;
1956 pmuludq XMM0, XMM2;
1957 pmuludq XMM1, XMM3;
1958 movdqu [ESI -32], XMM0;
1959 movdqu [ESI+16-32], XMM1;
1960 cmp ESI, EDI;
1961 jb startsse2u;
1962
1963 mov aptr, ESI;
1964 mov bptr, EAX;
1965 mov cptr, ECX;
1966 }
1967 }
1968 else
1969 {
1970 asm
1971 {
1972 mov ESI, aptr;
1973 mov EDI, n;
1974 mov EAX, bptr;
1975 mov ECX, cptr;
1976
1977 align 4;
1978 startsse2a:
1979 add ESI, 32;
1980 movdqa XMM0, [EAX];
1981 movdqa XMM2, [ECX];
1982 movdqa XMM1, [EAX+16];
1983 movdqa XMM3, [ECX+16];
1984 add EAX, 32;
1985 add ECX, 32;
1986 pmuludq XMM0, XMM2;
1987 pmuludq XMM1, XMM3;
1988 movdqa [ESI -32], XMM0;
1989 movdqa [ESI+16-32], XMM1;
1990 cmp ESI, EDI;
1991 jb startsse2a;
1992
1993 mov aptr, ESI;
1994 mov bptr, EAX;
1995 mov cptr, ECX;
1996 }
1997 }
1998 }
1999 else
2000 // MMX version is 1029% faster
2001 if (mmx() && a.length >= 4)
2002 {
2003 auto n = aptr + (a.length & ~3);
2004
2005 asm
2006 {
2007 mov ESI, aptr;
2008 mov EDI, n;
2009 mov EAX, bptr;
2010 mov ECX, cptr;
2011
2012 align 4;
2013 startmmx:
2014 add ESI, 16;
2015 movq MM0, [EAX];
2016 movq MM2, [ECX];
2017 movq MM1, [EAX+8];
2018 movq MM3, [ECX+8];
2019 add EAX, 16;
2020 add ECX, 16;
2021 pmuludq MM0, MM2;
2022 pmuludq MM1, MM3;
2023 movq [ESI -16], MM0;
2024 movq [ESI+8-16], MM1;
2025 cmp ESI, EDI;
2026 jb startmmx;
2027
2028 emms;
2029 mov aptr, ESI;
2030 mov bptr, EAX;
2031 mov cptr, ECX;
2032 }
2033 }
2034 }
2035 }
2036
2037 while (aptr < aend)
2038 *aptr++ = *bptr++ * *cptr++;
2039
2040 return a;
2041 }
2042
2043 unittest
2044 {
2045 printf("_arraySliceSliceMulSliceAssign_i unittest\n");
2046
2047 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2048 {
2049 version (log) printf(" cpuid %d\n", cpuid);
2050
2051 for (int j = 0; j < 2; j++)
2052 {
2053 const int dim = 67;
2054 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2055 a = a[j .. dim + j]; // misalign for second iteration
2056 T[] b = new T[dim + j];
2057 b = b[j .. dim + j];
2058 T[] c = new T[dim + j];
2059 c = c[j .. dim + j];
2060
2061 for (int i = 0; i < dim; i++)
2062 { a[i] = cast(T)i;
2063 b[i] = cast(T)(i + 7);
2064 c[i] = cast(T)(i * 2);
2065 }
2066
2067 c[] = a[] * b[];
2068
2069 for (int i = 0; i < dim; i++)
2070 {
2071 if (c[i] != cast(T)(a[i] * b[i]))
2072 {
2073 printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
2074 assert(0);
2075 }
2076 }
2077 }
2078 }
2079 }
2080
2081
2082 /* ======================================================================== */
2083
2084 /***********************
2085 * Computes:
2086 * a[] *= value
2087 */
2088
2089 T[] _arrayExpSliceMulass_w(T[] a, T value)
2090 {
2091 return _arrayExpSliceMulass_i(a, value);
2092 }
2093
2094 T[] _arrayExpSliceMulass_k(T[] a, T value)
2095 {
2096 return _arrayExpSliceMulass_i(a, value);
2097 }
2098
2099 T[] _arrayExpSliceMulass_i(T[] a, T value)
2100 {
2101 //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
2102 auto aptr = a.ptr;
2103 auto aend = aptr + a.length;
2104
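// Disabled: see the pmuludq note at _arraySliceExpMulSliceAssign_i.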
2105 version (none)
2106 {
2107 version (D_InlineAsm_X86)
2108 {
2109 // SSE2 aligned version is 400% faster
2110 if (sse2() && a.length >= 8)
2111 {
2112 auto n = aptr + (a.length & ~7);
2113
2114 uint l = value;
2115
2116 if (((cast(uint) aptr) & 15) != 0)
2117 {
2118 asm
2119 {
2120 mov ESI, aptr;
2121 mov EDI, n;
2122 movd XMM2, l;
2123 pshufd XMM2, XMM2, 0;
2124
2125 align 4;
2126 startsse2u:
2127 movdqu XMM0, [ESI];
2128 movdqu XMM1, [ESI+16];
2129 add ESI, 32;
2130 pmuludq XMM0, XMM2;
2131 pmuludq XMM1, XMM2;
2132 movdqu [ESI -32], XMM0;
2133 movdqu [ESI+16-32], XMM1;
2134 cmp ESI, EDI;
2135 jb startsse2u;
2136
2137 mov aptr, ESI;
2138 }
2139 }
2140 else
2141 {
2142 asm
2143 {
2144 mov ESI, aptr;
2145 mov EDI, n;
2146 movd XMM2, l;
2147 pshufd XMM2, XMM2, 0;
2148
2149 align 4;
2150 startsse2a:
2151 movdqa XMM0, [ESI];
2152 movdqa XMM1, [ESI+16];
2153 add ESI, 32;
2154 pmuludq XMM0, XMM2;
2155 pmuludq XMM1, XMM2;
2156 movdqa [ESI -32], XMM0;
2157 movdqa [ESI+16-32], XMM1;
2158 cmp ESI, EDI;
2159 jb startsse2a;
2160
2161 mov aptr, ESI;
2162 }
2163 }
2164 }
2165 else
2166 // MMX version is 402% faster
2167 if (mmx() && a.length >= 4)
2168 {
2169 auto n = aptr + (a.length & ~3);
2170
2171 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
2172
2173 asm
2174 {
2175 mov ESI, aptr;
2176 mov EDI, n;
2177 movq MM2, l;
2178
2179 align 4;
2180 startmmx:
2181 movq MM0, [ESI];
2182 movq MM1, [ESI+8];
2183 add ESI, 16;
2184 pmuludq MM0, MM2;
2185 pmuludq MM1, MM2;
2186 movq [ESI -16], MM0;
2187 movq [ESI+8-16], MM1;
2188 cmp ESI, EDI;
2189 jb startmmx;
2190
2191 emms;
2192 mov aptr, ESI;
2193 }
2194 }
2195 }
2196 }
2197
2198 while (aptr < aend)
2199 *aptr++ *= value;
2200
2201 return a;
2202 }
2203
2204 unittest
2205 {
2206 printf("_arrayExpSliceMulass_i unittest\n");
2207
2208 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2209 {
2210 version (log) printf(" cpuid %d\n", cpuid);
2211
2212 for (int j = 0; j < 2; j++)
2213 {
2214 const int dim = 67;
2215 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2216 a = a[j .. dim + j]; // misalign for second iteration
2217 T[] b = new T[dim + j];
2218 b = b[j .. dim + j];
2219 T[] c = new T[dim + j];
2220 c = c[j .. dim + j];
2221
2222 for (int i = 0; i < dim; i++)
2223 { a[i] = cast(T)i;
2224 b[i] = cast(T)(i + 7);
2225 c[i] = cast(T)(i * 2);
2226 }
2227
2228 b[] = a[];
2229 a[] *= 6;
2230
2231 for (int i = 0; i < dim; i++)
2232 {
2233 if (a[i] != cast(T)(b[i] * 6))
2234 {
2235 printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
2236 assert(0);
2237 }
2238 }
2239 }
2240 }
2241 }
2242
2243
2244 /* ======================================================================== */
2245
2246 /***********************
2247 * Computes:
2248 * a[] *= b[]
2249 */
2250
2251 T[] _arraySliceSliceMulass_w(T[] a, T[] b)
2252 {
2253 return _arraySliceSliceMulass_i(a, b);
2254 }
2255
2256 T[] _arraySliceSliceMulass_k(T[] a, T[] b)
2257 {
2258 return _arraySliceSliceMulass_i(a, b);
2259 }
2260
2261 T[] _arraySliceSliceMulass_i(T[] a, T[] b)
2262 in
2263 {
2264 assert (a.length == b.length);
2265 assert (disjoint(a, b));
2266 }
2267 body
2268 {
2269 //printf("_arraySliceSliceMulass_i()\n");
2270 auto aptr = a.ptr;
2271 auto aend = aptr + a.length;
2272 auto bptr = b.ptr;
2273
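// Disabled: see the pmuludq note at _arraySliceExpMulSliceAssign_i.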
2274 version (none)
2275 {
2276 version (D_InlineAsm_X86)
2277 {
2278 // SSE2 aligned version is 873% faster
2279 if (sse2() && a.length >= 8)
2280 {
2281 auto n = aptr + (a.length & ~7);
2282
2283 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
2284 {
2285 asm
2286 {
2287 mov ESI, aptr;
2288 mov EDI, n;
2289 mov ECX, bptr;
2290
2291 align 4;
2292 startsse2u:
2293 movdqu XMM0, [ESI];
2294 movdqu XMM2, [ECX];
2295 movdqu XMM1, [ESI+16];
2296 movdqu XMM3, [ECX+16];
2297 add ESI, 32;
2298 add ECX, 32;
2299 pmuludq XMM0, XMM2;
2300 pmuludq XMM1, XMM3;
2301 movdqu [ESI -32], XMM0;
2302 movdqu [ESI+16-32], XMM1;
2303 cmp ESI, EDI;
2304 jb startsse2u;
2305
2306 mov aptr, ESI;
2307 mov bptr, ECX;
2308 }
2309 }
2310 else
2311 {
2312 asm
2313 {
2314 mov ESI, aptr;
2315 mov EDI, n;
2316 mov ECX, bptr;
2317
2318 align 4;
2319 startsse2a:
2320 movdqa XMM0, [ESI];
2321 movdqa XMM2, [ECX];
2322 movdqa XMM1, [ESI+16];
2323 movdqa XMM3, [ECX+16];
2324 add ESI, 32;
2325 add ECX, 32;
2326 pmuludq XMM0, XMM2;
2327 pmuludq XMM1, XMM3;
2328 movdqa [ESI -32], XMM0;
2329 movdqa [ESI+16-32], XMM1;
2330 cmp ESI, EDI;
2331 jb startsse2a;
2332
2333 mov aptr, ESI;
2334 mov bptr, ECX;
2335 }
2336 }
2337 }
2338 /+ BUG: comment out this section until we figure out what is going
2339 wrong with the invalid pshufd instructions.
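(Likely cause: pshufd accepts only XMM registers, so the
pshufd MM4, MM4, 8 lines below are not encodable; the MMX
shuffle is pshufw, which only shuffles 16-bit words.)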
2340
2341 else
2342 // MMX version is 573% faster
2343 if (mmx() && a.length >= 4)
2344 {
2345 auto n = aptr + (a.length & ~3);
2346
2347 asm
2348 {
2349 mov ESI, aptr;
2350 mov EDI, n;
2351 mov ECX, bptr;
2352
2353 align 4;
2354 startmmx:
2355 movq MM0, [ESI];
2356 movq MM2, [ECX];
2357 movq MM1, [ESI+8];
2358 movq MM3, [ECX+8];
2359 pxor MM4, MM4;
2360 pxor MM5, MM5;
2361 punpckldq MM4, MM0;
2362 punpckldq MM5, MM2;
2363 add ESI, 16;
2364 add ECX, 16;
2365 pmuludq MM4, MM5;
2366 pshufd MM4, MM4, 8; // ?
2367 movq [ESI -16], MM4;
2368 pxor MM4, MM4;
2369 pxor MM5, MM5;
2370 punpckldq MM4, MM1;
2371 punpckldq MM5, MM3;
2372 pmuludq MM4, MM5;
2373 pshufd MM4, MM4, 8; // ?
2374 movq [ESI+8-16], MM4;
2375 cmp ESI, EDI;
2376 jb startmmx;
2377
2378 emms;
2379 mov aptr, ESI;
2380 mov bptr, ECX;
2381 }
2382 }
2383 +/
2384 }
2385 }
2386
2387 while (aptr < aend)
2388 *aptr++ *= *bptr++;
2389
2390 return a;
2391 }
2392
2393 unittest
2394 {
2395 printf("_arraySliceSliceMulass_i unittest\n");
2396
2397 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2398 {
2399 version (log) printf(" cpuid %d\n", cpuid);
2400
2401 for (int j = 0; j < 2; j++)
2402 {
2403 const int dim = 67;
2404 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2405 a = a[j .. dim + j]; // misalign for second iteration
2406 T[] b = new T[dim + j];
2407 b = b[j .. dim + j];
2408 T[] c = new T[dim + j];
2409 c = c[j .. dim + j];
2410
2411 for (int i = 0; i < dim; i++)
2412 { a[i] = cast(T)i;
2413 b[i] = cast(T)(i + 7);
2414 c[i] = cast(T)(i * 2);
2415 }
2416
2417 b[] = a[];
2418 a[] *= c[];
2419
2420 for (int i = 0; i < dim; i++)
2421 {
2422 if (a[i] != cast(T)(b[i] * c[i]))
2423 {
2424 printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
2425 assert(0);
2426 }
2427 }
2428 }
2429 }
2430 }