comparison druntime/src/compiler/ldc/arrayshort.d @ 1458:e0b2d67cfe7c

Added druntime (this should be removed once it works).
author Robert Clipsham <robert@octarineparrot.com>
date Tue, 02 Jun 2009 17:43:06 +0100
1 /**
2 * Contains SSE2 and MMX versions of certain operations for wchar, short,
3 * and ushort ('u', 's' and 't' suffixes).
4 *
5 * Copyright: Copyright Digital Mars 2008 - 2009.
6 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
7 * Authors: Walter Bright, based on code originally written by Burton Radons
8 *
9 * Copyright Digital Mars 2008 - 2009.
10 * Distributed under the Boost Software License, Version 1.0.
11 * (See accompanying file LICENSE_1_0.txt or copy at
12 * http://www.boost.org/LICENSE_1_0.txt)
13 */
14 module rt.arrayshort;
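
/* How these entry points get used (a sketch, assuming the usual array-op
 * lowering): the compiler rewrites whole-array expressions over 16-bit
 * element types into calls to the extern (C) hooks below, for example
 * ---
 * short[] a = new short[64], b = new short[64];
 * a[] = b[] + 5; // lowered to roughly _arraySliceExpAddSliceAssign_s(a, 5, b)
 * ---
 * Each hook then picks an SSE2, MMX, or scalar loop at run time.
 */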
15
16 private import rt.util.cpuid;
17
18 version (unittest)
19 {
20 private import core.stdc.stdio : printf;
21 /* This is so the unit tests will exercise every CPU variant.
22 */
23 int cpuid;
24 const int CPUID_MAX = 4;
25 bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); }
26 bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
27 bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
28 bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
29 }
30 else
31 {
32 alias rt.util.cpuid.mmx mmx;
33 alias rt.util.cpuid.sse sse;
34 alias rt.util.cpuid.sse2 sse2;
35 alias rt.util.cpuid.amd3dnow amd3dnow;
36 }
37
38 //version = log;
39
40 bool disjoint(T)(T[] a, T[] b)
41 {
42 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
43 }
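
/* Illustrative check (not part of the runtime API):
 * ---
 * short[] x = new short[8];
 * assert(disjoint(x[0 .. 4], x[4 .. 8]));  // adjacent slices don't overlap
 * assert(!disjoint(x[0 .. 5], x[4 .. 8])); // both cover x[4]
 * ---
 */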
44
45 alias short T;
46
47 extern (C):
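
// The wchar ('u') and ushort ('t') variants simply forward to the short
// ('s') implementations: two's-complement wraparound makes add, subtract,
// and low-half multiply bit-identical for all three 16-bit types.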
48
49 /* ======================================================================== */
50
51 /***********************
52 * Computes:
53 * a[] = b[] + value
54 */
55
56 T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
57 {
58 return _arraySliceExpAddSliceAssign_s(a, value, b);
59 }
60
61 T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
62 {
63 return _arraySliceExpAddSliceAssign_s(a, value, b);
64 }
65
66 T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
67 in
68 {
69 assert(a.length == b.length);
70 assert(disjoint(a, b));
71 }
72 body
73 {
74 //printf("_arraySliceExpAddSliceAssign_s()\n");
75 auto aptr = a.ptr;
76 auto aend = aptr + a.length;
77 auto bptr = b.ptr;
78
79 version (D_InlineAsm_X86)
80 {
81 // SSE2 aligned version is 3343% faster
82 if (sse2() && a.length >= 16)
83 {
84 auto n = aptr + (a.length & ~15);
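// Process 16 elements (32 bytes) per iteration; n marks where the SIMD
// loop stops and the scalar tail loop at the end takes over.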
85
86 uint l = cast(ushort) value;
87 l |= (l << 16);
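// l now holds two copies of value; movd + pshufd 0 below fans it out
// to all eight 16-bit lanes of XMM2.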
88
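// A single test covers both pointers: OR the addresses together and
// check the low four bits for 16-byte alignment.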
89 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
90 {
91 asm // unaligned case
92 {
93 mov ESI, aptr;
94 mov EDI, n;
95 mov EAX, bptr;
96 movd XMM2, l;
97 pshufd XMM2, XMM2, 0;
98
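// ESI is bumped at the top of the loop and the stores use negative
// displacements, so the ending cmp can reuse the already-updated pointer.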
99 align 4;
100 startaddsse2u:
101 add ESI, 32;
102 movdqu XMM0, [EAX];
103 movdqu XMM1, [EAX+16];
104 add EAX, 32;
105 paddw XMM0, XMM2;
106 paddw XMM1, XMM2;
107 movdqu [ESI -32], XMM0;
108 movdqu [ESI+16-32], XMM1;
109 cmp ESI, EDI;
110 jb startaddsse2u;
111
112 mov aptr, ESI;
113 mov bptr, EAX;
114 }
115 }
116 else
117 {
118 asm // aligned case
119 {
120 mov ESI, aptr;
121 mov EDI, n;
122 mov EAX, bptr;
123 movd XMM2, l;
124 pshufd XMM2, XMM2, 0;
125
126 align 4;
127 startaddsse2a:
128 add ESI, 32;
129 movdqa XMM0, [EAX];
130 movdqa XMM1, [EAX+16];
131 add EAX, 32;
132 paddw XMM0, XMM2;
133 paddw XMM1, XMM2;
134 movdqa [ESI -32], XMM0;
135 movdqa [ESI+16-32], XMM1;
136 cmp ESI, EDI;
137 jb startaddsse2a;
138
139 mov aptr, ESI;
140 mov bptr, EAX;
141 }
142 }
143 }
144 else
145 // MMX version is 3343% faster
146 if (mmx() && a.length >= 8)
147 {
148 auto n = aptr + (a.length & ~7);
149
150 uint l = cast(ushort) value;
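// Only the low word of l matters here: pshufw MM2, MM2, 0 below
// broadcasts word 0 to all four words, so there is no need to double
// the value up as in the SSE2 path.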
151
152 asm
153 {
154 mov ESI, aptr;
155 mov EDI, n;
156 mov EAX, bptr;
157 movd MM2, l;
158 pshufw MM2, MM2, 0;
159
160 align 4;
161 startmmx:
162 add ESI, 16;
163 movq MM0, [EAX];
164 movq MM1, [EAX+8];
165 add EAX, 16;
166 paddw MM0, MM2;
167 paddw MM1, MM2;
168 movq [ESI -16], MM0;
169 movq [ESI+8-16], MM1;
170 cmp ESI, EDI;
171 jb startmmx;
172
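// emms clears the MMX register state so later x87 floating-point
// code is safe.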
173 emms;
174 mov aptr, ESI;
175 mov bptr, EAX;
176 }
177 }
178 }
179
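// Scalar path: handles the tail left over by the SIMD loops, or the
// whole array when neither SSE2 nor MMX was selected.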
180 while (aptr < aend)
181 *aptr++ = cast(T)(*bptr++ + value);
182
183 return a;
184 }
185
186 unittest
187 {
188 printf("_arraySliceExpAddSliceAssign_s unittest\n");
189
190 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
191 {
192 version (log) printf(" cpuid %d\n", cpuid);
193
194 for (int j = 0; j < 2; j++)
195 {
196 const int dim = 67;
197 T[] a = new T[dim + j]; // aligned on 16 byte boundary
198 a = a[j .. dim + j]; // misalign for second iteration
199 T[] b = new T[dim + j];
200 b = b[j .. dim + j];
201 T[] c = new T[dim + j];
202 c = c[j .. dim + j];
203
204 for (int i = 0; i < dim; i++)
205 { a[i] = cast(T)i;
206 b[i] = cast(T)(i + 7);
207 c[i] = cast(T)(i * 2);
208 }
209
210 c[] = a[] + 6;
211
212 for (int i = 0; i < dim; i++)
213 {
214 if (c[i] != cast(T)(a[i] + 6))
215 {
216 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
217 assert(0);
218 }
219 }
220 }
221 }
222 }
223
224
225 /* ======================================================================== */
226
227 /***********************
228 * Computes:
229 * a[] = b[] + c[]
230 */
231
232 T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
233 {
234 return _arraySliceSliceAddSliceAssign_s(a, c, b);
235 }
236
237 T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
238 {
239 return _arraySliceSliceAddSliceAssign_s(a, c, b);
240 }
241
242 T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
243 in
244 {
245 assert(a.length == b.length && b.length == c.length);
246 assert(disjoint(a, b));
247 assert(disjoint(a, c));
248 assert(disjoint(b, c));
249 }
250 body
251 {
252 //printf("_arraySliceSliceAddSliceAssign_s()\n");
253 auto aptr = a.ptr;
254 auto aend = aptr + a.length;
255 auto bptr = b.ptr;
256 auto cptr = c.ptr;
257
258 version (D_InlineAsm_X86)
259 {
260 // SSE2 aligned version is 3777% faster
261 if (sse2() && a.length >= 16)
262 {
263 auto n = aptr + (a.length & ~15);
264
265 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
266 {
267 asm // unaligned case
268 {
269 mov ESI, aptr;
270 mov EDI, n;
271 mov EAX, bptr;
272 mov ECX, cptr;
273
274 align 4;
275 startsse2u:
276 add ESI, 32;
277 movdqu XMM0, [EAX];
278 movdqu XMM1, [EAX+16];
279 add EAX, 32;
280 movdqu XMM2, [ECX];
281 movdqu XMM3, [ECX+16];
282 add ECX, 32;
283 paddw XMM0, XMM2;
284 paddw XMM1, XMM3;
285 movdqu [ESI -32], XMM0;
286 movdqu [ESI+16-32], XMM1;
287 cmp ESI, EDI;
288 jb startsse2u;
289
290 mov aptr, ESI;
291 mov bptr, EAX;
292 mov cptr, ECX;
293 }
294 }
295 else
296 {
297 asm // aligned case
298 {
299 mov ESI, aptr;
300 mov EDI, n;
301 mov EAX, bptr;
302 mov ECX, cptr;
303
304 align 4;
305 startsse2a:
306 add ESI, 32;
307 movdqa XMM0, [EAX];
308 movdqa XMM1, [EAX+16];
309 add EAX, 32;
310 movdqa XMM2, [ECX];
311 movdqa XMM3, [ECX+16];
312 add ECX, 32;
313 paddw XMM0, XMM2;
314 paddw XMM1, XMM3;
315 movdqa [ESI -32], XMM0;
316 movdqa [ESI+16-32], XMM1;
317 cmp ESI, EDI;
318 jb startsse2a;
319
320 mov aptr, ESI;
321 mov bptr, EAX;
322 mov cptr, ECX;
323 }
324 }
325 }
326 else
327 // MMX version is 2068% faster
328 if (mmx() && a.length >= 8)
329 {
330 auto n = aptr + (a.length & ~7);
331
332 asm
333 {
334 mov ESI, aptr;
335 mov EDI, n;
336 mov EAX, bptr;
337 mov ECX, cptr;
338
339 align 4;
340 startmmx:
341 add ESI, 16;
342 movq MM0, [EAX];
343 movq MM1, [EAX+8];
344 add EAX, 16;
345 movq MM2, [ECX];
346 movq MM3, [ECX+8];
347 add ECX, 16;
348 paddw MM0, MM2;
349 paddw MM1, MM3;
350 movq [ESI -16], MM0;
351 movq [ESI+8-16], MM1;
352 cmp ESI, EDI;
353 jb startmmx;
354
355 emms;
356 mov aptr, ESI;
357 mov bptr, EAX;
358 mov cptr, ECX;
359 }
360 }
361 }
362
363 while (aptr < aend)
364 *aptr++ = cast(T)(*bptr++ + *cptr++);
365
366 return a;
367 }
368
369 unittest
370 {
371 printf("_arraySliceSliceAddSliceAssign_s unittest\n");
372
373 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
374 {
375 version (log) printf(" cpuid %d\n", cpuid);
376
377 for (int j = 0; j < 2; j++)
378 {
379 const int dim = 67;
380 T[] a = new T[dim + j]; // aligned on 16 byte boundary
381 a = a[j .. dim + j]; // misalign for second iteration
382 T[] b = new T[dim + j];
383 b = b[j .. dim + j];
384 T[] c = new T[dim + j];
385 c = c[j .. dim + j];
386
387 for (int i = 0; i < dim; i++)
388 { a[i] = cast(T)i;
389 b[i] = cast(T)(i + 7);
390 c[i] = cast(T)(i * 2);
391 }
392
393 c[] = a[] + b[];
394
395 for (int i = 0; i < dim; i++)
396 {
397 if (c[i] != cast(T)(a[i] + b[i]))
398 {
399 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
400 assert(0);
401 }
402 }
403 }
404 }
405 }
406
407
408 /* ======================================================================== */
409
410 /***********************
411 * Computes:
412 * a[] += value
413 */
414
415 T[] _arrayExpSliceAddass_u(T[] a, T value)
416 {
417 return _arrayExpSliceAddass_s(a, value);
418 }
419
420 T[] _arrayExpSliceAddass_t(T[] a, T value)
421 {
422 return _arrayExpSliceAddass_s(a, value);
423 }
424
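// No in-contract here: the operation reads and writes only a[], so
// there is no second array to check for overlap.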
425 T[] _arrayExpSliceAddass_s(T[] a, T value)
426 {
427 //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
428 auto aptr = a.ptr;
429 auto aend = aptr + a.length;
430
431 version (D_InlineAsm_X86)
432 {
433 // SSE2 aligned version is 832% faster
434 if (sse2() && a.length >= 16)
435 {
436 auto n = aptr + (a.length & ~15);
437
438 uint l = cast(ushort) value;
439 l |= (l << 16);
440
441 if (((cast(uint) aptr) & 15) != 0)
442 {
443 asm // unaligned case
444 {
445 mov ESI, aptr;
446 mov EDI, n;
447 movd XMM2, l;
448 pshufd XMM2, XMM2, 0;
449
450 align 4;
451 startaddsse2u:
452 movdqu XMM0, [ESI];
453 movdqu XMM1, [ESI+16];
454 add ESI, 32;
455 paddw XMM0, XMM2;
456 paddw XMM1, XMM2;
457 movdqu [ESI -32], XMM0;
458 movdqu [ESI+16-32], XMM1;
459 cmp ESI, EDI;
460 jb startaddsse2u;
461
462 mov aptr, ESI;
463 }
464 }
465 else
466 {
467 asm // aligned case
468 {
469 mov ESI, aptr;
470 mov EDI, n;
471 movd XMM2, l;
472 pshufd XMM2, XMM2, 0;
473
474 align 4;
475 startaddsse2a:
476 movdqa XMM0, [ESI];
477 movdqa XMM1, [ESI+16];
478 add ESI, 32;
479 paddw XMM0, XMM2;
480 paddw XMM1, XMM2;
481 movdqa [ESI -32], XMM0;
482 movdqa [ESI+16-32], XMM1;
483 cmp ESI, EDI;
484 jb startaddsse2a;
485
486 mov aptr, ESI;
487 }
488 }
489 }
490 else
491 // MMX version is 826% faster
492 if (mmx() && a.length >= 8)
493 {
494 auto n = aptr + (a.length & ~7);
495
496 uint l = cast(ushort) value;
497
498 asm
499 {
500 mov ESI, aptr;
501 mov EDI, n;
502 movd MM2, l;
503 pshufw MM2, MM2, 0;
504
505 align 4;
506 startmmx:
507 movq MM0, [ESI];
508 movq MM1, [ESI+8];
509 add ESI, 16;
510 paddw MM0, MM2;
511 paddw MM1, MM2;
512 movq [ESI -16], MM0;
513 movq [ESI+8-16], MM1;
514 cmp ESI, EDI;
515 jb startmmx;
516
517 emms;
518 mov aptr, ESI;
519 }
520 }
521 }
522
523 while (aptr < aend)
524 *aptr++ += value;
525
526 return a;
527 }
528
529 unittest
530 {
531 printf("_arrayExpSliceAddass_s unittest\n");
532
533 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
534 {
535 version (log) printf(" cpuid %d\n", cpuid);
536
537 for (int j = 0; j < 2; j++)
538 {
539 const int dim = 67;
540 T[] a = new T[dim + j]; // aligned on 16 byte boundary
541 a = a[j .. dim + j]; // misalign for second iteration
542 T[] b = new T[dim + j];
543 b = b[j .. dim + j];
544 T[] c = new T[dim + j];
545 c = c[j .. dim + j];
546
547 for (int i = 0; i < dim; i++)
548 { a[i] = cast(T)i;
549 b[i] = cast(T)(i + 7);
550 c[i] = cast(T)(i * 2);
551 }
552
553 a[] = c[];
554 a[] += 6;
555
556 for (int i = 0; i < dim; i++)
557 {
558 if (a[i] != cast(T)(c[i] + 6))
559 {
560 printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
561 assert(0);
562 }
563 }
564 }
565 }
566 }
567
568
569 /* ======================================================================== */
570
571 /***********************
572 * Computes:
573 * a[] += b[]
574 */
575
576 T[] _arraySliceSliceAddass_u(T[] a, T[] b)
577 {
578 return _arraySliceSliceAddass_s(a, b);
579 }
580
581 T[] _arraySliceSliceAddass_t(T[] a, T[] b)
582 {
583 return _arraySliceSliceAddass_s(a, b);
584 }
585
586 T[] _arraySliceSliceAddass_s(T[] a, T[] b)
587 in
588 {
589 assert(a.length == b.length);
590 assert(disjoint(a, b));
591 }
592 body
593 {
594 //printf("_arraySliceSliceAddass_s()\n");
595 auto aptr = a.ptr;
596 auto aend = aptr + a.length;
597 auto bptr = b.ptr;
598
599 version (D_InlineAsm_X86)
600 {
601 // SSE2 aligned version is 2085% faster
602 if (sse2() && a.length >= 16)
603 {
604 auto n = aptr + (a.length & ~15);
605
606 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
607 {
608 asm // unaligned case
609 {
610 mov ESI, aptr;
611 mov EDI, n;
612 mov ECX, bptr;
613
614 align 4;
615 startsse2u:
616 movdqu XMM0, [ESI];
617 movdqu XMM1, [ESI+16];
618 add ESI, 32;
619 movdqu XMM2, [ECX];
620 movdqu XMM3, [ECX+16];
621 add ECX, 32;
622 paddw XMM0, XMM2;
623 paddw XMM1, XMM3;
624 movdqu [ESI -32], XMM0;
625 movdqu [ESI+16-32], XMM1;
626 cmp ESI, EDI;
627 jb startsse2u;
628
629 mov aptr, ESI;
630 mov bptr, ECX;
631 }
632 }
633 else
634 {
635 asm // aligned case
636 {
637 mov ESI, aptr;
638 mov EDI, n;
639 mov ECX, bptr;
640
641 align 4;
642 startsse2a:
643 movdqa XMM0, [ESI];
644 movdqa XMM1, [ESI+16];
645 add ESI, 32;
646 movdqa XMM2, [ECX];
647 movdqa XMM3, [ECX+16];
648 add ECX, 32;
649 paddw XMM0, XMM2;
650 paddw XMM1, XMM3;
651 movdqa [ESI -32], XMM0;
652 movdqa [ESI+16-32], XMM1;
653 cmp ESI, EDI;
654 jb startsse2a;
655
656 mov aptr, ESI;
657 mov bptr, ECX;
658 }
659 }
660 }
661 else
662 // MMX version is 1022% faster
663 if (mmx() && a.length >= 8)
664 {
665 auto n = aptr + (a.length & ~7);
666
667 asm
668 {
669 mov ESI, aptr;
670 mov EDI, n;
671 mov ECX, bptr;
672
673 align 4;
674 start:
675 movq MM0, [ESI];
676 movq MM1, [ESI+8];
677 add ESI, 16;
678 movq MM2, [ECX];
679 movq MM3, [ECX+8];
680 add ECX, 16;
681 paddw MM0, MM2;
682 paddw MM1, MM3;
683 movq [ESI -16], MM0;
684 movq [ESI+8-16], MM1;
685 cmp ESI, EDI;
686 jb start;
687
688 emms;
689 mov aptr, ESI;
690 mov bptr, ECX;
691 }
692 }
693 }
694
695 while (aptr < aend)
696 *aptr++ += *bptr++;
697
698 return a;
699 }
700
701 unittest
702 {
703 printf("_arraySliceSliceAddass_s unittest\n");
704
705 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
706 {
707 version (log) printf(" cpuid %d\n", cpuid);
708
709 for (int j = 0; j < 2; j++)
710 {
711 const int dim = 67;
712 T[] a = new T[dim + j]; // aligned on 16 byte boundary
713 a = a[j .. dim + j]; // misalign for second iteration
714 T[] b = new T[dim + j];
715 b = b[j .. dim + j];
716 T[] c = new T[dim + j];
717 c = c[j .. dim + j];
718
719 for (int i = 0; i < dim; i++)
720 { a[i] = cast(T)i;
721 b[i] = cast(T)(i + 7);
722 c[i] = cast(T)(i * 2);
723 }
724
725 b[] = c[];
726 c[] += a[];
727
728 for (int i = 0; i < dim; i++)
729 {
730 if (c[i] != cast(T)(b[i] + a[i]))
731 {
732 printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
733 assert(0);
734 }
735 }
736 }
737 }
738 }
739
740
741 /* ======================================================================== */
742
743 /***********************
744 * Computes:
745 * a[] = b[] - value
746 */
747
748 T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
749 {
750 return _arraySliceExpMinSliceAssign_s(a, value, b);
751 }
752
753 T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
754 {
755 return _arraySliceExpMinSliceAssign_s(a, value, b);
756 }
757
758 T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
759 in
760 {
761 assert(a.length == b.length);
762 assert(disjoint(a, b));
763 }
764 body
765 {
766 //printf("_arraySliceExpMinSliceAssign_s()\n");
767 auto aptr = a.ptr;
768 auto aend = aptr + a.length;
769 auto bptr = b.ptr;
770
771 version (D_InlineAsm_X86)
772 {
773 // SSE2 aligned version is 3695% faster
774 if (sse2() && a.length >= 16)
775 {
776 auto n = aptr + (a.length & ~15);
777
778 uint l = cast(ushort) value;
779 l |= (l << 16);
780
781 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
782 {
783 asm // unaligned case
784 {
785 mov ESI, aptr;
786 mov EDI, n;
787 mov EAX, bptr;
788 movd XMM2, l;
789 pshufd XMM2, XMM2, 0;
790
791 align 4;
792 startaddsse2u:
793 add ESI, 32;
794 movdqu XMM0, [EAX];
795 movdqu XMM1, [EAX+16];
796 add EAX, 32;
797 psubw XMM0, XMM2;
798 psubw XMM1, XMM2;
799 movdqu [ESI -32], XMM0;
800 movdqu [ESI+16-32], XMM1;
801 cmp ESI, EDI;
802 jb startaddsse2u;
803
804 mov aptr, ESI;
805 mov bptr, EAX;
806 }
807 }
808 else
809 {
810 asm // aligned case
811 {
812 mov ESI, aptr;
813 mov EDI, n;
814 mov EAX, bptr;
815 movd XMM2, l;
816 pshufd XMM2, XMM2, 0;
817
818 align 4;
819 startaddsse2a:
820 add ESI, 32;
821 movdqa XMM0, [EAX];
822 movdqa XMM1, [EAX+16];
823 add EAX, 32;
824 psubw XMM0, XMM2;
825 psubw XMM1, XMM2;
826 movdqa [ESI -32], XMM0;
827 movdqa [ESI+16-32], XMM1;
828 cmp ESI, EDI;
829 jb startaddsse2a;
830
831 mov aptr, ESI;
832 mov bptr, EAX;
833 }
834 }
835 }
836 else
837 // MMX version is 3049% faster
838 if (mmx() && a.length >= 8)
839 {
840 auto n = aptr + (a.length & ~7);
841
842 uint l = cast(ushort) value;
843
844 asm
845 {
846 mov ESI, aptr;
847 mov EDI, n;
848 mov EAX, bptr;
849 movd MM2, l;
850 pshufw MM2, MM2, 0;
851
852 align 4;
853 startmmx:
854 add ESI, 16;
855 movq MM0, [EAX];
856 movq MM1, [EAX+8];
857 add EAX, 16;
858 psubw MM0, MM2;
859 psubw MM1, MM2;
860 movq [ESI -16], MM0;
861 movq [ESI+8-16], MM1;
862 cmp ESI, EDI;
863 jb startmmx;
864
865 emms;
866 mov aptr, ESI;
867 mov bptr, EAX;
868 }
869 }
870 }
871
872 while (aptr < aend)
873 *aptr++ = cast(T)(*bptr++ - value);
874
875 return a;
876 }
877
878 unittest
879 {
880 printf("_arraySliceExpMinSliceAssign_s unittest\n");
881
882 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
883 {
884 version (log) printf(" cpuid %d\n", cpuid);
885
886 for (int j = 0; j < 2; j++)
887 {
888 const int dim = 67;
889 T[] a = new T[dim + j]; // aligned on 16 byte boundary
890 a = a[j .. dim + j]; // misalign for second iteration
891 T[] b = new T[dim + j];
892 b = b[j .. dim + j];
893 T[] c = new T[dim + j];
894 c = c[j .. dim + j];
895
896 for (int i = 0; i < dim; i++)
897 { a[i] = cast(T)i;
898 b[i] = cast(T)(i + 7);
899 c[i] = cast(T)(i * 2);
900 }
901
902 c[] = a[] - 6;
903
904 for (int i = 0; i < dim; i++)
905 {
906 if (c[i] != cast(T)(a[i] - 6))
907 {
908 printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
909 assert(0);
910 }
911 }
912 }
913 }
914 }
915
916
917 /* ======================================================================== */
918
919 /***********************
920 * Computes:
921 * a[] = value - b[]
922 */
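
// Note the reversed parameter order relative to the b[] - value hooks
// above: for value - b[] the compiler passes (a, b, value).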
923
924 T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
925 {
926 return _arrayExpSliceMinSliceAssign_s(a, b, value);
927 }
928
929 T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
930 {
931 return _arrayExpSliceMinSliceAssign_s(a, b, value);
932 }
933
934 T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
935 in
936 {
937 assert(a.length == b.length);
938 assert(disjoint(a, b));
939 }
940 body
941 {
942 //printf("_arrayExpSliceMinSliceAssign_s()\n");
943 auto aptr = a.ptr;
944 auto aend = aptr + a.length;
945 auto bptr = b.ptr;
946
947 version (D_InlineAsm_X86)
948 {
949 // SSE2 aligned version is 4995% faster
950 if (sse2() && a.length >= 16)
951 {
952 auto n = aptr + (a.length & ~15);
953
954 uint l = cast(ushort) value;
955 l |= (l << 16);
956
957 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
958 {
959 asm // unaligned case
960 {
961 mov ESI, aptr;
962 mov EDI, n;
963 mov EAX, bptr;
964
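// The constant is re-broadcast on every iteration here because psubw
// overwrites XMM2/XMM3 with (value - b[]); the MMX loop below instead
// keeps the constant in MM4 and copies it to MM0/MM1 each pass.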
965 align 4;
966 startaddsse2u:
967 movd XMM2, l;
968 pshufd XMM2, XMM2, 0;
969 movd XMM3, l;
970 pshufd XMM3, XMM3, 0;
971 add ESI, 32;
972 movdqu XMM0, [EAX];
973 movdqu XMM1, [EAX+16];
974 add EAX, 32;
975 psubw XMM2, XMM0;
976 psubw XMM3, XMM1;
977 movdqu [ESI -32], XMM2;
978 movdqu [ESI+16-32], XMM3;
979 cmp ESI, EDI;
980 jb startaddsse2u;
981
982 mov aptr, ESI;
983 mov bptr, EAX;
984 }
985 }
986 else
987 {
988 asm // aligned case
989 {
990 mov ESI, aptr;
991 mov EDI, n;
992 mov EAX, bptr;
993
994 align 4;
995 startaddsse2a:
996 movd XMM2, l;
997 pshufd XMM2, XMM2, 0;
998 movd XMM3, l;
999 pshufd XMM3, XMM3, 0;
1000 add ESI, 32;
1001 movdqa XMM0, [EAX];
1002 movdqa XMM1, [EAX+16];
1003 add EAX, 32;
1004 psubw XMM2, XMM0;
1005 psubw XMM3, XMM1;
1006 movdqa [ESI -32], XMM2;
1007 movdqa [ESI+16-32], XMM3;
1008 cmp ESI, EDI;
1009 jb startaddsse2a;
1010
1011 mov aptr, ESI;
1012 mov bptr, EAX;
1013 }
1014 }
1015 }
1016 else
1017 // MMX version is 4562% faster
1018 if (mmx() && a.length >= 8)
1019 {
1020 auto n = aptr + (a.length & ~7);
1021
1022 uint l = cast(ushort) value;
1023
1024 asm
1025 {
1026 mov ESI, aptr;
1027 mov EDI, n;
1028 mov EAX, bptr;
1029 movd MM4, l;
1030 pshufw MM4, MM4, 0;
1031
1032 align 4;
1033 startmmx:
1034 add ESI, 16;
1035 movq MM2, [EAX];
1036 movq MM3, [EAX+8];
1037 movq MM0, MM4;
1038 movq MM1, MM4;
1039 add EAX, 16;
1040 psubw MM0, MM2;
1041 psubw MM1, MM3;
1042 movq [ESI -16], MM0;
1043 movq [ESI+8-16], MM1;
1044 cmp ESI, EDI;
1045 jb startmmx;
1046
1047 emms;
1048 mov aptr, ESI;
1049 mov bptr, EAX;
1050 }
1051 }
1052 }
1053
1054 while (aptr < aend)
1055 *aptr++ = cast(T)(value - *bptr++);
1056
1057 return a;
1058 }
1059
1060 unittest
1061 {
1062 printf("_arrayExpSliceMinSliceAssign_s unittest\n");
1063
1064 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1065 {
1066 version (log) printf(" cpuid %d\n", cpuid);
1067
1068 for (int j = 0; j < 2; j++)
1069 {
1070 const int dim = 67;
1071 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1072 a = a[j .. dim + j]; // misalign for second iteration
1073 T[] b = new T[dim + j];
1074 b = b[j .. dim + j];
1075 T[] c = new T[dim + j];
1076 c = c[j .. dim + j];
1077
1078 for (int i = 0; i < dim; i++)
1079 { a[i] = cast(T)i;
1080 b[i] = cast(T)(i + 7);
1081 c[i] = cast(T)(i * 2);
1082 }
1083
1084 c[] = 6 - a[];
1085
1086 for (int i = 0; i < dim; i++)
1087 {
1088 if (c[i] != cast(T)(6 - a[i]))
1089 {
1090 printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
1091 assert(0);
1092 }
1093 }
1094 }
1095 }
1096 }
1097
1098
1099 /* ======================================================================== */
1100
1101 /***********************
1102 * Computes:
1103 * a[] = b[] - c[]
1104 */
1105
1106 T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
1107 {
1108 return _arraySliceSliceMinSliceAssign_s(a, c, b);
1109 }
1110
1111 T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
1112 {
1113 return _arraySliceSliceMinSliceAssign_s(a, c, b);
1114 }
1115
1116 T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
1117 in
1118 {
1119 assert(a.length == b.length && b.length == c.length);
1120 assert(disjoint(a, b));
1121 assert(disjoint(a, c));
1122 assert(disjoint(b, c));
1123 }
1124 body
1125 {
1126 auto aptr = a.ptr;
1127 auto aend = aptr + a.length;
1128 auto bptr = b.ptr;
1129 auto cptr = c.ptr;
1130
1131 version (D_InlineAsm_X86)
1132 {
1133 // SSE2 aligned version is 4129% faster
1134 if (sse2() && a.length >= 16)
1135 {
1136 auto n = aptr + (a.length & ~15);
1137
1138 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1139 {
1140 asm // unaligned case
1141 {
1142 mov ESI, aptr;
1143 mov EDI, n;
1144 mov EAX, bptr;
1145 mov ECX, cptr;
1146
1147 align 4;
1148 startsse2u:
1149 add ESI, 32;
1150 movdqu XMM0, [EAX];
1151 movdqu XMM1, [EAX+16];
1152 add EAX, 32;
1153 movdqu XMM2, [ECX];
1154 movdqu XMM3, [ECX+16];
1155 add ECX, 32;
1156 psubw XMM0, XMM2;
1157 psubw XMM1, XMM3;
1158 movdqu [ESI -32], XMM0;
1159 movdqu [ESI+16-32], XMM1;
1160 cmp ESI, EDI;
1161 jb startsse2u;
1162
1163 mov aptr, ESI;
1164 mov bptr, EAX;
1165 mov cptr, ECX;
1166 }
1167 }
1168 else
1169 {
1170 asm // aligned case
1171 {
1172 mov ESI, aptr;
1173 mov EDI, n;
1174 mov EAX, bptr;
1175 mov ECX, cptr;
1176
1177 align 4;
1178 startsse2a:
1179 add ESI, 32;
1180 movdqa XMM0, [EAX];
1181 movdqa XMM1, [EAX+16];
1182 add EAX, 32;
1183 movdqa XMM2, [ECX];
1184 movdqa XMM3, [ECX+16];
1185 add ECX, 32;
1186 psubw XMM0, XMM2;
1187 psubw XMM1, XMM3;
1188 movdqa [ESI -32], XMM0;
1189 movdqa [ESI+16-32], XMM1;
1190 cmp ESI, EDI;
1191 jb startsse2a;
1192
1193 mov aptr, ESI;
1194 mov bptr, EAX;
1195 mov cptr, ECX;
1196 }
1197 }
1198 }
1199 else
1200 // MMX version is 2018% faster
1201 if (mmx() && a.length >= 8)
1202 {
1203 auto n = aptr + (a.length & ~7);
1204
1205 asm
1206 {
1207 mov ESI, aptr;
1208 mov EDI, n;
1209 mov EAX, bptr;
1210 mov ECX, cptr;
1211
1212 align 4;
1213 startmmx:
1214 add ESI, 16;
1215 movq MM0, [EAX];
1216 movq MM1, [EAX+8];
1217 add EAX, 16;
1218 movq MM2, [ECX];
1219 movq MM3, [ECX+8];
1220 add ECX, 16;
1221 psubw MM0, MM2;
1222 psubw MM1, MM3;
1223 movq [ESI -16], MM0;
1224 movq [ESI+8-16], MM1;
1225 cmp ESI, EDI;
1226 jb startmmx;
1227
1228 emms;
1229 mov aptr, ESI;
1230 mov bptr, EAX;
1231 mov cptr, ECX;
1232 }
1233 }
1234 }
1235
1236 while (aptr < aend)
1237 *aptr++ = cast(T)(*bptr++ - *cptr++);
1238
1239 return a;
1240 }
1241
1242 unittest
1243 {
1244 printf("_arraySliceSliceMinSliceAssign_s unittest\n");
1245
1246 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1247 {
1248 version (log) printf(" cpuid %d\n", cpuid);
1249
1250 for (int j = 0; j < 2; j++)
1251 {
1252 const int dim = 67;
1253 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1254 a = a[j .. dim + j]; // misalign for second iteration
1255 T[] b = new T[dim + j];
1256 b = b[j .. dim + j];
1257 T[] c = new T[dim + j];
1258 c = c[j .. dim + j];
1259
1260 for (int i = 0; i < dim; i++)
1261 { a[i] = cast(T)i;
1262 b[i] = cast(T)(i + 7);
1263 c[i] = cast(T)(i * 2);
1264 }
1265
1266 c[] = a[] - b[];
1267
1268 for (int i = 0; i < dim; i++)
1269 {
1270 if (c[i] != cast(T)(a[i] - b[i]))
1271 {
1272 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1273 assert(0);
1274 }
1275 }
1276 }
1277 }
1278 }
1279
1280
1281 /* ======================================================================== */
1282
1283 /***********************
1284 * Computes:
1285 * a[] -= value
1286 */
1287
1288 T[] _arrayExpSliceMinass_u(T[] a, T value)
1289 {
1290 return _arrayExpSliceMinass_s(a, value);
1291 }
1292
1293 T[] _arrayExpSliceMinass_t(T[] a, T value)
1294 {
1295 return _arrayExpSliceMinass_s(a, value);
1296 }
1297
1298 T[] _arrayExpSliceMinass_s(T[] a, T value)
1299 {
1300 //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1301 auto aptr = a.ptr;
1302 auto aend = aptr + a.length;
1303
1304 version (D_InlineAsm_X86)
1305 {
1306 // SSE2 aligned version is 835% faster
1307 if (sse2() && a.length >= 16)
1308 {
1309 auto n = aptr + (a.length & ~15);
1310
1311 uint l = cast(ushort) value;
1312 l |= (l << 16);
1313
1314 if (((cast(uint) aptr) & 15) != 0)
1315 {
1316 asm // unaligned case
1317 {
1318 mov ESI, aptr;
1319 mov EDI, n;
1320 movd XMM2, l;
1321 pshufd XMM2, XMM2, 0;
1322
1323 align 4;
1324 startaddsse2u:
1325 movdqu XMM0, [ESI];
1326 movdqu XMM1, [ESI+16];
1327 add ESI, 32;
1328 psubw XMM0, XMM2;
1329 psubw XMM1, XMM2;
1330 movdqu [ESI -32], XMM0;
1331 movdqu [ESI+16-32], XMM1;
1332 cmp ESI, EDI;
1333 jb startaddsse2u;
1334
1335 mov aptr, ESI;
1336 }
1337 }
1338 else
1339 {
1340 asm // aligned case
1341 {
1342 mov ESI, aptr;
1343 mov EDI, n;
1344 movd XMM2, l;
1345 pshufd XMM2, XMM2, 0;
1346
1347 align 4;
1348 startaddsse2a:
1349 movdqa XMM0, [ESI];
1350 movdqa XMM1, [ESI+16];
1351 add ESI, 32;
1352 psubw XMM0, XMM2;
1353 psubw XMM1, XMM2;
1354 movdqa [ESI -32], XMM0;
1355 movdqa [ESI+16-32], XMM1;
1356 cmp ESI, EDI;
1357 jb startaddsse2a;
1358
1359 mov aptr, ESI;
1360 }
1361 }
1362 }
1363 else
1364 // MMX version is 835% faster
1365 if (mmx() && a.length >= 8)
1366 {
1367 auto n = aptr + (a.length & ~7);
1368
1369 uint l = cast(ushort) value;
1370
1371 asm
1372 {
1373 mov ESI, aptr;
1374 mov EDI, n;
1375 movd MM2, l;
1376 pshufw MM2, MM2, 0;
1377
1378 align 4;
1379 startmmx:
1380 movq MM0, [ESI];
1381 movq MM1, [ESI+8];
1382 add ESI, 16;
1383 psubw MM0, MM2;
1384 psubw MM1, MM2;
1385 movq [ESI -16], MM0;
1386 movq [ESI+8-16], MM1;
1387 cmp ESI, EDI;
1388 jb startmmx;
1389
1390 emms;
1391 mov aptr, ESI;
1392 }
1393 }
1394 }
1395
1396 while (aptr < aend)
1397 *aptr++ -= value;
1398
1399 return a;
1400 }
1401
1402 unittest
1403 {
1404 printf("_arrayExpSliceMinass_s unittest\n");
1405
1406 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1407 {
1408 version (log) printf(" cpuid %d\n", cpuid);
1409
1410 for (int j = 0; j < 2; j++)
1411 {
1412 const int dim = 67;
1413 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1414 a = a[j .. dim + j]; // misalign for second iteration
1415 T[] b = new T[dim + j];
1416 b = b[j .. dim + j];
1417 T[] c = new T[dim + j];
1418 c = c[j .. dim + j];
1419
1420 for (int i = 0; i < dim; i++)
1421 { a[i] = cast(T)i;
1422 b[i] = cast(T)(i + 7);
1423 c[i] = cast(T)(i * 2);
1424 }
1425
1426 a[] = c[];
1427 a[] -= 6;
1428
1429 for (int i = 0; i < dim; i++)
1430 {
1431 if (a[i] != cast(T)(c[i] - 6))
1432 {
1433 printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
1434 assert(0);
1435 }
1436 }
1437 }
1438 }
1439 }
1440
1441
1442 /* ======================================================================== */
1443
1444 /***********************
1445 * Computes:
1446 * a[] -= b[]
1447 */
1448
1449 T[] _arraySliceSliceMinass_u(T[] a, T[] b)
1450 {
1451 return _arraySliceSliceMinass_s(a, b);
1452 }
1453
1454 T[] _arraySliceSliceMinass_t(T[] a, T[] b)
1455 {
1456 return _arraySliceSliceMinass_s(a, b);
1457 }
1458
1459 T[] _arraySliceSliceMinass_s(T[] a, T[] b)
1460 in
1461 {
1462 assert(a.length == b.length);
1463 assert(disjoint(a, b));
1464 }
1465 body
1466 {
1467 //printf("_arraySliceSliceMinass_s()\n");
1468 auto aptr = a.ptr;
1469 auto aend = aptr + a.length;
1470 auto bptr = b.ptr;
1471
1472 version (D_InlineAsm_X86)
1473 {
1474 // SSE2 aligned version is 2121% faster
1475 if (sse2() && a.length >= 16)
1476 {
1477 auto n = aptr + (a.length & ~15);
1478
1479 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1480 {
1481 asm // unaligned case
1482 {
1483 mov ESI, aptr;
1484 mov EDI, n;
1485 mov ECX, bptr;
1486
1487 align 4;
1488 startsse2u:
1489 movdqu XMM0, [ESI];
1490 movdqu XMM1, [ESI+16];
1491 add ESI, 32;
1492 movdqu XMM2, [ECX];
1493 movdqu XMM3, [ECX+16];
1494 add ECX, 32;
1495 psubw XMM0, XMM2;
1496 psubw XMM1, XMM3;
1497 movdqu [ESI -32], XMM0;
1498 movdqu [ESI+16-32], XMM1;
1499 cmp ESI, EDI;
1500 jb startsse2u;
1501
1502 mov aptr, ESI;
1503 mov bptr, ECX;
1504 }
1505 }
1506 else
1507 {
1508 asm // aligned case
1509 {
1510 mov ESI, aptr;
1511 mov EDI, n;
1512 mov ECX, bptr;
1513
1514 align 4;
1515 startsse2a:
1516 movdqa XMM0, [ESI];
1517 movdqa XMM1, [ESI+16];
1518 add ESI, 32;
1519 movdqa XMM2, [ECX];
1520 movdqa XMM3, [ECX+16];
1521 add ECX, 32;
1522 psubw XMM0, XMM2;
1523 psubw XMM1, XMM3;
1524 movdqa [ESI -32], XMM0;
1525 movdqa [ESI+16-32], XMM1;
1526 cmp ESI, EDI;
1527 jb startsse2a;
1528
1529 mov aptr, ESI;
1530 mov bptr, ECX;
1531 }
1532 }
1533 }
1534 else
1535 // MMX version is 1116% faster
1536 if (mmx() && a.length >= 8)
1537 {
1538 auto n = aptr + (a.length & ~7);
1539
1540 asm
1541 {
1542 mov ESI, aptr;
1543 mov EDI, n;
1544 mov ECX, bptr;
1545
1546 align 4;
1547 start:
1548 movq MM0, [ESI];
1549 movq MM1, [ESI+8];
1550 add ESI, 16;
1551 movq MM2, [ECX];
1552 movq MM3, [ECX+8];
1553 add ECX, 16;
1554 psubw MM0, MM2;
1555 psubw MM1, MM3;
1556 movq [ESI -16], MM0;
1557 movq [ESI+8-16], MM1;
1558 cmp ESI, EDI;
1559 jb start;
1560
1561 emms;
1562 mov aptr, ESI;
1563 mov bptr, ECX;
1564 }
1565 }
1566 }
1567
1568 while (aptr < aend)
1569 *aptr++ -= *bptr++;
1570
1571 return a;
1572 }
1573
1574 unittest
1575 {
1576 printf("_arraySliceSliceMinass_s unittest\n");
1577
1578 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1579 {
1580 version (log) printf(" cpuid %d\n", cpuid);
1581
1582 for (int j = 0; j < 2; j++)
1583 {
1584 const int dim = 67;
1585 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1586 a = a[j .. dim + j]; // misalign for second iteration
1587 T[] b = new T[dim + j];
1588 b = b[j .. dim + j];
1589 T[] c = new T[dim + j];
1590 c = c[j .. dim + j];
1591
1592 for (int i = 0; i < dim; i++)
1593 { a[i] = cast(T)i;
1594 b[i] = cast(T)(i + 7);
1595 c[i] = cast(T)(i * 2);
1596 }
1597
1598 b[] = c[];
1599 c[] -= a[];
1600
1601 for (int i = 0; i < dim; i++)
1602 {
1603 if (c[i] != cast(T)(b[i] - a[i]))
1604 {
1605 printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
1606 assert(0);
1607 }
1608 }
1609 }
1610 }
1611 }
1612
1613
1614 /* ======================================================================== */
1615
1616 /***********************
1617 * Computes:
1618 * a[] = b[] * value
1619 */
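
// pmullw keeps the low 16 bits of each 32-bit product, which is exactly
// the wraparound result a short multiply produces, so one instruction
// covers short, ushort, and wchar alike.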
1620
1621 T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
1622 {
1623 return _arraySliceExpMulSliceAssign_s(a, value, b);
1624 }
1625
1626 T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
1627 {
1628 return _arraySliceExpMulSliceAssign_s(a, value, b);
1629 }
1630
1631 T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
1632 in
1633 {
1634 assert(a.length == b.length);
1635 assert(disjoint(a, b));
1636 }
1637 body
1638 {
1639 //printf("_arraySliceExpMulSliceAssign_s()\n");
1640 auto aptr = a.ptr;
1641 auto aend = aptr + a.length;
1642 auto bptr = b.ptr;
1643
1644 version (D_InlineAsm_X86)
1645 {
1646 // SSE2 aligned version is 3733% faster
1647 if (sse2() && a.length >= 16)
1648 {
1649 auto n = aptr + (a.length & ~15);
1650
1651 uint l = cast(ushort) value;
1652 l |= l << 16;
1653
1654 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1655 {
1656 asm
1657 {
1658 mov ESI, aptr;
1659 mov EDI, n;
1660 mov EAX, bptr;
1661 movd XMM2, l;
1662 pshufd XMM2, XMM2, 0;
1663
1664 align 4;
1665 startsse2u:
1666 add ESI, 32;
1667 movdqu XMM0, [EAX];
1668 movdqu XMM1, [EAX+16];
1669 add EAX, 32;
1670 pmullw XMM0, XMM2;
1671 pmullw XMM1, XMM2;
1672 movdqu [ESI -32], XMM0;
1673 movdqu [ESI+16-32], XMM1;
1674 cmp ESI, EDI;
1675 jb startsse2u;
1676
1677 mov aptr, ESI;
1678 mov bptr, EAX;
1679 }
1680 }
1681 else
1682 {
1683 asm
1684 {
1685 mov ESI, aptr;
1686 mov EDI, n;
1687 mov EAX, bptr;
1688 movd XMM2, l;
1689 pshufd XMM2, XMM2, 0;
1690
1691 align 4;
1692 startsse2a:
1693 add ESI, 32;
1694 movdqa XMM0, [EAX];
1695 movdqa XMM1, [EAX+16];
1696 add EAX, 32;
1697 pmullw XMM0, XMM2;
1698 pmullw XMM1, XMM2;
1699 movdqa [ESI -32], XMM0;
1700 movdqa [ESI+16-32], XMM1;
1701 cmp ESI, EDI;
1702 jb startsse2a;
1703
1704 mov aptr, ESI;
1705 mov bptr, EAX;
1706 }
1707 }
1708 }
1709 else
1710 // MMX version is 3733% faster
1711 if (mmx() && a.length >= 8)
1712 {
1713 auto n = aptr + (a.length & ~7);
1714
1715 uint l = cast(ushort) value;
1716
1717 asm
1718 {
1719 mov ESI, aptr;
1720 mov EDI, n;
1721 mov EAX, bptr;
1722 movd MM2, l;
1723 pshufw MM2, MM2, 0;
1724
1725 align 4;
1726 startmmx:
1727 add ESI, 16;
1728 movq MM0, [EAX];
1729 movq MM1, [EAX+8];
1730 add EAX, 16;
1731 pmullw MM0, MM2;
1732 pmullw MM1, MM2;
1733 movq [ESI -16], MM0;
1734 movq [ESI+8-16], MM1;
1735 cmp ESI, EDI;
1736 jb startmmx;
1737
1738 emms;
1739 mov aptr, ESI;
1740 mov bptr, EAX;
1741 }
1742 }
1743 }
1744
1745 while (aptr < aend)
1746 *aptr++ = cast(T)(*bptr++ * value);
1747
1748 return a;
1749 }
1750
1751 unittest
1752 {
1753 printf("_arraySliceExpMulSliceAssign_s unittest\n");
1754
1755 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1756 {
1757 version (log) printf(" cpuid %d\n", cpuid);
1758
1759 for (int j = 0; j < 2; j++)
1760 {
1761 const int dim = 67;
1762 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1763 a = a[j .. dim + j]; // misalign for second iteration
1764 T[] b = new T[dim + j];
1765 b = b[j .. dim + j];
1766 T[] c = new T[dim + j];
1767 c = c[j .. dim + j];
1768
1769 for (int i = 0; i < dim; i++)
1770 { a[i] = cast(T)i;
1771 b[i] = cast(T)(i + 7);
1772 c[i] = cast(T)(i * 2);
1773 }
1774
1775 c[] = a[] * 6;
1776
1777 for (int i = 0; i < dim; i++)
1778 {
1779 if (c[i] != cast(T)(a[i] * 6))
1780 {
1781 printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
1782 assert(0);
1783 }
1784 }
1785 }
1786 }
1787 }
1788
1789
1790 /* ======================================================================== */
1791
1792 /***********************
1793 * Computes:
1794 * a[] = b[] * c[]
1795 */
1796
1797 T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
1798 {
1799 return _arraySliceSliceMulSliceAssign_s(a, c, b);
1800 }
1801
1802 T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
1803 {
1804 return _arraySliceSliceMulSliceAssign_s(a, c, b);
1805 }
1806
1807 T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
1808 in
1809 {
1810 assert(a.length == b.length && b.length == c.length);
1811 assert(disjoint(a, b));
1812 assert(disjoint(a, c));
1813 assert(disjoint(b, c));
1814 }
1815 body
1816 {
1817 //printf("_arraySliceSliceMulSliceAssign_s()\n");
1818 auto aptr = a.ptr;
1819 auto aend = aptr + a.length;
1820 auto bptr = b.ptr;
1821 auto cptr = c.ptr;
1822
1823 version (D_InlineAsm_X86)
1824 {
1825 // SSE2 aligned version is 2515% faster
1826 if (sse2() && a.length >= 16)
1827 {
1828 auto n = aptr + (a.length & ~15);
1829
1830 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1831 {
1832 asm
1833 {
1834 mov ESI, aptr;
1835 mov EDI, n;
1836 mov EAX, bptr;
1837 mov ECX, cptr;
1838
1839 align 4;
1840 startsse2u:
1841 add ESI, 32;
1842 movdqu XMM0, [EAX];
1843 movdqu XMM2, [ECX];
1844 movdqu XMM1, [EAX+16];
1845 movdqu XMM3, [ECX+16];
1846 add EAX, 32;
1847 add ECX, 32;
1848 pmullw XMM0, XMM2;
1849 pmullw XMM1, XMM3;
1850 movdqu [ESI -32], XMM0;
1851 movdqu [ESI+16-32], XMM1;
1852 cmp ESI, EDI;
1853 jb startsse2u;
1854
1855 mov aptr, ESI;
1856 mov bptr, EAX;
1857 mov cptr, ECX;
1858 }
1859 }
1860 else
1861 {
1862 asm
1863 {
1864 mov ESI, aptr;
1865 mov EDI, n;
1866 mov EAX, bptr;
1867 mov ECX, cptr;
1868
1869 align 4;
1870 startsse2a:
1871 add ESI, 32;
1872 movdqa XMM0, [EAX];
1873 movdqa XMM2, [ECX];
1874 movdqa XMM1, [EAX+16];
1875 movdqa XMM3, [ECX+16];
1876 add EAX, 32;
1877 add ECX, 32;
1878 pmullw XMM0, XMM2;
1879 pmullw XMM1, XMM3;
1880 movdqa [ESI -32], XMM0;
1881 movdqa [ESI+16-32], XMM1;
1882 cmp ESI, EDI;
1883 jb startsse2a;
1884
1885 mov aptr, ESI;
1886 mov bptr, EAX;
1887 mov cptr, ECX;
1888 }
1889 }
1890 }
1891 else
1892 // MMX version is 2515% faster
1893 if (mmx() && a.length >= 8)
1894 {
1895 auto n = aptr + (a.length & ~7);
1896
1897 asm
1898 {
1899 mov ESI, aptr;
1900 mov EDI, n;
1901 mov EAX, bptr;
1902 mov ECX, cptr;
1903
1904 align 4;
1905 startmmx:
1906 add ESI, 16;
1907 movq MM0, [EAX];
1908 movq MM2, [ECX];
1909 movq MM1, [EAX+8];
1910 movq MM3, [ECX+8];
1911 add EAX, 16;
1912 add ECX, 16;
1913 pmullw MM0, MM2;
1914 pmullw MM1, MM3;
1915 movq [ESI -16], MM0;
1916 movq [ESI+8-16], MM1;
1917 cmp ESI, EDI;
1918 jb startmmx;
1919
1920 emms;
1921 mov aptr, ESI;
1922 mov bptr, EAX;
1923 mov cptr, ECX;
1924 }
1925 }
1926 }
1927
1928 while (aptr < aend)
1929 *aptr++ = cast(T)(*bptr++ * *cptr++);
1930
1931 return a;
1932 }
1933
1934 unittest
1935 {
1936 printf("_arraySliceSliceMulSliceAssign_s unittest\n");
1937
1938 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1939 {
1940 version (log) printf(" cpuid %d\n", cpuid);
1941
1942 for (int j = 0; j < 2; j++)
1943 {
1944 const int dim = 67;
1945 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1946 a = a[j .. dim + j]; // misalign for second iteration
1947 T[] b = new T[dim + j];
1948 b = b[j .. dim + j];
1949 T[] c = new T[dim + j];
1950 c = c[j .. dim + j];
1951
1952 for (int i = 0; i < dim; i++)
1953 { a[i] = cast(T)i;
1954 b[i] = cast(T)(i + 7);
1955 c[i] = cast(T)(i * 2);
1956 }
1957
1958 c[] = a[] * b[];
1959
1960 for (int i = 0; i < dim; i++)
1961 {
1962 if (c[i] != cast(T)(a[i] * b[i]))
1963 {
1964 printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
1965 assert(0);
1966 }
1967 }
1968 }
1969 }
1970 }
1971
1972
1973 /* ======================================================================== */
1974
1975 /***********************
1976 * Computes:
1977 * a[] *= value
1978 */
1979
1980 T[] _arrayExpSliceMulass_u(T[] a, T value)
1981 {
1982 return _arrayExpSliceMulass_s(a, value);
1983 }
1984
1985 T[] _arrayExpSliceMulass_t(T[] a, T value)
1986 {
1987 return _arrayExpSliceMulass_s(a, value);
1988 }
1989
1990 T[] _arrayExpSliceMulass_s(T[] a, T value)
1991 {
1992 //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1993 auto aptr = a.ptr;
1994 auto aend = aptr + a.length;
1995
1996 version (D_InlineAsm_X86)
1997 {
1998 // SSE2 aligned version is 2044% faster
1999 if (sse2() && a.length >= 16)
2000 {
2001 auto n = aptr + (a.length & ~15);
2002
2003 uint l = cast(ushort) value;
2004 l |= l << 16;
2005
2006 if (((cast(uint) aptr) & 15) != 0)
2007 {
2008 asm
2009 {
2010 mov ESI, aptr;
2011 mov EDI, n;
2012 movd XMM2, l;
2013 pshufd XMM2, XMM2, 0;
2014
2015 align 4;
2016 startsse2u:
2017 movdqu XMM0, [ESI];
2018 movdqu XMM1, [ESI+16];
2019 add ESI, 32;
2020 pmullw XMM0, XMM2;
2021 pmullw XMM1, XMM2;
2022 movdqu [ESI -32], XMM0;
2023 movdqu [ESI+16-32], XMM1;
2024 cmp ESI, EDI;
2025 jb startsse2u;
2026
2027 mov aptr, ESI;
2028 }
2029 }
2030 else
2031 {
2032 asm
2033 {
2034 mov ESI, aptr;
2035 mov EDI, n;
2036 movd XMM2, l;
2037 pshufd XMM2, XMM2, 0;
2038
2039 align 4;
2040 startsse2a:
2041 movdqa XMM0, [ESI];
2042 movdqa XMM1, [ESI+16];
2043 add ESI, 32;
2044 pmullw XMM0, XMM2;
2045 pmullw XMM1, XMM2;
2046 movdqa [ESI -32], XMM0;
2047 movdqa [ESI+16-32], XMM1;
2048 cmp ESI, EDI;
2049 jb startsse2a;
2050
2051 mov aptr, ESI;
2052 }
2053 }
2054 }
2055 else
2056 // MMX version is 2056% faster
2057 if (mmx() && a.length >= 8)
2058 {
2059 auto n = aptr + (a.length & ~7);
2060
2061 uint l = cast(ushort) value;
2062
2063 asm
2064 {
2065 mov ESI, aptr;
2066 mov EDI, n;
2067 movd MM2, l;
2068 pshufw MM2, MM2, 0;
2069
2070 align 4;
2071 startmmx:
2072 movq MM0, [ESI];
2073 movq MM1, [ESI+8];
2074 add ESI, 16;
2075 pmullw MM0, MM2;
2076 pmullw MM1, MM2;
2077 movq [ESI -16], MM0;
2078 movq [ESI+8-16], MM1;
2079 cmp ESI, EDI;
2080 jb startmmx;
2081
2082 emms;
2083 mov aptr, ESI;
2084 }
2085 }
2086 }
2087
2088 while (aptr < aend)
2089 *aptr++ *= value;
2090
2091 return a;
2092 }
2093
2094 unittest
2095 {
2096 printf("_arrayExpSliceMulass_s unittest\n");
2097
2098 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2099 {
2100 version (log) printf(" cpuid %d\n", cpuid);
2101
2102 for (int j = 0; j < 2; j++)
2103 {
2104 const int dim = 67;
2105 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2106 a = a[j .. dim + j]; // misalign for second iteration
2107 T[] b = new T[dim + j];
2108 b = b[j .. dim + j];
2109 T[] c = new T[dim + j];
2110 c = c[j .. dim + j];
2111
2112 for (int i = 0; i < dim; i++)
2113 { a[i] = cast(T)i;
2114 b[i] = cast(T)(i + 7);
2115 c[i] = cast(T)(i * 2);
2116 }
2117
2118 b[] = a[];
2119 a[] *= 6;
2120
2121 for (int i = 0; i < dim; i++)
2122 {
2123 if (a[i] != cast(T)(b[i] * 6))
2124 {
2125 printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
2126 assert(0);
2127 }
2128 }
2129 }
2130 }
2131 }
2132
2133
2134 /* ======================================================================== */
2135
2136 /***********************
2137 * Computes:
2138 * a[] *= b[]
2139 */
2140
2141 T[] _arraySliceSliceMulass_u(T[] a, T[] b)
2142 {
2143 return _arraySliceSliceMulass_s(a, b);
2144 }
2145
2146 T[] _arraySliceSliceMulass_t(T[] a, T[] b)
2147 {
2148 return _arraySliceSliceMulass_s(a, b);
2149 }
2150
2151 T[] _arraySliceSliceMulass_s(T[] a, T[] b)
2152 in
2153 {
2154 assert(a.length == b.length);
2155 assert(disjoint(a, b));
2156 }
2157 body
2158 {
2159 //printf("_arraySliceSliceMulass_s()\n");
2160 auto aptr = a.ptr;
2161 auto aend = aptr + a.length;
2162 auto bptr = b.ptr;
2163
2164 version (D_InlineAsm_X86)
2165 {
2166 // SSE2 aligned version is 2519% faster
2167 if (sse2() && a.length >= 16)
2168 {
2169 auto n = aptr + (a.length & ~15);
2170
2171 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
2172 {
2173 asm
2174 {
2175 mov ESI, aptr;
2176 mov EDI, n;
2177 mov ECX, bptr;
2178
2179 align 4;
2180 startsse2u:
2181 movdqu XMM0, [ESI];
2182 movdqu XMM2, [ECX];
2183 movdqu XMM1, [ESI+16];
2184 movdqu XMM3, [ECX+16];
2185 add ESI, 32;
2186 add ECX, 32;
2187 pmullw XMM0, XMM2;
2188 pmullw XMM1, XMM3;
2189 movdqu [ESI -32], XMM0;
2190 movdqu [ESI+16-32], XMM1;
2191 cmp ESI, EDI;
2192 jb startsse2u;
2193
2194 mov aptr, ESI;
2195 mov bptr, ECX;
2196 }
2197 }
2198 else
2199 {
2200 asm
2201 {
2202 mov ESI, aptr;
2203 mov EDI, n;
2204 mov ECX, bptr;
2205
2206 align 4;
2207 startsse2a:
2208 movdqa XMM0, [ESI];
2209 movdqa XMM2, [ECX];
2210 movdqa XMM1, [ESI+16];
2211 movdqa XMM3, [ECX+16];
2212 add ESI, 32;
2213 add ECX, 32;
2214 pmullw XMM0, XMM2;
2215 pmullw XMM1, XMM3;
2216 movdqa [ESI -32], XMM0;
2217 movdqa [ESI+16-32], XMM1;
2218 cmp ESI, EDI;
2219 jb startsse2a;
2220
2221 mov aptr, ESI;
2222 mov bptr, ECX;
2223 }
2224 }
2225 }
2226 else
2227 // MMX version is 1712% faster
2228 if (mmx() && a.length >= 8)
2229 {
2230 auto n = aptr + (a.length & ~7);
2231
2232 asm
2233 {
2234 mov ESI, aptr;
2235 mov EDI, n;
2236 mov ECX, bptr;
2237
2238 align 4;
2239 startmmx:
2240 movq MM0, [ESI];
2241 movq MM2, [ECX];
2242 movq MM1, [ESI+8];
2243 movq MM3, [ECX+8];
2244 add ESI, 16;
2245 add ECX, 16;
2246 pmullw MM0, MM2;
2247 pmullw MM1, MM3;
2248 movq [ESI -16], MM0;
2249 movq [ESI+8-16], MM1;
2250 cmp ESI, EDI;
2251 jb startmmx;
2252
2253 emms;
2254 mov aptr, ESI;
2255 mov bptr, ECX;
2256 }
2257 }
2258 }
2259
2260 while (aptr < aend)
2261 *aptr++ *= *bptr++;
2262
2263 return a;
2264 }
2265
2266 unittest
2267 {
2268 printf("_arraySliceSliceMulass_s unittest\n");
2269
2270 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2271 {
2272 version (log) printf(" cpuid %d\n", cpuid);
2273
2274 for (int j = 0; j < 2; j++)
2275 {
2276 const int dim = 67;
2277 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2278 a = a[j .. dim + j]; // misalign for second iteration
2279 T[] b = new T[dim + j];
2280 b = b[j .. dim + j];
2281 T[] c = new T[dim + j];
2282 c = c[j .. dim + j];
2283
2284 for (int i = 0; i < dim; i++)
2285 { a[i] = cast(T)i;
2286 b[i] = cast(T)(i + 7);
2287 c[i] = cast(T)(i * 2);
2288 }
2289
2290 b[] = a[];
2291 a[] *= c[];
2292
2293 for (int i = 0; i < dim; i++)
2294 {
2295 if (a[i] != cast(T)(b[i] * c[i]))
2296 {
2297 printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
2298 assert(0);
2299 }
2300 }
2301 }
2302 }
2303 }