comparison druntime/src/compiler/dmd/arraybyte.d @ 1458:e0b2d67cfe7c

Added druntime (this should be removed once it works).
author Robert Clipsham <robert@octarineparrot.com>
date Tue, 02 Jun 2009 17:43:06 +0100
1 /**
2 * Contains SSE2 and MMX versions of certain operations for char, byte, and
3 * ubyte ('a', 'g' and 'h' suffixes).
4 *
5 * Copyright: Copyright Digital Mars 2008 - 2009.
6 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
7 * Authors: Walter Bright, based on code originally written by Burton Radons
8 *
9 * Copyright Digital Mars 2008 - 2009.
10 * Distributed under the Boost Software License, Version 1.0.
11 * (See accompanying file LICENSE_1_0.txt or copy at
12 * http://www.boost.org/LICENSE_1_0.txt)
13 */
14 module rt.arraybyte;
15
16 import rt.util.cpuid;
17
18 version (unittest)
19 {
20 private import core.stdc.stdio : printf;
21 /* This is so unit tests will test every CPU variant
22 */
23 int cpuid;
24 const int CPUID_MAX = 4;
25 bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); }
26 bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
27 bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
28 bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
29 }
30 else
31 {
32 alias rt.util.cpuid.mmx mmx;
33 alias rt.util.cpuid.sse sse;
34 alias rt.util.cpuid.sse2 sse2;
35 alias rt.util.cpuid.amd3dnow amd3dnow;
36 }
37
38 //version = log;
39
40 bool disjoint(T)(T[] a, T[] b)
41 {
42 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
43 }
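disjoint() is the precondition asserted by the slice-to-slice routines below: the destination and source slices must not overlap, because the loops load and store whole 32- or 64-byte blocks and an overlapping store would clobber input that has not been read yet. A minimal sketch of what the check accepts and rejects:

    unittest
    {
        byte[] buf = new byte[32];
        assert( disjoint(buf[0 .. 16], buf[16 .. 32]));  // adjacent, no overlap: accepted
        assert(!disjoint(buf[0 .. 17], buf[16 .. 32]));  // one shared element: rejected
    }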
44
45 alias byte T;
46
47 extern (C):
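These extern (C) functions are the runtime entry points the compiler calls for array-wise expressions on byte-sized elements; the trailing letter picks the element type ('a' char, 'g' byte, 'h' ubyte, per the module header), and the 'a'/'h' variants simply forward to the 'g' implementation. Roughly, an expression like the ones exercised in the unittests lowers to a direct call; a minimal sketch for orientation (the exact lowering is the compiler's, not spelled out in this file):

    unittest
    {
        byte[] dst = new byte[67];
        byte[] src = new byte[67];

        dst[] = src[] + 6;                            // what the user writes
        _arraySliceExpAddSliceAssign_g(dst, 6, src);  // roughly the call the compiler emits
    }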
48
49 /* ======================================================================== */
50
51
52 /***********************
53 * Computes:
54 * a[] = b[] + value
55 */
56
57 T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
58 {
59 return _arraySliceExpAddSliceAssign_g(a, value, b);
60 }
61
62 T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
63 {
64 return _arraySliceExpAddSliceAssign_g(a, value, b);
65 }
66
67 T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
68 in
69 {
70 assert(a.length == b.length);
71 assert(disjoint(a, b));
72 }
73 body
74 {
75 //printf("_arraySliceExpAddSliceAssign_g()\n");
76 auto aptr = a.ptr;
77 auto aend = aptr + a.length;
78 auto bptr = b.ptr;
79
80 version (D_InlineAsm_X86)
81 {
82 // SSE2 aligned version is 1088% faster
83 if (sse2() && a.length >= 64)
84 {
85 auto n = aptr + (a.length & ~63);
86
87 uint l = cast(ubyte) value;
88 l |= (l << 8);
89 l |= (l << 16);
90
91 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
92 {
93 asm // unaligned case
94 {
95 mov ESI, aptr;
96 mov EDI, n;
97 mov EAX, bptr;
98 movd XMM4, l;
99 pshufd XMM4, XMM4, 0;
100
101 align 8;
102 startaddsse2u:
103 add ESI, 64;
104 movdqu XMM0, [EAX];
105 movdqu XMM1, [EAX+16];
106 movdqu XMM2, [EAX+32];
107 movdqu XMM3, [EAX+48];
108 add EAX, 64;
109 paddb XMM0, XMM4;
110 paddb XMM1, XMM4;
111 paddb XMM2, XMM4;
112 paddb XMM3, XMM4;
113 movdqu [ESI -64], XMM0;
114 movdqu [ESI+16-64], XMM1;
115 movdqu [ESI+32-64], XMM2;
116 movdqu [ESI+48-64], XMM3;
117 cmp ESI, EDI;
118 jb startaddsse2u;
119
120 mov aptr, ESI;
121 mov bptr, EAX;
122 }
123 }
124 else
125 {
126 asm // aligned case
127 {
128 mov ESI, aptr;
129 mov EDI, n;
130 mov EAX, bptr;
131 movd XMM4, l;
132 pshufd XMM4, XMM4, 0;
133
134 align 8;
135 startaddsse2a:
136 add ESI, 64;
137 movdqa XMM0, [EAX];
138 movdqa XMM1, [EAX+16];
139 movdqa XMM2, [EAX+32];
140 movdqa XMM3, [EAX+48];
141 add EAX, 64;
142 paddb XMM0, XMM4;
143 paddb XMM1, XMM4;
144 paddb XMM2, XMM4;
145 paddb XMM3, XMM4;
146 movdqa [ESI -64], XMM0;
147 movdqa [ESI+16-64], XMM1;
148 movdqa [ESI+32-64], XMM2;
149 movdqa [ESI+48-64], XMM3;
150 cmp ESI, EDI;
151 jb startaddsse2a;
152
153 mov aptr, ESI;
154 mov bptr, EAX;
155 }
156 }
157 }
158 else
159 // MMX version is 1000% faster
160 if (mmx() && a.length >= 32)
161 {
162 auto n = aptr + (a.length & ~31);
163
164 uint l = cast(ubyte) value;
165 l |= (l << 8);
166
167 asm
168 {
169 mov ESI, aptr;
170 mov EDI, n;
171 mov EAX, bptr;
172 movd MM4, l;
173 pshufw MM4, MM4, 0;
174
175 align 4;
176 startaddmmx:
177 add ESI, 32;
178 movq MM0, [EAX];
179 movq MM1, [EAX+8];
180 movq MM2, [EAX+16];
181 movq MM3, [EAX+24];
182 add EAX, 32;
183 paddb MM0, MM4;
184 paddb MM1, MM4;
185 paddb MM2, MM4;
186 paddb MM3, MM4;
187 movq [ESI -32], MM0;
188 movq [ESI+8 -32], MM1;
189 movq [ESI+16-32], MM2;
190 movq [ESI+24-32], MM3;
191 cmp ESI, EDI;
192 jb startaddmmx;
193
194 emms;
195 mov aptr, ESI;
196 mov bptr, EAX;
197 }
198 }
199 /* Trying to be fair and treat a normal 32-bit CPU the same way as we do
200 * the SIMD units, with unrolled asm. There aren't enough registers,
201 * really.
202 */
203 else
204 if (a.length >= 4)
205 {
206
207 auto n = aptr + (a.length & ~3);
208 asm
209 {
210 mov ESI, aptr;
211 mov EDI, n;
212 mov EAX, bptr;
213 mov CL, value;
214
215 align 4;
216 startadd386:
217 add ESI, 4;
218 mov DX, [EAX];
219 mov BX, [EAX+2];
220 add EAX, 4;
221 add BL, CL;
222 add BH, CL;
223 add DL, CL;
224 add DH, CL;
225 mov [ESI -4], DX;
226 mov [ESI+2 -4], BX;
227 cmp ESI, EDI;
228 jb startadd386;
229
230 mov aptr, ESI;
231 mov bptr, EAX;
232 }
233
234 }
235 }
236
237 while (aptr < aend)
238 *aptr++ = cast(T)(*bptr++ + value);
239
240 return a;
241 }
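Every "slice op scalar" path above starts by replicating the single byte operand into a 32-bit word before movd/pshufd (pshufw on MMX) splats it across the whole vector register, so one paddb applies the constant to 16 (or 8) bytes at once. A minimal sketch of the broadcast arithmetic:

    unittest
    {
        byte value = 6;
        uint l = cast(ubyte) value;  // 0x00000006
        l |= (l << 8);               // 0x00000606
        l |= (l << 16);              // 0x06060606
        assert(l == 0x06060606);
        // movd XMM4, l / pshufd XMM4, XMM4, 0 then copies this word into all
        // four 32-bit lanes, giving sixteen copies of the byte in XMM4.
    }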
242
243 unittest
244 {
245 printf("_arraySliceExpAddSliceAssign_g unittest\n");
246
247 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
248 {
249 version (log) printf(" cpuid %d\n", cpuid);
250
251 for (int j = 0; j < 2; j++)
252 {
253 const int dim = 67;
254 T[] a = new T[dim + j]; // aligned on 16 byte boundary
255 a = a[j .. dim + j]; // misalign for second iteration
256 T[] b = new T[dim + j];
257 b = b[j .. dim + j];
258 T[] c = new T[dim + j];
259 c = c[j .. dim + j];
260
261 for (int i = 0; i < dim; i++)
262 { a[i] = cast(T)i;
263 b[i] = cast(T)(i + 7);
264 c[i] = cast(T)(i * 2);
265 }
266
267 c[] = a[] + 6;
268
269 for (int i = 0; i < dim; i++)
270 {
271 if (c[i] != cast(T)(a[i] + 6))
272 {
273 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
274 assert(0);
275 }
276 }
277 }
278 }
279 }
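dim = 67 in these unittests is chosen so every branch of the routine above runs: the SIMD loop handles the largest multiple of the 64-byte (SSE2) or 32-byte (MMX) block size, the trailing while loop finishes the leftovers one byte at a time, and the j = 1 pass slices the arrays off by one byte to force the unaligned code path. A minimal worked sketch of the bookkeeping for the SSE2 case:

    unittest
    {
        size_t len = 67;
        size_t blocked = len & ~63;      // 64 bytes go through the unrolled SSE2 loop
        size_t tail    = len - blocked;  // 3 bytes are left for the scalar while loop
        assert(blocked == 64 && tail == 3);
        // The movdqa/movdqu choice ORs both pointers and tests the low 4 bits:
        // one test proves (or disproves) 16-byte alignment of both at once.
    }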
280
281
282 /* ======================================================================== */
283
284 /***********************
285 * Computes:
286 * a[] = b[] + c[]
287 */
288
289 T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
290 {
291 return _arraySliceSliceAddSliceAssign_g(a, c, b);
292 }
293
294 T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
295 {
296 return _arraySliceSliceAddSliceAssign_g(a, c, b);
297 }
298
299 T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
300 in
301 {
302 assert(a.length == b.length && b.length == c.length);
303 assert(disjoint(a, b));
304 assert(disjoint(a, c));
305 assert(disjoint(b, c));
306 }
307 body
308 {
309 //printf("_arraySliceSliceAddSliceAssign_g()\n");
310 auto aptr = a.ptr;
311 auto aend = aptr + a.length;
312 auto bptr = b.ptr;
313 auto cptr = c.ptr;
314
315 version (D_InlineAsm_X86)
316 {
317 // SSE2 aligned version is 5739% faster
318 if (sse2() && a.length >= 64)
319 {
320 auto n = aptr + (a.length & ~63);
321
322 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
323 {
324 version (log) printf("\tsse2 unaligned\n");
325 asm // unaligned case
326 {
327 mov ESI, aptr;
328 mov EDI, n;
329 mov EAX, bptr;
330 mov ECX, cptr;
331
332 align 8;
333 startaddlsse2u:
334 add ESI, 64;
335 movdqu XMM0, [EAX];
336 movdqu XMM1, [EAX+16];
337 movdqu XMM2, [EAX+32];
338 movdqu XMM3, [EAX+48];
339 add EAX, 64;
340 movdqu XMM4, [ECX];
341 movdqu XMM5, [ECX+16];
342 movdqu XMM6, [ECX+32];
343 movdqu XMM7, [ECX+48];
344 add ECX, 64;
345 paddb XMM0, XMM4;
346 paddb XMM1, XMM5;
347 paddb XMM2, XMM6;
348 paddb XMM3, XMM7;
349 movdqu [ESI -64], XMM0;
350 movdqu [ESI+16-64], XMM1;
351 movdqu [ESI+32-64], XMM2;
352 movdqu [ESI+48-64], XMM3;
353 cmp ESI, EDI;
354 jb startaddlsse2u;
355
356 mov aptr, ESI;
357 mov bptr, EAX;
358 mov cptr, ECX;
359 }
360 }
361 else
362 {
363 version (log) printf("\tsse2 aligned\n");
364 asm // aligned case
365 {
366 mov ESI, aptr;
367 mov EDI, n;
368 mov EAX, bptr;
369 mov ECX, cptr;
370
371 align 8;
372 startaddlsse2a:
373 add ESI, 64;
374 movdqa XMM0, [EAX];
375 movdqa XMM1, [EAX+16];
376 movdqa XMM2, [EAX+32];
377 movdqa XMM3, [EAX+48];
378 add EAX, 64;
379 movdqa XMM4, [ECX];
380 movdqa XMM5, [ECX+16];
381 movdqa XMM6, [ECX+32];
382 movdqa XMM7, [ECX+48];
383 add ECX, 64;
384 paddb XMM0, XMM4;
385 paddb XMM1, XMM5;
386 paddb XMM2, XMM6;
387 paddb XMM3, XMM7;
388 movdqa [ESI -64], XMM0;
389 movdqa [ESI+16-64], XMM1;
390 movdqa [ESI+32-64], XMM2;
391 movdqa [ESI+48-64], XMM3;
392 cmp ESI, EDI;
393 jb startaddlsse2a;
394
395 mov aptr, ESI;
396 mov bptr, EAX;
397 mov cptr, ECX;
398 }
399 }
400 }
401 else
402 // MMX version is 4428% faster
403 if (mmx() && a.length >= 32)
404 {
405 version (log) printf("\tmmx\n");
406 auto n = aptr + (a.length & ~31);
407
408 asm
409 {
410 mov ESI, aptr;
411 mov EDI, n;
412 mov EAX, bptr;
413 mov ECX, cptr;
414
415 align 4;
416 startaddlmmx:
417 add ESI, 32;
418 movq MM0, [EAX];
419 movq MM1, [EAX+8];
420 movq MM2, [EAX+16];
421 movq MM3, [EAX+24];
422 add EAX, 32;
423 movq MM4, [ECX];
424 movq MM5, [ECX+8];
425 movq MM6, [ECX+16];
426 movq MM7, [ECX+24];
427 add ECX, 32;
428 paddb MM0, MM4;
429 paddb MM1, MM5;
430 paddb MM2, MM6;
431 paddb MM3, MM7;
432 movq [ESI -32], MM0;
433 movq [ESI+8 -32], MM1;
434 movq [ESI+16-32], MM2;
435 movq [ESI+24-32], MM3;
436 cmp ESI, EDI;
437 jb startaddlmmx;
438
439 emms;
440 mov aptr, ESI;
441 mov bptr, EAX;
442 mov cptr, ECX;
443 }
444 }
445 }
446
447 version (log) if (aptr < aend) printf("\tbase\n");
448 while (aptr < aend)
449 *aptr++ = cast(T)(*bptr++ + *cptr++);
450
451 return a;
452 }
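The slice + slice form streams both operands, so the SSE2 loops above load four 16-byte blocks of b into XMM0..XMM3 and four of c into XMM4..XMM7 before the four paddb instructions and stores; the MMX loop does the same with MM0..MM7. The emms at the end of every MMX block matters: the MMX registers alias the x87 floating-point stack, and emms clears the tag word so later floating-point code is not corrupted. One unrolled iteration is equivalent to this minimal scalar sketch:

    unittest
    {
        byte[64] a, b, c;
        foreach (i; 0 .. 64)
            a[i] = cast(byte)(b[i] + c[i]);  // wrapping byte add, same result as paddb
    }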
453
454 unittest
455 {
456 printf("_arraySliceSliceAddSliceAssign_g unittest\n");
457
458 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
459 {
460 version (log) printf(" cpuid %d\n", cpuid);
461
462 for (int j = 0; j < 2; j++)
463 {
464 const int dim = 67;
465 T[] a = new T[dim + j]; // aligned on 16 byte boundary
466 a = a[j .. dim + j]; // misalign for second iteration
467 T[] b = new T[dim + j];
468 b = b[j .. dim + j];
469 T[] c = new T[dim + j];
470 c = c[j .. dim + j];
471
472 for (int i = 0; i < dim; i++)
473 { a[i] = cast(T)i;
474 b[i] = cast(T)(i + 7);
475 c[i] = cast(T)(i * 2);
476 }
477
478 c[] = a[] + b[];
479
480 for (int i = 0; i < dim; i++)
481 {
482 if (c[i] != cast(T)(a[i] + b[i]))
483 {
484 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
485 assert(0);
486 }
487 }
488 }
489 }
490 }
491
492
493 /* ======================================================================== */
494
495 /***********************
496 * Computes:
497 * a[] += value
498 */
499
500 T[] _arrayExpSliceAddass_a(T[] a, T value)
501 {
502 return _arrayExpSliceAddass_g(a, value);
503 }
504
505 T[] _arrayExpSliceAddass_h(T[] a, T value)
506 {
507 return _arrayExpSliceAddass_g(a, value);
508 }
509
510 T[] _arrayExpSliceAddass_g(T[] a, T value)
511 {
512 //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
513 auto aptr = a.ptr;
514 auto aend = aptr + a.length;
515
516 version (D_InlineAsm_X86)
517 {
518 // SSE2 aligned version is 1578% faster
519 if (sse2() && a.length >= 64)
520 {
521 auto n = aptr + (a.length & ~63);
522
523 uint l = cast(ubyte) value;
524 l |= (l << 8);
525 l |= (l << 16);
526
527 if (((cast(uint) aptr) & 15) != 0)
528 {
529 asm // unaligned case
530 {
531 mov ESI, aptr;
532 mov EDI, n;
533 movd XMM4, l;
534 pshufd XMM4, XMM4, 0;
535
536 align 8;
537 startaddasssse2u:
538 movdqu XMM0, [ESI];
539 movdqu XMM1, [ESI+16];
540 movdqu XMM2, [ESI+32];
541 movdqu XMM3, [ESI+48];
542 add ESI, 64;
543 paddb XMM0, XMM4;
544 paddb XMM1, XMM4;
545 paddb XMM2, XMM4;
546 paddb XMM3, XMM4;
547 movdqu [ESI -64], XMM0;
548 movdqu [ESI+16-64], XMM1;
549 movdqu [ESI+32-64], XMM2;
550 movdqu [ESI+48-64], XMM3;
551 cmp ESI, EDI;
552 jb startaddasssse2u;
553
554 mov aptr, ESI;
555 }
556 }
557 else
558 {
559 asm // aligned case
560 {
561 mov ESI, aptr;
562 mov EDI, n;
563 movd XMM4, l;
564 pshufd XMM4, XMM4, 0;
565
566 align 8;
567 startaddasssse2a:
568 movdqa XMM0, [ESI];
569 movdqa XMM1, [ESI+16];
570 movdqa XMM2, [ESI+32];
571 movdqa XMM3, [ESI+48];
572 add ESI, 64;
573 paddb XMM0, XMM4;
574 paddb XMM1, XMM4;
575 paddb XMM2, XMM4;
576 paddb XMM3, XMM4;
577 movdqa [ESI -64], XMM0;
578 movdqa [ESI+16-64], XMM1;
579 movdqa [ESI+32-64], XMM2;
580 movdqa [ESI+48-64], XMM3;
581 cmp ESI, EDI;
582 jb startaddasssse2a;
583
584 mov aptr, ESI;
585 }
586 }
587 }
588 else
589 // MMX version is 1721% faster
590 if (mmx() && a.length >= 32)
591 {
592
593 auto n = aptr + (a.length & ~31);
594
595 uint l = cast(ubyte) value;
596 l |= (l << 8);
597
598 asm
599 {
600 mov ESI, aptr;
601 mov EDI, n;
602 movd MM4, l;
603 pshufw MM4, MM4, 0;
604
605 align 8;
606 startaddassmmx:
607 movq MM0, [ESI];
608 movq MM1, [ESI+8];
609 movq MM2, [ESI+16];
610 movq MM3, [ESI+24];
611 add ESI, 32;
612 paddb MM0, MM4;
613 paddb MM1, MM4;
614 paddb MM2, MM4;
615 paddb MM3, MM4;
616 movq [ESI -32], MM0;
617 movq [ESI+8 -32], MM1;
618 movq [ESI+16-32], MM2;
619 movq [ESI+24-32], MM3;
620 cmp ESI, EDI;
621 jb startaddassmmx;
622
623 emms;
624 mov aptr, ESI;
625 }
626 }
627 }
628
629 while (aptr < aend)
630 *aptr++ += value;
631
632 return a;
633 }
634
635 unittest
636 {
637 printf("_arrayExpSliceAddass_g unittest\n");
638
639 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
640 {
641 version (log) printf(" cpuid %d\n", cpuid);
642
643 for (int j = 0; j < 2; j++)
644 {
645 const int dim = 67;
646 T[] a = new T[dim + j]; // aligned on 16 byte boundary
647 a = a[j .. dim + j]; // misalign for second iteration
648 T[] b = new T[dim + j];
649 b = b[j .. dim + j];
650 T[] c = new T[dim + j];
651 c = c[j .. dim + j];
652
653 for (int i = 0; i < dim; i++)
654 { a[i] = cast(T)i;
655 b[i] = cast(T)(i + 7);
656 c[i] = cast(T)(i * 2);
657 }
658
659 a[] = c[];
660 c[] += 6;
661
662 for (int i = 0; i < dim; i++)
663 {
664 if (c[i] != cast(T)(a[i] + 6))
665 {
666 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
667 assert(0);
668 }
669 }
670 }
671 }
672 }
673
674
675 /* ======================================================================== */
676
677 /***********************
678 * Computes:
679 * a[] += b[]
680 */
681
682 T[] _arraySliceSliceAddass_a(T[] a, T[] b)
683 {
684 return _arraySliceSliceAddass_g(a, b);
685 }
686
687 T[] _arraySliceSliceAddass_h(T[] a, T[] b)
688 {
689 return _arraySliceSliceAddass_g(a, b);
690 }
691
692 T[] _arraySliceSliceAddass_g(T[] a, T[] b)
693 in
694 {
695 assert (a.length == b.length);
696 assert (disjoint(a, b));
697 }
698 body
699 {
700 //printf("_arraySliceSliceAddass_g()\n");
701 auto aptr = a.ptr;
702 auto aend = aptr + a.length;
703 auto bptr = b.ptr;
704
705 version (D_InlineAsm_X86)
706 {
707 // SSE2 aligned version is 4727% faster
708 if (sse2() && a.length >= 64)
709 {
710 auto n = aptr + (a.length & ~63);
711
712 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
713 {
714 asm // unaligned case
715 {
716 mov ESI, aptr;
717 mov EDI, n;
718 mov ECX, bptr;
719
720 align 8;
721 startaddasslsse2u:
722 movdqu XMM0, [ESI];
723 movdqu XMM1, [ESI+16];
724 movdqu XMM2, [ESI+32];
725 movdqu XMM3, [ESI+48];
726 add ESI, 64;
727 movdqu XMM4, [ECX];
728 movdqu XMM5, [ECX+16];
729 movdqu XMM6, [ECX+32];
730 movdqu XMM7, [ECX+48];
731 add ECX, 64;
732 paddb XMM0, XMM4;
733 paddb XMM1, XMM5;
734 paddb XMM2, XMM6;
735 paddb XMM3, XMM7;
736 movdqu [ESI -64], XMM0;
737 movdqu [ESI+16-64], XMM1;
738 movdqu [ESI+32-64], XMM2;
739 movdqu [ESI+48-64], XMM3;
740 cmp ESI, EDI;
741 jb startaddasslsse2u;
742
743 mov aptr, ESI;
744 mov bptr, ECX;
745 }
746 }
747 else
748 {
749 asm // aligned case
750 {
751 mov ESI, aptr;
752 mov EDI, n;
753 mov ECX, bptr;
754
755 align 8;
756 startaddasslsse2a:
757 movdqa XMM0, [ESI];
758 movdqa XMM1, [ESI+16];
759 movdqa XMM2, [ESI+32];
760 movdqa XMM3, [ESI+48];
761 add ESI, 64;
762 movdqa XMM4, [ECX];
763 movdqa XMM5, [ECX+16];
764 movdqa XMM6, [ECX+32];
765 movdqa XMM7, [ECX+48];
766 add ECX, 64;
767 paddb XMM0, XMM4;
768 paddb XMM1, XMM5;
769 paddb XMM2, XMM6;
770 paddb XMM3, XMM7;
771 movdqa [ESI -64], XMM0;
772 movdqa [ESI+16-64], XMM1;
773 movdqa [ESI+32-64], XMM2;
774 movdqa [ESI+48-64], XMM3;
775 cmp ESI, EDI;
776 jb startaddasslsse2a;
777
778 mov aptr, ESI;
779 mov bptr, ECX;
780 }
781 }
782 }
783 else
784 // MMX version is 3059% faster
785 if (mmx() && a.length >= 32)
786 {
787
788 auto n = aptr + (a.length & ~31);
789
790 asm
791 {
792 mov ESI, aptr;
793 mov EDI, n;
794 mov ECX, bptr;
795
796 align 8;
797 startaddasslmmx:
798 movq MM0, [ESI];
799 movq MM1, [ESI+8];
800 movq MM2, [ESI+16];
801 movq MM3, [ESI+24];
802 add ESI, 32;
803 movq MM4, [ECX];
804 movq MM5, [ECX+8];
805 movq MM6, [ECX+16];
806 movq MM7, [ECX+24];
807 add ECX, 32;
808 paddb MM0, MM4;
809 paddb MM1, MM5;
810 paddb MM2, MM6;
811 paddb MM3, MM7;
812 movq [ESI -32], MM0;
813 movq [ESI+8 -32], MM1;
814 movq [ESI+16-32], MM2;
815 movq [ESI+24-32], MM3;
816 cmp ESI, EDI;
817 jb startaddasslmmx;
818
819 emms;
820 mov aptr, ESI;
821 mov bptr, ECX;
822 }
823 }
824 }
825
826 while (aptr < aend)
827 *aptr++ += *bptr++;
828
829 return a;
830 }
831
832 unittest
833 {
834 printf("_arraySliceSliceAddass_g unittest\n");
835
836 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
837 {
838 version (log) printf(" cpuid %d\n", cpuid);
839
840 for (int j = 0; j < 2; j++)
841 {
842 const int dim = 67;
843 T[] a = new T[dim + j]; // aligned on 16 byte boundary
844 a = a[j .. dim + j]; // misalign for second iteration
845 T[] b = new T[dim + j];
846 b = b[j .. dim + j];
847 T[] c = new T[dim + j];
848 c = c[j .. dim + j];
849
850 for (int i = 0; i < dim; i++)
851 { a[i] = cast(T)i;
852 b[i] = cast(T)(i + 7);
853 c[i] = cast(T)(i * 2);
854 }
855
856 a[] = c[];
857 c[] += b[];
858
859 for (int i = 0; i < dim; i++)
860 {
861 if (c[i] != cast(T)(a[i] + b[i]))
862 {
863 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
864 assert(0);
865 }
866 }
867 }
868 }
869 }
870
871
872 /* ======================================================================== */
873
874
875 /***********************
876 * Computes:
877 * a[] = b[] - value
878 */
879
880 T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
881 {
882 return _arraySliceExpMinSliceAssign_g(a, value, b);
883 }
884
885 T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
886 {
887 return _arraySliceExpMinSliceAssign_g(a, value, b);
888 }
889
890 T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
891 in
892 {
893 assert(a.length == b.length);
894 assert(disjoint(a, b));
895 }
896 body
897 {
898 //printf("_arraySliceExpMinSliceAssign_g()\n");
899 auto aptr = a.ptr;
900 auto aend = aptr + a.length;
901 auto bptr = b.ptr;
902
903 version (D_InlineAsm_X86)
904 {
905 // SSE2 aligned version is 1189% faster
906 if (sse2() && a.length >= 64)
907 {
908 auto n = aptr + (a.length & ~63);
909
910 uint l = cast(ubyte) value;
911 l |= (l << 8);
912 l |= (l << 16);
913
914 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
915 {
916 asm // unaligned case
917 {
918 mov ESI, aptr;
919 mov EDI, n;
920 mov EAX, bptr;
921 movd XMM4, l;
922 pshufd XMM4, XMM4, 0;
923
924 align 8;
925 startsubsse2u:
926 add ESI, 64;
927 movdqu XMM0, [EAX];
928 movdqu XMM1, [EAX+16];
929 movdqu XMM2, [EAX+32];
930 movdqu XMM3, [EAX+48];
931 add EAX, 64;
932 psubb XMM0, XMM4;
933 psubb XMM1, XMM4;
934 psubb XMM2, XMM4;
935 psubb XMM3, XMM4;
936 movdqu [ESI -64], XMM0;
937 movdqu [ESI+16-64], XMM1;
938 movdqu [ESI+32-64], XMM2;
939 movdqu [ESI+48-64], XMM3;
940 cmp ESI, EDI;
941 jb startsubsse2u;
942
943 mov aptr, ESI;
944 mov bptr, EAX;
945 }
946 }
947 else
948 {
949 asm // aligned case
950 {
951 mov ESI, aptr;
952 mov EDI, n;
953 mov EAX, bptr;
954 movd XMM4, l;
955 pshufd XMM4, XMM4, 0;
956
957 align 8;
958 startsubsse2a:
959 add ESI, 64;
960 movdqa XMM0, [EAX];
961 movdqa XMM1, [EAX+16];
962 movdqa XMM2, [EAX+32];
963 movdqa XMM3, [EAX+48];
964 add EAX, 64;
965 psubb XMM0, XMM4;
966 psubb XMM1, XMM4;
967 psubb XMM2, XMM4;
968 psubb XMM3, XMM4;
969 movdqa [ESI -64], XMM0;
970 movdqa [ESI+16-64], XMM1;
971 movdqa [ESI+32-64], XMM2;
972 movdqa [ESI+48-64], XMM3;
973 cmp ESI, EDI;
974 jb startsubsse2a;
975
976 mov aptr, ESI;
977 mov bptr, EAX;
978 }
979 }
980 }
981 else
982 // MMX version is 1079% faster
983 if (mmx() && a.length >= 32)
984 {
985 auto n = aptr + (a.length & ~31);
986
987 uint l = cast(ubyte) value;
988 l |= (l << 8);
989
990 asm
991 {
992 mov ESI, aptr;
993 mov EDI, n;
994 mov EAX, bptr;
995 movd MM4, l;
996 pshufw MM4, MM4, 0;
997
998 align 4;
999 startsubmmx:
1000 add ESI, 32;
1001 movq MM0, [EAX];
1002 movq MM1, [EAX+8];
1003 movq MM2, [EAX+16];
1004 movq MM3, [EAX+24];
1005 add EAX, 32;
1006 psubb MM0, MM4;
1007 psubb MM1, MM4;
1008 psubb MM2, MM4;
1009 psubb MM3, MM4;
1010 movq [ESI -32], MM0;
1011 movq [ESI+8 -32], MM1;
1012 movq [ESI+16-32], MM2;
1013 movq [ESI+24-32], MM3;
1014 cmp ESI, EDI;
1015 jb startsubmmx;
1016
1017 emms;
1018 mov aptr, ESI;
1019 mov bptr, EAX;
1020 }
1021 }
1022 // Trying to be fair and treat a normal 32-bit CPU the same way as we do the SIMD units, with unrolled asm. There aren't enough registers, really.
1023 else
1024 if (a.length >= 4)
1025 {
1026 auto n = aptr + (a.length & ~3);
1027 asm
1028 {
1029 mov ESI, aptr;
1030 mov EDI, n;
1031 mov EAX, bptr;
1032 mov CL, value;
1033
1034 align 4;
1035 startsub386:
1036 add ESI, 4;
1037 mov DX, [EAX];
1038 mov BX, [EAX+2];
1039 add EAX, 4;
1040 sub BL, CL;
1041 sub BH, CL;
1042 sub DL, CL;
1043 sub DH, CL;
1044 mov [ESI -4], DX;
1045 mov [ESI+2 -4], BX;
1046 cmp ESI, EDI;
1047 jb startsub386;
1048
1049 mov aptr, ESI;
1050 mov bptr, EAX;
1051 }
1052 }
1053 }
1054
1055 while (aptr < aend)
1056 *aptr++ = cast(T)(*bptr++ - value);
1057
1058 return a;
1059 }
1060
1061 unittest
1062 {
1063 printf("_arraySliceExpMinSliceAssign_g unittest\n");
1064
1065 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1066 {
1067 version (log) printf(" cpuid %d\n", cpuid);
1068
1069 for (int j = 0; j < 2; j++)
1070 {
1071 const int dim = 67;
1072 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1073 a = a[j .. dim + j]; // misalign for second iteration
1074 T[] b = new T[dim + j];
1075 b = b[j .. dim + j];
1076 T[] c = new T[dim + j];
1077 c = c[j .. dim + j];
1078
1079 for (int i = 0; i < dim; i++)
1080 { a[i] = cast(T)i;
1081 b[i] = cast(T)(i + 7);
1082 c[i] = cast(T)(i * 2);
1083 }
1084
1085 a[] = c[];
1086 c[] = b[] - 6;
1087
1088 for (int i = 0; i < dim; i++)
1089 {
1090 if (c[i] != cast(T)(b[i] - 6))
1091 {
1092 printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
1093 assert(0);
1094 }
1095 }
1096 }
1097 }
1098 }
1099
1100
1101 /* ======================================================================== */
1102
1103 /***********************
1104 * Computes:
1105 * a[] = value - b[]
1106 */
1107
1108 T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
1109 {
1110 return _arrayExpSliceMinSliceAssign_g(a, b, value);
1111 }
1112
1113 T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
1114 {
1115 return _arrayExpSliceMinSliceAssign_g(a, b, value);
1116 }
1117
1118 T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
1119 in
1120 {
1121 assert(a.length == b.length);
1122 assert(disjoint(a, b));
1123 }
1124 body
1125 {
1126 //printf("_arrayExpSliceMinSliceAssign_g()\n");
1127 auto aptr = a.ptr;
1128 auto aend = aptr + a.length;
1129 auto bptr = b.ptr;
1130
1131 version (D_InlineAsm_X86)
1132 {
1133 // SSE2 aligned version is 8748% faster
1134 if (sse2() && a.length >= 64)
1135 {
1136 auto n = aptr + (a.length & ~63);
1137
1138 uint l = cast(ubyte) value;
1139 l |= (l << 8);
1140 l |= (l << 16);
1141
1142 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1143 {
1144 asm // unaligned case
1145 {
1146 mov ESI, aptr;
1147 mov EDI, n;
1148 mov EAX, bptr;
1149 movd XMM4, l;
1150 pshufd XMM4, XMM4, 0;
1151
1152 align 8;
1153 startsubrsse2u:
1154 add ESI, 64;
1155 movdqa XMM5, XMM4;
1156 movdqa XMM6, XMM4;
1157 movdqu XMM0, [EAX];
1158 movdqu XMM1, [EAX+16];
1159 psubb XMM5, XMM0;
1160 psubb XMM6, XMM1;
1161 movdqu [ESI -64], XMM5;
1162 movdqu [ESI+16-64], XMM6;
1163 movdqa XMM5, XMM4;
1164 movdqa XMM6, XMM4;
1165 movdqu XMM2, [EAX+32];
1166 movdqu XMM3, [EAX+48];
1167 add EAX, 64;
1168 psubb XMM5, XMM2;
1169 psubb XMM6, XMM3;
1170 movdqu [ESI+32-64], XMM5;
1171 movdqu [ESI+48-64], XMM6;
1172 cmp ESI, EDI;
1173 jb startsubrsse2u;
1174
1175 mov aptr, ESI;
1176 mov bptr, EAX;
1177 }
1178 }
1179 else
1180 {
1181 asm // aligned case
1182 {
1183 mov ESI, aptr;
1184 mov EDI, n;
1185 mov EAX, bptr;
1186 movd XMM4, l;
1187 pshufd XMM4, XMM4, 0;
1188
1189 align 8;
1190 startsubrsse2a:
1191 add ESI, 64;
1192 movdqa XMM5, XMM4;
1193 movdqa XMM6, XMM4;
1194 movdqa XMM0, [EAX];
1195 movdqa XMM1, [EAX+16];
1196 psubb XMM5, XMM0;
1197 psubb XMM6, XMM1;
1198 movdqa [ESI -64], XMM5;
1199 movdqa [ESI+16-64], XMM6;
1200 movdqa XMM5, XMM4;
1201 movdqa XMM6, XMM4;
1202 movdqa XMM2, [EAX+32];
1203 movdqa XMM3, [EAX+48];
1204 add EAX, 64;
1205 psubb XMM5, XMM2;
1206 psubb XMM6, XMM3;
1207 movdqa [ESI+32-64], XMM5;
1208 movdqa [ESI+48-64], XMM6;
1209 cmp ESI, EDI;
1210 jb startsubrsse2a;
1211
1212 mov aptr, ESI;
1213 mov bptr, EAX;
1214 }
1215 }
1216 }
1217 else
1218 // MMX version is 7397% faster
1219 if (mmx() && a.length >= 32)
1220 {
1221 auto n = aptr + (a.length & ~31);
1222
1223 uint l = cast(ubyte) value;
1224 l |= (l << 8);
1225
1226 asm
1227 {
1228 mov ESI, aptr;
1229 mov EDI, n;
1230 mov EAX, bptr;
1231 movd MM4, l;
1232 pshufw MM4, MM4, 0;
1233
1234 align 4;
1235 startsubrmmx:
1236 add ESI, 32;
1237 movq MM5, MM4;
1238 movq MM6, MM4;
1239 movq MM0, [EAX];
1240 movq MM1, [EAX+8];
1241 psubb MM5, MM0;
1242 psubb MM6, MM1;
1243 movq [ESI -32], MM5;
1244 movq [ESI+8 -32], MM6;
1245 movq MM5, MM4;
1246 movq MM6, MM4;
1247 movq MM2, [EAX+16];
1248 movq MM3, [EAX+24];
1249 add EAX, 32;
1250 psubb MM5, MM2;
1251 psubb MM6, MM3;
1252 movq [ESI+16-32], MM5;
1253 movq [ESI+24-32], MM6;
1254 cmp ESI, EDI;
1255 jb startsubrmmx;
1256
1257 emms;
1258 mov aptr, ESI;
1259 mov bptr, EAX;
1260 }
1261 }
1262
1263 }
1264
1265 while (aptr < aend)
1266 *aptr++ = cast(T)(value - *bptr++);
1267
1268 return a;
1269 }
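a[] = value - b[] is the one pattern where the splatted constant cannot simply be the psubb source operand: psubb subtracts the second operand from the first and overwrites the first, and here the constant is the minuend. That is why the loops above copy XMM4 into XMM5/XMM6 (MM4 into MM5/MM6) every block and subtract the loaded bytes from those scratch copies. Element-wise the operation is simply:

    unittest
    {
        byte value = 6;
        byte[4] b = [1, 2, 3, 4];
        byte[4] a;
        foreach (i; 0 .. 4)
            a[i] = cast(byte)(value - b[i]);  // 5, 4, 3, 2: value is the minuend each time
    }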
1270
1271 unittest
1272 {
1273 printf("_arrayExpSliceMinSliceAssign_g unittest\n");
1274
1275 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1276 {
1277 version (log) printf(" cpuid %d\n", cpuid);
1278
1279 for (int j = 0; j < 2; j++)
1280 {
1281 const int dim = 67;
1282 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1283 a = a[j .. dim + j]; // misalign for second iteration
1284 T[] b = new T[dim + j];
1285 b = b[j .. dim + j];
1286 T[] c = new T[dim + j];
1287 c = c[j .. dim + j];
1288
1289 for (int i = 0; i < dim; i++)
1290 { a[i] = cast(T)i;
1291 b[i] = cast(T)(i + 7);
1292 c[i] = cast(T)(i * 2);
1293 }
1294
1295 a[] = c[];
1296 c[] = 6 - b[];
1297
1298 for (int i = 0; i < dim; i++)
1299 {
1300 if (c[i] != cast(T)(6 - b[i]))
1301 {
1302 printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
1303 assert(0);
1304 }
1305 }
1306 }
1307 }
1308 }
1309
1310
1311 /* ======================================================================== */
1312
1313 /***********************
1314 * Computes:
1315 * a[] = b[] - c[]
1316 */
1317
1318 T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
1319 {
1320 return _arraySliceSliceMinSliceAssign_g(a, c, b);
1321 }
1322
1323 T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
1324 {
1325 return _arraySliceSliceMinSliceAssign_g(a, c, b);
1326 }
1327
1328 T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
1329 in
1330 {
1331 assert(a.length == b.length && b.length == c.length);
1332 assert(disjoint(a, b));
1333 assert(disjoint(a, c));
1334 assert(disjoint(b, c));
1335 }
1336 body
1337 {
1338 auto aptr = a.ptr;
1339 auto aend = aptr + a.length;
1340 auto bptr = b.ptr;
1341 auto cptr = c.ptr;
1342
1343 version (D_InlineAsm_X86)
1344 {
1345 // SSE2 aligned version is 5756% faster
1346 if (sse2() && a.length >= 64)
1347 {
1348 auto n = aptr + (a.length & ~63);
1349
1350 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1351 {
1352 asm // unaligned case
1353 {
1354 mov ESI, aptr;
1355 mov EDI, n;
1356 mov EAX, bptr;
1357 mov ECX, cptr;
1358
1359 align 8;
1360 startsublsse2u:
1361 add ESI, 64;
1362 movdqu XMM0, [EAX];
1363 movdqu XMM1, [EAX+16];
1364 movdqu XMM2, [EAX+32];
1365 movdqu XMM3, [EAX+48];
1366 add EAX, 64;
1367 movdqu XMM4, [ECX];
1368 movdqu XMM5, [ECX+16];
1369 movdqu XMM6, [ECX+32];
1370 movdqu XMM7, [ECX+48];
1371 add ECX, 64;
1372 psubb XMM0, XMM4;
1373 psubb XMM1, XMM5;
1374 psubb XMM2, XMM6;
1375 psubb XMM3, XMM7;
1376 movdqu [ESI -64], XMM0;
1377 movdqu [ESI+16-64], XMM1;
1378 movdqu [ESI+32-64], XMM2;
1379 movdqu [ESI+48-64], XMM3;
1380 cmp ESI, EDI;
1381 jb startsublsse2u;
1382
1383 mov aptr, ESI;
1384 mov bptr, EAX;
1385 mov cptr, ECX;
1386 }
1387 }
1388 else
1389 {
1390 asm // aligned case
1391 {
1392 mov ESI, aptr;
1393 mov EDI, n;
1394 mov EAX, bptr;
1395 mov ECX, cptr;
1396
1397 align 8;
1398 startsublsse2a:
1399 add ESI, 64;
1400 movdqa XMM0, [EAX];
1401 movdqa XMM1, [EAX+16];
1402 movdqa XMM2, [EAX+32];
1403 movdqa XMM3, [EAX+48];
1404 add EAX, 64;
1405 movdqa XMM4, [ECX];
1406 movdqa XMM5, [ECX+16];
1407 movdqa XMM6, [ECX+32];
1408 movdqa XMM7, [ECX+48];
1409 add ECX, 64;
1410 psubb XMM0, XMM4;
1411 psubb XMM1, XMM5;
1412 psubb XMM2, XMM6;
1413 psubb XMM3, XMM7;
1414 movdqa [ESI -64], XMM0;
1415 movdqa [ESI+16-64], XMM1;
1416 movdqa [ESI+32-64], XMM2;
1417 movdqa [ESI+48-64], XMM3;
1418 cmp ESI, EDI;
1419 jb startsublsse2a;
1420
1421 mov aptr, ESI;
1422 mov bptr, EAX;
1423 mov cptr, ECX;
1424 }
1425 }
1426 }
1427 else
1428 // MMX version is 4428% faster
1429 if (mmx() && a.length >= 32)
1430 {
1431 auto n = aptr + (a.length & ~31);
1432
1433 asm
1434 {
1435 mov ESI, aptr;
1436 mov EDI, n;
1437 mov EAX, bptr;
1438 mov ECX, cptr;
1439
1440 align 8;
1441 startsublmmx:
1442 add ESI, 32;
1443 movq MM0, [EAX];
1444 movq MM1, [EAX+8];
1445 movq MM2, [EAX+16];
1446 movq MM3, [EAX+24];
1447 add EAX, 32;
1448 movq MM4, [ECX];
1449 movq MM5, [ECX+8];
1450 movq MM6, [ECX+16];
1451 movq MM7, [ECX+24];
1452 add ECX, 32;
1453 psubb MM0, MM4;
1454 psubb MM1, MM5;
1455 psubb MM2, MM6;
1456 psubb MM3, MM7;
1457 movq [ESI -32], MM0;
1458 movq [ESI+8 -32], MM1;
1459 movq [ESI+16-32], MM2;
1460 movq [ESI+24-32], MM3;
1461 cmp ESI, EDI;
1462 jb startsublmmx;
1463
1464 emms;
1465 mov aptr, ESI;
1466 mov bptr, EAX;
1467 mov cptr, ECX;
1468 }
1469 }
1470 }
1471
1472 while (aptr < aend)
1473 *aptr++ = cast(T)(*bptr++ - *cptr++);
1474
1475 return a;
1476 }
1477
1478 unittest
1479 {
1480 printf("_arraySliceSliceMinSliceAssign_g unittest\n");
1481
1482 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1483 {
1484 version (log) printf(" cpuid %d\n", cpuid);
1485
1486 for (int j = 0; j < 2; j++)
1487 {
1488 const int dim = 67;
1489 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1490 a = a[j .. dim + j]; // misalign for second iteration
1491 T[] b = new T[dim + j];
1492 b = b[j .. dim + j];
1493 T[] c = new T[dim + j];
1494 c = c[j .. dim + j];
1495
1496 for (int i = 0; i < dim; i++)
1497 { a[i] = cast(T)i;
1498 b[i] = cast(T)(i + 7);
1499 c[i] = cast(T)(i * 2);
1500 }
1501
1502 c[] = a[] - b[];
1503
1504 for (int i = 0; i < dim; i++)
1505 {
1506 if (c[i] != cast(T)(a[i] - b[i]))
1507 {
1508 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1509 assert(0);
1510 }
1511 }
1512 }
1513 }
1514 }
1515
1516
1517 /* ======================================================================== */
1518
1519 /***********************
1520 * Computes:
1521 * a[] -= value
1522 */
1523
1524 T[] _arrayExpSliceMinass_a(T[] a, T value)
1525 {
1526 return _arrayExpSliceMinass_g(a, value);
1527 }
1528
1529 T[] _arrayExpSliceMinass_h(T[] a, T value)
1530 {
1531 return _arrayExpSliceMinass_g(a, value);
1532 }
1533
1534 T[] _arrayExpSliceMinass_g(T[] a, T value)
1535 {
1536 //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1537 auto aptr = a.ptr;
1538 auto aend = aptr + a.length;
1539
1540 version (D_InlineAsm_X86)
1541 {
1542 // SSE2 aligned version is 1577% faster
1543 if (sse2() && a.length >= 64)
1544 {
1545 auto n = aptr + (a.length & ~63);
1546
1547 uint l = cast(ubyte) value;
1548 l |= (l << 8);
1549 l |= (l << 16);
1550
1551 if (((cast(uint) aptr) & 15) != 0)
1552 {
1553 asm // unaligned case
1554 {
1555 mov ESI, aptr;
1556 mov EDI, n;
1557 movd XMM4, l;
1558 pshufd XMM4, XMM4, 0;
1559
1560 align 8;
1561 startsubasssse2u:
1562 movdqu XMM0, [ESI];
1563 movdqu XMM1, [ESI+16];
1564 movdqu XMM2, [ESI+32];
1565 movdqu XMM3, [ESI+48];
1566 add ESI, 64;
1567 psubb XMM0, XMM4;
1568 psubb XMM1, XMM4;
1569 psubb XMM2, XMM4;
1570 psubb XMM3, XMM4;
1571 movdqu [ESI -64], XMM0;
1572 movdqu [ESI+16-64], XMM1;
1573 movdqu [ESI+32-64], XMM2;
1574 movdqu [ESI+48-64], XMM3;
1575 cmp ESI, EDI;
1576 jb startsubasssse2u;
1577
1578 mov aptr, ESI;
1579 }
1580 }
1581 else
1582 {
1583 asm // aligned case
1584 {
1585 mov ESI, aptr;
1586 mov EDI, n;
1587 movd XMM4, l;
1588 pshufd XMM4, XMM4, 0;
1589
1590 align 8;
1591 startsubasssse2a:
1592 movdqa XMM0, [ESI];
1593 movdqa XMM1, [ESI+16];
1594 movdqa XMM2, [ESI+32];
1595 movdqa XMM3, [ESI+48];
1596 add ESI, 64;
1597 psubb XMM0, XMM4;
1598 psubb XMM1, XMM4;
1599 psubb XMM2, XMM4;
1600 psubb XMM3, XMM4;
1601 movdqa [ESI -64], XMM0;
1602 movdqa [ESI+16-64], XMM1;
1603 movdqa [ESI+32-64], XMM2;
1604 movdqa [ESI+48-64], XMM3;
1605 cmp ESI, EDI;
1606 jb startsubasssse2a;
1607
1608 mov aptr, ESI;
1609 }
1610 }
1611 }
1612 else
1613 // MMX version is 1577% faster
1614 if (mmx() && a.length >= 32)
1615 {
1616
1617 auto n = aptr + (a.length & ~31);
1618
1619 uint l = cast(ubyte) value;
1620 l |= (l << 8);
1621
1622 asm
1623 {
1624 mov ESI, aptr;
1625 mov EDI, n;
1626 movd MM4, l;
1627 pshufw MM4, MM4, 0;
1628
1629 align 8;
1630 startsubassmmx:
1631 movq MM0, [ESI];
1632 movq MM1, [ESI+8];
1633 movq MM2, [ESI+16];
1634 movq MM3, [ESI+24];
1635 add ESI, 32;
1636 psubb MM0, MM4;
1637 psubb MM1, MM4;
1638 psubb MM2, MM4;
1639 psubb MM3, MM4;
1640 movq [ESI -32], MM0;
1641 movq [ESI+8 -32], MM1;
1642 movq [ESI+16-32], MM2;
1643 movq [ESI+24-32], MM3;
1644 cmp ESI, EDI;
1645 jb startsubassmmx;
1646
1647 emms;
1648 mov aptr, ESI;
1649 }
1650 }
1651 }
1652
1653 while (aptr < aend)
1654 *aptr++ -= value;
1655
1656 return a;
1657 }
1658
1659 unittest
1660 {
1661 printf("_arrayExpSliceMinass_g unittest\n");
1662
1663 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1664 {
1665 version (log) printf(" cpuid %d\n", cpuid);
1666
1667 for (int j = 0; j < 2; j++)
1668 {
1669 const int dim = 67;
1670 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1671 a = a[j .. dim + j]; // misalign for second iteration
1672 T[] b = new T[dim + j];
1673 b = b[j .. dim + j];
1674 T[] c = new T[dim + j];
1675 c = c[j .. dim + j];
1676
1677 for (int i = 0; i < dim; i++)
1678 { a[i] = cast(T)i;
1679 b[i] = cast(T)(i + 7);
1680 c[i] = cast(T)(i * 2);
1681 }
1682
1683 a[] = c[];
1684 c[] -= 6;
1685
1686 for (int i = 0; i < dim; i++)
1687 {
1688 if (c[i] != cast(T)(a[i] - 6))
1689 {
1690 printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
1691 assert(0);
1692 }
1693 }
1694 }
1695 }
1696 }
1697
1698
1699 /* ======================================================================== */
1700
1701 /***********************
1702 * Computes:
1703 * a[] -= b[]
1704 */
1705
1706 T[] _arraySliceSliceMinass_a(T[] a, T[] b)
1707 {
1708 return _arraySliceSliceMinass_g(a, b);
1709 }
1710
1711 T[] _arraySliceSliceMinass_h(T[] a, T[] b)
1712 {
1713 return _arraySliceSliceMinass_g(a, b);
1714 }
1715
1716 T[] _arraySliceSliceMinass_g(T[] a, T[] b)
1717 in
1718 {
1719 assert (a.length == b.length);
1720 assert (disjoint(a, b));
1721 }
1722 body
1723 {
1724 //printf("_arraySliceSliceMinass_g()\n");
1725 auto aptr = a.ptr;
1726 auto aend = aptr + a.length;
1727 auto bptr = b.ptr;
1728
1729 version (D_InlineAsm_X86)
1730 {
1731 // SSE2 aligned version is 4800% faster
1732 if (sse2() && a.length >= 64)
1733 {
1734 auto n = aptr + (a.length & ~63);
1735
1736 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1737 {
1738 asm // unaligned case
1739 {
1740 mov ESI, aptr;
1741 mov EDI, n;
1742 mov ECX, bptr;
1743
1744 align 8;
1745 startsubasslsse2u:
1746 movdqu XMM0, [ESI];
1747 movdqu XMM1, [ESI+16];
1748 movdqu XMM2, [ESI+32];
1749 movdqu XMM3, [ESI+48];
1750 add ESI, 64;
1751 movdqu XMM4, [ECX];
1752 movdqu XMM5, [ECX+16];
1753 movdqu XMM6, [ECX+32];
1754 movdqu XMM7, [ECX+48];
1755 add ECX, 64;
1756 psubb XMM0, XMM4;
1757 psubb XMM1, XMM5;
1758 psubb XMM2, XMM6;
1759 psubb XMM3, XMM7;
1760 movdqu [ESI -64], XMM0;
1761 movdqu [ESI+16-64], XMM1;
1762 movdqu [ESI+32-64], XMM2;
1763 movdqu [ESI+48-64], XMM3;
1764 cmp ESI, EDI;
1765 jb startsubasslsse2u;
1766
1767 mov aptr, ESI;
1768 mov bptr, ECX;
1769 }
1770 }
1771 else
1772 {
1773 asm // aligned case
1774 {
1775 mov ESI, aptr;
1776 mov EDI, n;
1777 mov ECX, bptr;
1778
1779 align 8;
1780 startsubasslsse2a:
1781 movdqa XMM0, [ESI];
1782 movdqa XMM1, [ESI+16];
1783 movdqa XMM2, [ESI+32];
1784 movdqa XMM3, [ESI+48];
1785 add ESI, 64;
1786 movdqa XMM4, [ECX];
1787 movdqa XMM5, [ECX+16];
1788 movdqa XMM6, [ECX+32];
1789 movdqa XMM7, [ECX+48];
1790 add ECX, 64;
1791 psubb XMM0, XMM4;
1792 psubb XMM1, XMM5;
1793 psubb XMM2, XMM6;
1794 psubb XMM3, XMM7;
1795 movdqa [ESI -64], XMM0;
1796 movdqa [ESI+16-64], XMM1;
1797 movdqa [ESI+32-64], XMM2;
1798 movdqa [ESI+48-64], XMM3;
1799 cmp ESI, EDI;
1800 jb startsubasslsse2a;
1801
1802 mov aptr, ESI;
1803 mov bptr, ECX;
1804 }
1805 }
1806 }
1807 else
1808 // MMX version is 3107% faster
1809 if (mmx() && a.length >= 32)
1810 {
1811
1812 auto n = aptr + (a.length & ~31);
1813
1814 asm
1815 {
1816 mov ESI, aptr;
1817 mov EDI, n;
1818 mov ECX, bptr;
1819
1820 align 8;
1821 startsubasslmmx:
1822 movq MM0, [ESI];
1823 movq MM1, [ESI+8];
1824 movq MM2, [ESI+16];
1825 movq MM3, [ESI+24];
1826 add ESI, 32;
1827 movq MM4, [ECX];
1828 movq MM5, [ECX+8];
1829 movq MM6, [ECX+16];
1830 movq MM7, [ECX+24];
1831 add ECX, 32;
1832 psubb MM0, MM4;
1833 psubb MM1, MM5;
1834 psubb MM2, MM6;
1835 psubb MM3, MM7;
1836 movq [ESI -32], MM0;
1837 movq [ESI+8 -32], MM1;
1838 movq [ESI+16-32], MM2;
1839 movq [ESI+24-32], MM3;
1840 cmp ESI, EDI;
1841 jb startsubasslmmx;
1842
1843 emms;
1844 mov aptr, ESI;
1845 mov bptr, ECX;
1846 }
1847 }
1848 }
1849
1850 while (aptr < aend)
1851 *aptr++ -= *bptr++;
1852
1853 return a;
1854 }
1855
1856 unittest
1857 {
1858 printf("_arraySliceSliceMinass_g unittest\n");
1859
1860 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1861 {
1862 version (log) printf(" cpuid %d\n", cpuid);
1863
1864 for (int j = 0; j < 2; j++)
1865 {
1866 const int dim = 67;
1867 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1868 a = a[j .. dim + j]; // misalign for second iteration
1869 T[] b = new T[dim + j];
1870 b = b[j .. dim + j];
1871 T[] c = new T[dim + j];
1872 c = c[j .. dim + j];
1873
1874 for (int i = 0; i < dim; i++)
1875 { a[i] = cast(T)i;
1876 b[i] = cast(T)(i + 7);
1877 c[i] = cast(T)(i * 2);
1878 }
1879
1880 a[] = c[];
1881 c[] -= b[];
1882
1883 for (int i = 0; i < dim; i++)
1884 {
1885 if (c[i] != cast(T)(a[i] - b[i]))
1886 {
1887 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1888 assert(0);
1889 }
1890 }
1891 }
1892 }
1893 }