comparison druntime/src/compiler/dmd/arraybyte.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Tue, 11 Nov 2008 01:52:37 +0100
comparing 758:f04dde6e882c with 759:d3eb054172f9
1 /***************************
2 * D programming language http://www.digitalmars.com/d/
3 * Runtime support for byte array operations.
4 * Based on code originally written by Burton Radons.
5 * Placed in public domain.
6 */
7
8 /* Contains SSE2 and MMX versions of certain operations for char, byte,
9 * and ubyte ('a', 'g' and 'h' suffixes).
10 */
11
12 module rt.arraybyte;
13
14 import util.cpuid;
15
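A minimal sketch of how these entry points get used (editorial addition, not part of the changeset): the compiler lowers array-op expressions on byte-sized element types onto the functions defined below. The function names are taken from this module; the exact lowering shown in the comments is an assumption.

unittest
{
    // Hypothetical illustration: byte[] operands end up in the '_g' entry points,
    // char[] in '_a' and ubyte[] in '_h' (which simply forward to '_g').
    byte[] x = new byte[67];
    byte[] y = new byte[67];
    x[] = y[] + cast(byte)3;   // assumed lowering: _arraySliceExpAddSliceAssign_g(x, 3, y)
    x[] += y[];                // assumed lowering: _arraySliceSliceAddass_g(x, y)
}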
16 version (Unittest)
17 {
18 /* This is so unit tests will test every CPU variant
19 */
20 int cpuid;
21 const int CPUID_MAX = 4;
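// In unittest builds, cpuid selects which code path gets exercised: 0 forces the
// plain scalar loop, 1 allows MMX, 2 allows SSE and 3 allows SSE2 (each only if
// the host CPU actually supports it).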
22 bool mmx() { return cpuid == 1 && util.cpuid.mmx(); }
23 bool sse() { return cpuid == 2 && util.cpuid.sse(); }
24 bool sse2() { return cpuid == 3 && util.cpuid.sse2(); }
25 bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
26 }
27 else
28 {
29 alias util.cpuid.mmx mmx;
30 alias util.cpuid.sse sse;
31 alias util.cpuid.sse2 sse2;
32 alias util.cpuid.amd3dnow amd3dnow;
33 }
34
35 //version = log;
36
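/**********************
 * Returns true if the two slices do not overlap in memory. The 'in' contracts
 * below use this to guarantee the vectorised loops can write a[] without
 * reading back through b[] or c[].
 */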
37 bool disjoint(T)(T[] a, T[] b)
38 {
39 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
40 }
41
42 alias byte T;
43
44 extern (C):
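// All functions below use C linkage, so the compiler can reference them without
// D name mangling when it rewrites array operations into runtime calls.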
45
46 /* ======================================================================== */
47
48
49 /***********************
50 * Computes:
51 * a[] = b[] + value
52 */
53
54 T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
55 {
56 return _arraySliceExpAddSliceAssign_g(a, value, b);
57 }
58
59 T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
60 {
61 return _arraySliceExpAddSliceAssign_g(a, value, b);
62 }
63
64 T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
65 in
66 {
67 assert(a.length == b.length);
68 assert(disjoint(a, b));
69 }
70 body
71 {
72 //printf("_arraySliceExpAddSliceAssign_g()\n");
73 auto aptr = a.ptr;
74 auto aend = aptr + a.length;
75 auto bptr = b.ptr;
76
77 version (D_InlineAsm_X86)
78 {
79 // SSE2 aligned version is 1088% faster
80 if (sse2() && a.length >= 64)
81 {
82 auto n = aptr + (a.length & ~63);
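// n marks the end of the largest 64-byte multiple of a.length; the SSE2 loop
// stops there and the scalar loop at the bottom of the function finishes the tail.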
83
84 uint l = cast(ubyte) value;
85 l |= (l << 8);
86 l |= (l << 16);
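// Replicate the byte value into all four bytes of l; movd/pshufd below then
// broadcast it across XMM4 as sixteen identical bytes for paddb.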
87
88 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
89 {
90 asm // unaligned case
91 {
92 mov ESI, aptr;
93 mov EDI, n;
94 mov EAX, bptr;
95 movd XMM4, l;
96 pshufd XMM4, XMM4, 0;
97
98 align 8;
99 startaddsse2u:
100 add ESI, 64;
101 movdqu XMM0, [EAX];
102 movdqu XMM1, [EAX+16];
103 movdqu XMM2, [EAX+32];
104 movdqu XMM3, [EAX+48];
105 add EAX, 64;
106 paddb XMM0, XMM4;
107 paddb XMM1, XMM4;
108 paddb XMM2, XMM4;
109 paddb XMM3, XMM4;
110 movdqu [ESI -64], XMM0;
111 movdqu [ESI+16-64], XMM1;
112 movdqu [ESI+32-64], XMM2;
113 movdqu [ESI+48-64], XMM3;
114 cmp ESI, EDI;
115 jb startaddsse2u;
116
117 mov aptr, ESI;
118 mov bptr, EAX;
119 }
120 }
121 else
122 {
123 asm // aligned case
124 {
125 mov ESI, aptr;
126 mov EDI, n;
127 mov EAX, bptr;
128 movd XMM4, l;
129 pshufd XMM4, XMM4, 0;
130
131 align 8;
132 startaddsse2a:
133 add ESI, 64;
134 movdqa XMM0, [EAX];
135 movdqa XMM1, [EAX+16];
136 movdqa XMM2, [EAX+32];
137 movdqa XMM3, [EAX+48];
138 add EAX, 64;
139 paddb XMM0, XMM4;
140 paddb XMM1, XMM4;
141 paddb XMM2, XMM4;
142 paddb XMM3, XMM4;
143 movdqa [ESI -64], XMM0;
144 movdqa [ESI+16-64], XMM1;
145 movdqa [ESI+32-64], XMM2;
146 movdqa [ESI+48-64], XMM3;
147 cmp ESI, EDI;
148 jb startaddsse2a;
149
150 mov aptr, ESI;
151 mov bptr, EAX;
152 }
153 }
154 }
155 else
156 // MMX version is 1000% faster
157 if (mmx() && a.length >= 32)
158 {
159 auto n = aptr + (a.length & ~31);
160
161 uint l = cast(ubyte) value;
162 l |= (l << 8);
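// For MMX a 16-bit splat is enough: pshufw below replicates the low word of
// MM4 across the register, giving eight copies of the byte.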
163
164 asm
165 {
166 mov ESI, aptr;
167 mov EDI, n;
168 mov EAX, bptr;
169 movd MM4, l;
170 pshufw MM4, MM4, 0;
171
172 align 4;
173 startaddmmx:
174 add ESI, 32;
175 movq MM0, [EAX];
176 movq MM1, [EAX+8];
177 movq MM2, [EAX+16];
178 movq MM3, [EAX+24];
179 add EAX, 32;
180 paddb MM0, MM4;
181 paddb MM1, MM4;
182 paddb MM2, MM4;
183 paddb MM3, MM4;
184 movq [ESI -32], MM0;
185 movq [ESI+8 -32], MM1;
186 movq [ESI+16-32], MM2;
187 movq [ESI+24-32], MM3;
188 cmp ESI, EDI;
189 jb startaddmmx;
190
191 emms;
192 mov aptr, ESI;
193 mov bptr, EAX;
194 }
195 }
196 /* Trying to be fair and treat a normal 32-bit CPU the same way as we do
197 * the SIMD units, with unrolled asm. There aren't enough registers,
198 * really.
199 */
200 else
201 if (a.length >= 4)
202 {
203
204 auto n = aptr + (a.length & ~3);
205 asm
206 {
207 mov ESI, aptr;
208 mov EDI, n;
209 mov EAX, bptr;
210 mov CL, value;
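// DX and BX each hold two array bytes; CL is added to their low and high
// halves separately, so four bytes are processed per iteration.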
211
212 align 4;
213 startadd386:
214 add ESI, 4;
215 mov DX, [EAX];
216 mov BX, [EAX+2];
217 add EAX, 4;
218 add BL, CL;
219 add BH, CL;
220 add DL, CL;
221 add DH, CL;
222 mov [ESI -4], DX;
223 mov [ESI+2 -4], BX;
224 cmp ESI, EDI;
225 jb startadd386;
226
227 mov aptr, ESI;
228 mov bptr, EAX;
229 }
230
231 }
232 }
233
234 while (aptr < aend)
235 *aptr++ = cast(T)(*bptr++ + value);
236
237 return a;
238 }
239
240 unittest
241 {
242 printf("_arraySliceExpAddSliceAssign_g unittest\n");
243
244 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
245 {
246 version (log) printf(" cpuid %d\n", cpuid);
247
248 for (int j = 0; j < 2; j++)
249 {
250 const int dim = 67;
251 T[] a = new T[dim + j]; // aligned on 16 byte boundary
252 a = a[j .. dim + j]; // misalign for second iteration
253 T[] b = new T[dim + j];
254 b = b[j .. dim + j];
255 T[] c = new T[dim + j];
256 c = c[j .. dim + j];
257
258 for (int i = 0; i < dim; i++)
259 { a[i] = cast(T)i;
260 b[i] = cast(T)(i + 7);
261 c[i] = cast(T)(i * 2);
262 }
263
264 c[] = a[] + 6;
265
266 for (int i = 0; i < dim; i++)
267 {
268 if (c[i] != cast(T)(a[i] + 6))
269 {
270 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
271 assert(0);
272 }
273 }
274 }
275 }
276 }
277
278
279 /* ======================================================================== */
280
281 /***********************
282 * Computes:
283 * a[] = b[] + c[]
284 */
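// Editorial note: the parameters arrive as (a, c, b) although the result is
// a[] = b[] + c[]; the other two-slice variants below use the same ordering.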
285
286 T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
287 {
288 return _arraySliceSliceAddSliceAssign_g(a, c, b);
289 }
290
291 T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
292 {
293 return _arraySliceSliceAddSliceAssign_g(a, c, b);
294 }
295
296 T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
297 in
298 {
299 assert(a.length == b.length && b.length == c.length);
300 assert(disjoint(a, b));
301 assert(disjoint(a, c));
302 assert(disjoint(b, c));
303 }
304 body
305 {
306 //printf("_arraySliceSliceAddSliceAssign_g()\n");
307 auto aptr = a.ptr;
308 auto aend = aptr + a.length;
309 auto bptr = b.ptr;
310 auto cptr = c.ptr;
311
312 version (D_InlineAsm_X86)
313 {
314 // SSE2 aligned version is 5739% faster
315 if (sse2() && a.length >= 64)
316 {
317 auto n = aptr + (a.length & ~63);
318
319 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
320 {
321 version (log) printf("\tsse2 unaligned\n");
322 asm // unaligned case
323 {
324 mov ESI, aptr;
325 mov EDI, n;
326 mov EAX, bptr;
327 mov ECX, cptr;
328
329 align 8;
330 startaddlsse2u:
331 add ESI, 64;
332 movdqu XMM0, [EAX];
333 movdqu XMM1, [EAX+16];
334 movdqu XMM2, [EAX+32];
335 movdqu XMM3, [EAX+48];
336 add EAX, 64;
337 movdqu XMM4, [ECX];
338 movdqu XMM5, [ECX+16];
339 movdqu XMM6, [ECX+32];
340 movdqu XMM7, [ECX+48];
341 add ECX, 64;
342 paddb XMM0, XMM4;
343 paddb XMM1, XMM5;
344 paddb XMM2, XMM6;
345 paddb XMM3, XMM7;
346 movdqu [ESI -64], XMM0;
347 movdqu [ESI+16-64], XMM1;
348 movdqu [ESI+32-64], XMM2;
349 movdqu [ESI+48-64], XMM3;
350 cmp ESI, EDI;
351 jb startaddlsse2u;
352
353 mov aptr, ESI;
354 mov bptr, EAX;
355 mov cptr, ECX;
356 }
357 }
358 else
359 {
360 version (log) printf("\tsse2 aligned\n");
361 asm // aligned case
362 {
363 mov ESI, aptr;
364 mov EDI, n;
365 mov EAX, bptr;
366 mov ECX, cptr;
367
368 align 8;
369 startaddlsse2a:
370 add ESI, 64;
371 movdqa XMM0, [EAX];
372 movdqa XMM1, [EAX+16];
373 movdqa XMM2, [EAX+32];
374 movdqa XMM3, [EAX+48];
375 add EAX, 64;
376 movdqa XMM4, [ECX];
377 movdqa XMM5, [ECX+16];
378 movdqa XMM6, [ECX+32];
379 movdqa XMM7, [ECX+48];
380 add ECX, 64;
381 paddb XMM0, XMM4;
382 paddb XMM1, XMM5;
383 paddb XMM2, XMM6;
384 paddb XMM3, XMM7;
385 movdqa [ESI -64], XMM0;
386 movdqa [ESI+16-64], XMM1;
387 movdqa [ESI+32-64], XMM2;
388 movdqa [ESI+48-64], XMM3;
389 cmp ESI, EDI;
390 jb startaddlsse2a;
391
392 mov aptr, ESI;
393 mov bptr, EAX;
394 mov cptr, ECX;
395 }
396 }
397 }
398 else
399 // MMX version is 4428% faster
400 if (mmx() && a.length >= 32)
401 {
402 version (log) printf("\tmmx\n");
403 auto n = aptr + (a.length & ~31);
404
405 asm
406 {
407 mov ESI, aptr;
408 mov EDI, n;
409 mov EAX, bptr;
410 mov ECX, cptr;
411
412 align 4;
413 startaddlmmx:
414 add ESI, 32;
415 movq MM0, [EAX];
416 movq MM1, [EAX+8];
417 movq MM2, [EAX+16];
418 movq MM3, [EAX+24];
419 add EAX, 32;
420 movq MM4, [ECX];
421 movq MM5, [ECX+8];
422 movq MM6, [ECX+16];
423 movq MM7, [ECX+24];
424 add ECX, 32;
425 paddb MM0, MM4;
426 paddb MM1, MM5;
427 paddb MM2, MM6;
428 paddb MM3, MM7;
429 movq [ESI -32], MM0;
430 movq [ESI+8 -32], MM1;
431 movq [ESI+16-32], MM2;
432 movq [ESI+24-32], MM3;
433 cmp ESI, EDI;
434 jb startaddlmmx;
435
436 emms;
437 mov aptr, ESI;
438 mov bptr, EAX;
439 mov cptr, ECX;
440 }
441 }
442 }
443
444 version (log) if (aptr < aend) printf("\tbase\n");
445 while (aptr < aend)
446 *aptr++ = cast(T)(*bptr++ + *cptr++);
447
448 return a;
449 }
450
451 unittest
452 {
453 printf("_arraySliceSliceAddSliceAssign_g unittest\n");
454
455 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
456 {
457 version (log) printf(" cpuid %d\n", cpuid);
458
459 for (int j = 0; j < 2; j++)
460 {
461 const int dim = 67;
462 T[] a = new T[dim + j]; // aligned on 16 byte boundary
463 a = a[j .. dim + j]; // misalign for second iteration
464 T[] b = new T[dim + j];
465 b = b[j .. dim + j];
466 T[] c = new T[dim + j];
467 c = c[j .. dim + j];
468
469 for (int i = 0; i < dim; i++)
470 { a[i] = cast(T)i;
471 b[i] = cast(T)(i + 7);
472 c[i] = cast(T)(i * 2);
473 }
474
475 c[] = a[] + b[];
476
477 for (int i = 0; i < dim; i++)
478 {
479 if (c[i] != cast(T)(a[i] + b[i]))
480 {
481 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
482 assert(0);
483 }
484 }
485 }
486 }
487 }
488
489
490 /* ======================================================================== */
491
492 /***********************
493 * Computes:
494 * a[] += value
495 */
496
497 T[] _arrayExpSliceAddass_a(T[] a, T value)
498 {
499 return _arrayExpSliceAddass_g(a, value);
500 }
501
502 T[] _arrayExpSliceAddass_h(T[] a, T value)
503 {
504 return _arrayExpSliceAddass_g(a, value);
505 }
506
507 T[] _arrayExpSliceAddass_g(T[] a, T value)
508 {
509 //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
510 auto aptr = a.ptr;
511 auto aend = aptr + a.length;
512
513 version (D_InlineAsm_X86)
514 {
515 // SSE2 aligned version is 1578% faster
516 if (sse2() && a.length >= 64)
517 {
518 auto n = aptr + (a.length & ~63);
519
520 uint l = cast(ubyte) value;
521 l |= (l << 8);
522 l |= (l << 16);
523
524 if (((cast(uint) aptr) & 15) != 0)
525 {
526 asm // unaligned case
527 {
528 mov ESI, aptr;
529 mov EDI, n;
530 movd XMM4, l;
531 pshufd XMM4, XMM4, 0;
532
533 align 8;
534 startaddasssse2u:
535 movdqu XMM0, [ESI];
536 movdqu XMM1, [ESI+16];
537 movdqu XMM2, [ESI+32];
538 movdqu XMM3, [ESI+48];
539 add ESI, 64;
540 paddb XMM0, XMM4;
541 paddb XMM1, XMM4;
542 paddb XMM2, XMM4;
543 paddb XMM3, XMM4;
544 movdqu [ESI -64], XMM0;
545 movdqu [ESI+16-64], XMM1;
546 movdqu [ESI+32-64], XMM2;
547 movdqu [ESI+48-64], XMM3;
548 cmp ESI, EDI;
549 jb startaddasssse2u;
550
551 mov aptr, ESI;
552 }
553 }
554 else
555 {
556 asm // aligned case
557 {
558 mov ESI, aptr;
559 mov EDI, n;
560 movd XMM4, l;
561 pshufd XMM4, XMM4, 0;
562
563 align 8;
564 startaddasssse2a:
565 movdqa XMM0, [ESI];
566 movdqa XMM1, [ESI+16];
567 movdqa XMM2, [ESI+32];
568 movdqa XMM3, [ESI+48];
569 add ESI, 64;
570 paddb XMM0, XMM4;
571 paddb XMM1, XMM4;
572 paddb XMM2, XMM4;
573 paddb XMM3, XMM4;
574 movdqa [ESI -64], XMM0;
575 movdqa [ESI+16-64], XMM1;
576 movdqa [ESI+32-64], XMM2;
577 movdqa [ESI+48-64], XMM3;
578 cmp ESI, EDI;
579 jb startaddasssse2a;
580
581 mov aptr, ESI;
582 }
583 }
584 }
585 else
586 // MMX version is 1721% faster
587 if (mmx() && a.length >= 32)
588 {
589
590 auto n = aptr + (a.length & ~31);
591
592 uint l = cast(ubyte) value;
593 l |= (l << 8);
594
595 asm
596 {
597 mov ESI, aptr;
598 mov EDI, n;
599 movd MM4, l;
600 pshufw MM4, MM4, 0;
601
602 align 8;
603 startaddassmmx:
604 movq MM0, [ESI];
605 movq MM1, [ESI+8];
606 movq MM2, [ESI+16];
607 movq MM3, [ESI+24];
608 add ESI, 32;
609 paddb MM0, MM4;
610 paddb MM1, MM4;
611 paddb MM2, MM4;
612 paddb MM3, MM4;
613 movq [ESI -32], MM0;
614 movq [ESI+8 -32], MM1;
615 movq [ESI+16-32], MM2;
616 movq [ESI+24-32], MM3;
617 cmp ESI, EDI;
618 jb startaddassmmx;
619
620 emms;
621 mov aptr, ESI;
622 }
623 }
624 }
625
626 while (aptr < aend)
627 *aptr++ += value;
628
629 return a;
630 }
631
632 unittest
633 {
634 printf("_arrayExpSliceAddass_g unittest\n");
635
636 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
637 {
638 version (log) printf(" cpuid %d\n", cpuid);
639
640 for (int j = 0; j < 2; j++)
641 {
642 const int dim = 67;
643 T[] a = new T[dim + j]; // aligned on 16 byte boundary
644 a = a[j .. dim + j]; // misalign for second iteration
645 T[] b = new T[dim + j];
646 b = b[j .. dim + j];
647 T[] c = new T[dim + j];
648 c = c[j .. dim + j];
649
650 for (int i = 0; i < dim; i++)
651 { a[i] = cast(T)i;
652 b[i] = cast(T)(i + 7);
653 c[i] = cast(T)(i * 2);
654 }
655
656 a[] = c[];
657 c[] += 6;
658
659 for (int i = 0; i < dim; i++)
660 {
661 if (c[i] != cast(T)(a[i] + 6))
662 {
663 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
664 assert(0);
665 }
666 }
667 }
668 }
669 }
670
671
672 /* ======================================================================== */
673
674 /***********************
675 * Computes:
676 * a[] += b[]
677 */
678
679 T[] _arraySliceSliceAddass_a(T[] a, T[] b)
680 {
681 return _arraySliceSliceAddass_g(a, b);
682 }
683
684 T[] _arraySliceSliceAddass_h(T[] a, T[] b)
685 {
686 return _arraySliceSliceAddass_g(a, b);
687 }
688
689 T[] _arraySliceSliceAddass_g(T[] a, T[] b)
690 in
691 {
692 assert (a.length == b.length);
693 assert (disjoint(a, b));
694 }
695 body
696 {
697 //printf("_arraySliceSliceAddass_g()\n");
698 auto aptr = a.ptr;
699 auto aend = aptr + a.length;
700 auto bptr = b.ptr;
701
702 version (D_InlineAsm_X86)
703 {
704 // SSE2 aligned version is 4727% faster
705 if (sse2() && a.length >= 64)
706 {
707 auto n = aptr + (a.length & ~63);
708
709 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
710 {
711 asm // unaligned case
712 {
713 mov ESI, aptr;
714 mov EDI, n;
715 mov ECX, bptr;
716
717 align 8;
718 startaddasslsse2u:
719 movdqu XMM0, [ESI];
720 movdqu XMM1, [ESI+16];
721 movdqu XMM2, [ESI+32];
722 movdqu XMM3, [ESI+48];
723 add ESI, 64;
724 movdqu XMM4, [ECX];
725 movdqu XMM5, [ECX+16];
726 movdqu XMM6, [ECX+32];
727 movdqu XMM7, [ECX+48];
728 add ECX, 64;
729 paddb XMM0, XMM4;
730 paddb XMM1, XMM5;
731 paddb XMM2, XMM6;
732 paddb XMM3, XMM7;
733 movdqu [ESI -64], XMM0;
734 movdqu [ESI+16-64], XMM1;
735 movdqu [ESI+32-64], XMM2;
736 movdqu [ESI+48-64], XMM3;
737 cmp ESI, EDI;
738 jb startaddasslsse2u;
739
740 mov aptr, ESI;
741 mov bptr, ECX;
742 }
743 }
744 else
745 {
746 asm // aligned case
747 {
748 mov ESI, aptr;
749 mov EDI, n;
750 mov ECX, bptr;
751
752 align 8;
753 startaddasslsse2a:
754 movdqa XMM0, [ESI];
755 movdqa XMM1, [ESI+16];
756 movdqa XMM2, [ESI+32];
757 movdqa XMM3, [ESI+48];
758 add ESI, 64;
759 movdqa XMM4, [ECX];
760 movdqa XMM5, [ECX+16];
761 movdqa XMM6, [ECX+32];
762 movdqa XMM7, [ECX+48];
763 add ECX, 64;
764 paddb XMM0, XMM4;
765 paddb XMM1, XMM5;
766 paddb XMM2, XMM6;
767 paddb XMM3, XMM7;
768 movdqa [ESI -64], XMM0;
769 movdqa [ESI+16-64], XMM1;
770 movdqa [ESI+32-64], XMM2;
771 movdqa [ESI+48-64], XMM3;
772 cmp ESI, EDI;
773 jb startaddasslsse2a;
774
775 mov aptr, ESI;
776 mov bptr, ECX;
777 }
778 }
779 }
780 else
781 // MMX version is 3059% faster
782 if (mmx() && a.length >= 32)
783 {
784
785 auto n = aptr + (a.length & ~31);
786
787 asm
788 {
789 mov ESI, aptr;
790 mov EDI, n;
791 mov ECX, bptr;
792
793 align 8;
794 startaddasslmmx:
795 movq MM0, [ESI];
796 movq MM1, [ESI+8];
797 movq MM2, [ESI+16];
798 movq MM3, [ESI+24];
799 add ESI, 32;
800 movq MM4, [ECX];
801 movq MM5, [ECX+8];
802 movq MM6, [ECX+16];
803 movq MM7, [ECX+24];
804 add ECX, 32;
805 paddb MM0, MM4;
806 paddb MM1, MM5;
807 paddb MM2, MM6;
808 paddb MM3, MM7;
809 movq [ESI -32], MM0;
810 movq [ESI+8 -32], MM1;
811 movq [ESI+16-32], MM2;
812 movq [ESI+24-32], MM3;
813 cmp ESI, EDI;
814 jb startaddasslmmx;
815
816 emms;
817 mov aptr, ESI;
818 mov bptr, ECX;
819 }
820 }
821 }
822
823 while (aptr < aend)
824 *aptr++ += *bptr++;
825
826 return a;
827 }
828
829 unittest
830 {
831 printf("_arraySliceSliceAddass_g unittest\n");
832
833 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
834 {
835 version (log) printf(" cpuid %d\n", cpuid);
836
837 for (int j = 0; j < 2; j++)
838 {
839 const int dim = 67;
840 T[] a = new T[dim + j]; // aligned on 16 byte boundary
841 a = a[j .. dim + j]; // misalign for second iteration
842 T[] b = new T[dim + j];
843 b = b[j .. dim + j];
844 T[] c = new T[dim + j];
845 c = c[j .. dim + j];
846
847 for (int i = 0; i < dim; i++)
848 { a[i] = cast(T)i;
849 b[i] = cast(T)(i + 7);
850 c[i] = cast(T)(i * 2);
851 }
852
853 a[] = c[];
854 c[] += b[];
855
856 for (int i = 0; i < dim; i++)
857 {
858 if (c[i] != cast(T)(a[i] + b[i]))
859 {
860 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
861 assert(0);
862 }
863 }
864 }
865 }
866 }
867
868
869 /* ======================================================================== */
870
871
872 /***********************
873 * Computes:
874 * a[] = b[] - value
875 */
876
877 T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
878 {
879 return _arraySliceExpMinSliceAssign_g(a, value, b);
880 }
881
882 T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
883 {
884 return _arraySliceExpMinSliceAssign_g(a, value, b);
885 }
886
887 T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
888 in
889 {
890 assert(a.length == b.length);
891 assert(disjoint(a, b));
892 }
893 body
894 {
895 //printf("_arraySliceExpMinSliceAssign_g()\n");
896 auto aptr = a.ptr;
897 auto aend = aptr + a.length;
898 auto bptr = b.ptr;
899
900 version (D_InlineAsm_X86)
901 {
902 // SSE2 aligned version is 1189% faster
903 if (sse2() && a.length >= 64)
904 {
905 auto n = aptr + (a.length & ~63);
906
907 uint l = cast(ubyte) value;
908 l |= (l << 8);
909 l |= (l << 16);
910
911 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
912 {
913 asm // unaligned case
914 {
915 mov ESI, aptr;
916 mov EDI, n;
917 mov EAX, bptr;
918 movd XMM4, l;
919 pshufd XMM4, XMM4, 0;
920
921 align 8;
922 startsubsse2u:
923 add ESI, 64;
924 movdqu XMM0, [EAX];
925 movdqu XMM1, [EAX+16];
926 movdqu XMM2, [EAX+32];
927 movdqu XMM3, [EAX+48];
928 add EAX, 64;
929 psubb XMM0, XMM4;
930 psubb XMM1, XMM4;
931 psubb XMM2, XMM4;
932 psubb XMM3, XMM4;
933 movdqu [ESI -64], XMM0;
934 movdqu [ESI+16-64], XMM1;
935 movdqu [ESI+32-64], XMM2;
936 movdqu [ESI+48-64], XMM3;
937 cmp ESI, EDI;
938 jb startsubsse2u;
939
940 mov aptr, ESI;
941 mov bptr, EAX;
942 }
943 }
944 else
945 {
946 asm // aligned case
947 {
948 mov ESI, aptr;
949 mov EDI, n;
950 mov EAX, bptr;
951 movd XMM4, l;
952 pshufd XMM4, XMM4, 0;
953
954 align 8;
955 startsubsse2a:
956 add ESI, 64;
957 movdqa XMM0, [EAX];
958 movdqa XMM1, [EAX+16];
959 movdqa XMM2, [EAX+32];
960 movdqa XMM3, [EAX+48];
961 add EAX, 64;
962 psubb XMM0, XMM4;
963 psubb XMM1, XMM4;
964 psubb XMM2, XMM4;
965 psubb XMM3, XMM4;
966 movdqa [ESI -64], XMM0;
967 movdqa [ESI+16-64], XMM1;
968 movdqa [ESI+32-64], XMM2;
969 movdqa [ESI+48-64], XMM3;
970 cmp ESI, EDI;
971 jb startsubsse2a;
972
973 mov aptr, ESI;
974 mov bptr, EAX;
975 }
976 }
977 }
978 else
979 // MMX version is 1079% faster
980 if (mmx() && a.length >= 32)
981 {
982 auto n = aptr + (a.length & ~31);
983
984 uint l = cast(ubyte) value;
985 l |= (l << 8);
986
987 asm
988 {
989 mov ESI, aptr;
990 mov EDI, n;
991 mov EAX, bptr;
992 movd MM4, l;
993 pshufw MM4, MM4, 0;
994
995 align 4;
996 startsubmmx:
997 add ESI, 32;
998 movq MM0, [EAX];
999 movq MM1, [EAX+8];
1000 movq MM2, [EAX+16];
1001 movq MM3, [EAX+24];
1002 add EAX, 32;
1003 psubb MM0, MM4;
1004 psubb MM1, MM4;
1005 psubb MM2, MM4;
1006 psubb MM3, MM4;
1007 movq [ESI -32], MM0;
1008 movq [ESI+8 -32], MM1;
1009 movq [ESI+16-32], MM2;
1010 movq [ESI+24-32], MM3;
1011 cmp ESI, EDI;
1012 jb startsubmmx;
1013
1014 emms;
1015 mov aptr, ESI;
1016 mov bptr, EAX;
1017 }
1018 }
1019 // Trying to be fair and treat a normal 32-bit CPU the same way as we do the SIMD units, with unrolled asm. There aren't enough registers, really.
1020 else
1021 if (a.length >= 4)
1022 {
1023 auto n = aptr + (a.length & ~3);
1024 asm
1025 {
1026 mov ESI, aptr;
1027 mov EDI, n;
1028 mov EAX, bptr;
1029 mov CL, value;
1030
1031 align 4;
1032 startsub386:
1033 add ESI, 4;
1034 mov DX, [EAX];
1035 mov BX, [EAX+2];
1036 add EAX, 4;
1037 sub BL, CL;
1038 sub BH, CL;
1039 sub DL, CL;
1040 sub DH, CL;
1041 mov [ESI -4], DX;
1042 mov [ESI+2 -4], BX;
1043 cmp ESI, EDI;
1044 jb startsub386;
1045
1046 mov aptr, ESI;
1047 mov bptr, EAX;
1048 }
1049 }
1050 }
1051
1052 while (aptr < aend)
1053 *aptr++ = cast(T)(*bptr++ - value);
1054
1055 return a;
1056 }
1057
1058 unittest
1059 {
1060 printf("_arraySliceExpMinSliceAssign_g unittest\n");
1061
1062 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1063 {
1064 version (log) printf(" cpuid %d\n", cpuid);
1065
1066 for (int j = 0; j < 2; j++)
1067 {
1068 const int dim = 67;
1069 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1070 a = a[j .. dim + j]; // misalign for second iteration
1071 T[] b = new T[dim + j];
1072 b = b[j .. dim + j];
1073 T[] c = new T[dim + j];
1074 c = c[j .. dim + j];
1075
1076 for (int i = 0; i < dim; i++)
1077 { a[i] = cast(T)i;
1078 b[i] = cast(T)(i + 7);
1079 c[i] = cast(T)(i * 2);
1080 }
1081
1082 a[] = c[];
1083 c[] = b[] - 6;
1084
1085 for (int i = 0; i < dim; i++)
1086 {
1087 if (c[i] != cast(T)(b[i] - 6))
1088 {
1089 printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
1090 assert(0);
1091 }
1092 }
1093 }
1094 }
1095 }
1096
1097
1098 /* ======================================================================== */
1099
1100 /***********************
1101 * Computes:
1102 * a[] = value - b[]
1103 */
1104
1105 T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
1106 {
1107 return _arrayExpSliceMinSliceAssign_g(a, b, value);
1108 }
1109
1110 T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
1111 {
1112 return _arrayExpSliceMinSliceAssign_g(a, b, value);
1113 }
1114
1115 T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
1116 in
1117 {
1118 assert(a.length == b.length);
1119 assert(disjoint(a, b));
1120 }
1121 body
1122 {
1123 //printf("_arrayExpSliceMinSliceAssign_g()\n");
1124 auto aptr = a.ptr;
1125 auto aend = aptr + a.length;
1126 auto bptr = b.ptr;
1127
1128 version (D_InlineAsm_X86)
1129 {
1130 // SSE2 aligned version is 8748% faster
1131 if (sse2() && a.length >= 64)
1132 {
1133 auto n = aptr + (a.length & ~63);
1134
1135 uint l = cast(ubyte) value;
1136 l |= (l << 8);
1137 l |= (l << 16);
1138
1139 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1140 {
1141 asm // unaligned case
1142 {
1143 mov ESI, aptr;
1144 mov EDI, n;
1145 mov EAX, bptr;
1146 movd XMM4, l;
1147 pshufd XMM4, XMM4, 0;
1148
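// psubb subtracts its source from its destination, so XMM4 (the splatted
// value) is copied into XMM5/XMM6 each iteration and the array data is
// subtracted from the copies, giving value - b[].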
1149 align 8;
1150 startsubrsse2u:
1151 add ESI, 64;
1152 movdqa XMM5, XMM4;
1153 movdqa XMM6, XMM4;
1154 movdqu XMM0, [EAX];
1155 movdqu XMM1, [EAX+16];
1156 psubb XMM5, XMM0;
1157 psubb XMM6, XMM1;
1158 movdqu [ESI -64], XMM5;
1159 movdqu [ESI+16-64], XMM6;
1160 movdqa XMM5, XMM4;
1161 movdqa XMM6, XMM4;
1162 movdqu XMM2, [EAX+32];
1163 movdqu XMM3, [EAX+48];
1164 add EAX, 64;
1165 psubb XMM5, XMM2;
1166 psubb XMM6, XMM3;
1167 movdqu [ESI+32-64], XMM5;
1168 movdqu [ESI+48-64], XMM6;
1169 cmp ESI, EDI;
1170 jb startsubrsse2u;
1171
1172 mov aptr, ESI;
1173 mov bptr, EAX;
1174 }
1175 }
1176 else
1177 {
1178 asm // aligned case
1179 {
1180 mov ESI, aptr;
1181 mov EDI, n;
1182 mov EAX, bptr;
1183 movd XMM4, l;
1184 pshufd XMM4, XMM4, 0;
1185
1186 align 8;
1187 startsubrsse2a:
1188 add ESI, 64;
1189 movdqa XMM5, XMM4;
1190 movdqa XMM6, XMM4;
1191 movdqa XMM0, [EAX];
1192 movdqa XMM1, [EAX+16];
1193 psubb XMM5, XMM0;
1194 psubb XMM6, XMM1;
1195 movdqa [ESI -64], XMM5;
1196 movdqa [ESI+16-64], XMM6;
1197 movdqa XMM5, XMM4;
1198 movdqa XMM6, XMM4;
1199 movdqa XMM2, [EAX+32];
1200 movdqa XMM3, [EAX+48];
1201 add EAX, 64;
1202 psubb XMM5, XMM2;
1203 psubb XMM6, XMM3;
1204 movdqa [ESI+32-64], XMM5;
1205 movdqa [ESI+48-64], XMM6;
1206 cmp ESI, EDI;
1207 jb startsubrsse2a;
1208
1209 mov aptr, ESI;
1210 mov bptr, EAX;
1211 }
1212 }
1213 }
1214 else
1215 // MMX version is 7397% faster
1216 if (mmx() && a.length >= 32)
1217 {
1218 auto n = aptr + (a.length & ~31);
1219
1220 uint l = cast(ubyte) value;
1221 l |= (l << 8);
1222
1223 asm
1224 {
1225 mov ESI, aptr;
1226 mov EDI, n;
1227 mov EAX, bptr;
1228 movd MM4, l;
1229 pshufw MM4, MM4, 0;
1230
1231 align 4;
1232 startsubrmmx:
1233 add ESI, 32;
1234 movq MM5, MM4;
1235 movq MM6, MM4;
1236 movq MM0, [EAX];
1237 movq MM1, [EAX+8];
1238 psubb MM5, MM0;
1239 psubb MM6, MM1;
1240 movq [ESI -32], MM5;
1241 movq [ESI+8 -32], MM6;
1242 movq MM5, MM4;
1243 movq MM6, MM4;
1244 movq MM2, [EAX+16];
1245 movq MM3, [EAX+24];
1246 add EAX, 32;
1247 psubb MM5, MM2;
1248 psubb MM6, MM3;
1249 movq [ESI+16-32], MM5;
1250 movq [ESI+24-32], MM6;
1251 cmp ESI, EDI;
1252 jb startsubrmmx;
1253
1254 emms;
1255 mov aptr, ESI;
1256 mov bptr, EAX;
1257 }
1258 }
1259
1260 }
1261
1262 while (aptr < aend)
1263 *aptr++ = cast(T)(value - *bptr++);
1264
1265 return a;
1266 }
1267
1268 unittest
1269 {
1270 printf("_arrayExpSliceMinSliceAssign_g unittest\n");
1271
1272 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1273 {
1274 version (log) printf(" cpuid %d\n", cpuid);
1275
1276 for (int j = 0; j < 2; j++)
1277 {
1278 const int dim = 67;
1279 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1280 a = a[j .. dim + j]; // misalign for second iteration
1281 T[] b = new T[dim + j];
1282 b = b[j .. dim + j];
1283 T[] c = new T[dim + j];
1284 c = c[j .. dim + j];
1285
1286 for (int i = 0; i < dim; i++)
1287 { a[i] = cast(T)i;
1288 b[i] = cast(T)(i + 7);
1289 c[i] = cast(T)(i * 2);
1290 }
1291
1292 a[] = c[];
1293 c[] = 6 - b[];
1294
1295 for (int i = 0; i < dim; i++)
1296 {
1297 if (c[i] != cast(T)(6 - b[i]))
1298 {
1299 printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
1300 assert(0);
1301 }
1302 }
1303 }
1304 }
1305 }
1306
1307
1308 /* ======================================================================== */
1309
1310 /***********************
1311 * Computes:
1312 * a[] = b[] - c[]
1313 */
1314
1315 T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
1316 {
1317 return _arraySliceSliceMinSliceAssign_g(a, c, b);
1318 }
1319
1320 T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
1321 {
1322 return _arraySliceSliceMinSliceAssign_g(a, c, b);
1323 }
1324
1325 T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
1326 in
1327 {
1328 assert(a.length == b.length && b.length == c.length);
1329 assert(disjoint(a, b));
1330 assert(disjoint(a, c));
1331 assert(disjoint(b, c));
1332 }
1333 body
1334 {
1335 auto aptr = a.ptr;
1336 auto aend = aptr + a.length;
1337 auto bptr = b.ptr;
1338 auto cptr = c.ptr;
1339
1340 version (D_InlineAsm_X86)
1341 {
1342 // SSE2 aligned version is 5756% faster
1343 if (sse2() && a.length >= 64)
1344 {
1345 auto n = aptr + (a.length & ~63);
1346
1347 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1348 {
1349 asm // unaligned case
1350 {
1351 mov ESI, aptr;
1352 mov EDI, n;
1353 mov EAX, bptr;
1354 mov ECX, cptr;
1355
1356 align 8;
1357 startsublsse2u:
1358 add ESI, 64;
1359 movdqu XMM0, [EAX];
1360 movdqu XMM1, [EAX+16];
1361 movdqu XMM2, [EAX+32];
1362 movdqu XMM3, [EAX+48];
1363 add EAX, 64;
1364 movdqu XMM4, [ECX];
1365 movdqu XMM5, [ECX+16];
1366 movdqu XMM6, [ECX+32];
1367 movdqu XMM7, [ECX+48];
1368 add ECX, 64;
1369 psubb XMM0, XMM4;
1370 psubb XMM1, XMM5;
1371 psubb XMM2, XMM6;
1372 psubb XMM3, XMM7;
1373 movdqu [ESI -64], XMM0;
1374 movdqu [ESI+16-64], XMM1;
1375 movdqu [ESI+32-64], XMM2;
1376 movdqu [ESI+48-64], XMM3;
1377 cmp ESI, EDI;
1378 jb startsublsse2u;
1379
1380 mov aptr, ESI;
1381 mov bptr, EAX;
1382 mov cptr, ECX;
1383 }
1384 }
1385 else
1386 {
1387 asm // aligned case
1388 {
1389 mov ESI, aptr;
1390 mov EDI, n;
1391 mov EAX, bptr;
1392 mov ECX, cptr;
1393
1394 align 8;
1395 startsublsse2a:
1396 add ESI, 64;
1397 movdqa XMM0, [EAX];
1398 movdqa XMM1, [EAX+16];
1399 movdqa XMM2, [EAX+32];
1400 movdqa XMM3, [EAX+48];
1401 add EAX, 64;
1402 movdqa XMM4, [ECX];
1403 movdqa XMM5, [ECX+16];
1404 movdqa XMM6, [ECX+32];
1405 movdqa XMM7, [ECX+48];
1406 add ECX, 64;
1407 psubb XMM0, XMM4;
1408 psubb XMM1, XMM5;
1409 psubb XMM2, XMM6;
1410 psubb XMM3, XMM7;
1411 movdqa [ESI -64], XMM0;
1412 movdqa [ESI+16-64], XMM1;
1413 movdqa [ESI+32-64], XMM2;
1414 movdqa [ESI+48-64], XMM3;
1415 cmp ESI, EDI;
1416 jb startsublsse2a;
1417
1418 mov aptr, ESI;
1419 mov bptr, EAX;
1420 mov cptr, ECX;
1421 }
1422 }
1423 }
1424 else
1425 // MMX version is 4428% faster
1426 if (mmx() && a.length >= 32)
1427 {
1428 auto n = aptr + (a.length & ~31);
1429
1430 asm
1431 {
1432 mov ESI, aptr;
1433 mov EDI, n;
1434 mov EAX, bptr;
1435 mov ECX, cptr;
1436
1437 align 8;
1438 startsublmmx:
1439 add ESI, 32;
1440 movq MM0, [EAX];
1441 movq MM1, [EAX+8];
1442 movq MM2, [EAX+16];
1443 movq MM3, [EAX+24];
1444 add EAX, 32;
1445 movq MM4, [ECX];
1446 movq MM5, [ECX+8];
1447 movq MM6, [ECX+16];
1448 movq MM7, [ECX+24];
1449 add ECX, 32;
1450 psubb MM0, MM4;
1451 psubb MM1, MM5;
1452 psubb MM2, MM6;
1453 psubb MM3, MM7;
1454 movq [ESI -32], MM0;
1455 movq [ESI+8 -32], MM1;
1456 movq [ESI+16-32], MM2;
1457 movq [ESI+24-32], MM3;
1458 cmp ESI, EDI;
1459 jb startsublmmx;
1460
1461 emms;
1462 mov aptr, ESI;
1463 mov bptr, EAX;
1464 mov cptr, ECX;
1465 }
1466 }
1467 }
1468
1469 while (aptr < aend)
1470 *aptr++ = cast(T)(*bptr++ - *cptr++);
1471
1472 return a;
1473 }
1474
1475 unittest
1476 {
1477 printf("_arraySliceSliceMinSliceAssign_g unittest\n");
1478
1479 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1480 {
1481 version (log) printf(" cpuid %d\n", cpuid);
1482
1483 for (int j = 0; j < 2; j++)
1484 {
1485 const int dim = 67;
1486 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1487 a = a[j .. dim + j]; // misalign for second iteration
1488 T[] b = new T[dim + j];
1489 b = b[j .. dim + j];
1490 T[] c = new T[dim + j];
1491 c = c[j .. dim + j];
1492
1493 for (int i = 0; i < dim; i++)
1494 { a[i] = cast(T)i;
1495 b[i] = cast(T)(i + 7);
1496 c[i] = cast(T)(i * 2);
1497 }
1498
1499 c[] = a[] - b[];
1500
1501 for (int i = 0; i < dim; i++)
1502 {
1503 if (c[i] != cast(T)(a[i] - b[i]))
1504 {
1505 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1506 assert(0);
1507 }
1508 }
1509 }
1510 }
1511 }
1512
1513
1514 /* ======================================================================== */
1515
1516 /***********************
1517 * Computes:
1518 * a[] -= value
1519 */
1520
1521 T[] _arrayExpSliceMinass_a(T[] a, T value)
1522 {
1523 return _arrayExpSliceMinass_g(a, value);
1524 }
1525
1526 T[] _arrayExpSliceMinass_h(T[] a, T value)
1527 {
1528 return _arrayExpSliceMinass_g(a, value);
1529 }
1530
1531 T[] _arrayExpSliceMinass_g(T[] a, T value)
1532 {
1533 //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1534 auto aptr = a.ptr;
1535 auto aend = aptr + a.length;
1536
1537 version (D_InlineAsm_X86)
1538 {
1539 // SSE2 aligned version is 1577% faster
1540 if (sse2() && a.length >= 64)
1541 {
1542 auto n = aptr + (a.length & ~63);
1543
1544 uint l = cast(ubyte) value;
1545 l |= (l << 8);
1546 l |= (l << 16);
1547
1548 if (((cast(uint) aptr) & 15) != 0)
1549 {
1550 asm // unaligned case
1551 {
1552 mov ESI, aptr;
1553 mov EDI, n;
1554 movd XMM4, l;
1555 pshufd XMM4, XMM4, 0;
1556
1557 align 8;
1558 startsubasssse2u:
1559 movdqu XMM0, [ESI];
1560 movdqu XMM1, [ESI+16];
1561 movdqu XMM2, [ESI+32];
1562 movdqu XMM3, [ESI+48];
1563 add ESI, 64;
1564 psubb XMM0, XMM4;
1565 psubb XMM1, XMM4;
1566 psubb XMM2, XMM4;
1567 psubb XMM3, XMM4;
1568 movdqu [ESI -64], XMM0;
1569 movdqu [ESI+16-64], XMM1;
1570 movdqu [ESI+32-64], XMM2;
1571 movdqu [ESI+48-64], XMM3;
1572 cmp ESI, EDI;
1573 jb startsubasssse2u;
1574
1575 mov aptr, ESI;
1576 }
1577 }
1578 else
1579 {
1580 asm // aligned case
1581 {
1582 mov ESI, aptr;
1583 mov EDI, n;
1584 movd XMM4, l;
1585 pshufd XMM4, XMM4, 0;
1586
1587 align 8;
1588 startsubasssse2a:
1589 movdqa XMM0, [ESI];
1590 movdqa XMM1, [ESI+16];
1591 movdqa XMM2, [ESI+32];
1592 movdqa XMM3, [ESI+48];
1593 add ESI, 64;
1594 psubb XMM0, XMM4;
1595 psubb XMM1, XMM4;
1596 psubb XMM2, XMM4;
1597 psubb XMM3, XMM4;
1598 movdqa [ESI -64], XMM0;
1599 movdqa [ESI+16-64], XMM1;
1600 movdqa [ESI+32-64], XMM2;
1601 movdqa [ESI+48-64], XMM3;
1602 cmp ESI, EDI;
1603 jb startsubasssse2a;
1604
1605 mov aptr, ESI;
1606 }
1607 }
1608 }
1609 else
1610 // MMX version is 1577% faster
1611 if (mmx() && a.length >= 32)
1612 {
1613
1614 auto n = aptr + (a.length & ~31);
1615
1616 uint l = cast(ubyte) value;
1617 l |= (l << 8);
1618
1619 asm
1620 {
1621 mov ESI, aptr;
1622 mov EDI, n;
1623 movd MM4, l;
1624 pshufw MM4, MM4, 0;
1625
1626 align 8;
1627 startsubassmmx:
1628 movq MM0, [ESI];
1629 movq MM1, [ESI+8];
1630 movq MM2, [ESI+16];
1631 movq MM3, [ESI+24];
1632 add ESI, 32;
1633 psubb MM0, MM4;
1634 psubb MM1, MM4;
1635 psubb MM2, MM4;
1636 psubb MM3, MM4;
1637 movq [ESI -32], MM0;
1638 movq [ESI+8 -32], MM1;
1639 movq [ESI+16-32], MM2;
1640 movq [ESI+24-32], MM3;
1641 cmp ESI, EDI;
1642 jb startsubassmmx;
1643
1644 emms;
1645 mov aptr, ESI;
1646 }
1647 }
1648 }
1649
1650 while (aptr < aend)
1651 *aptr++ -= value;
1652
1653 return a;
1654 }
1655
1656 unittest
1657 {
1658 printf("_arrayExpSliceMinass_g unittest\n");
1659
1660 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1661 {
1662 version (log) printf(" cpuid %d\n", cpuid);
1663
1664 for (int j = 0; j < 2; j++)
1665 {
1666 const int dim = 67;
1667 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1668 a = a[j .. dim + j]; // misalign for second iteration
1669 T[] b = new T[dim + j];
1670 b = b[j .. dim + j];
1671 T[] c = new T[dim + j];
1672 c = c[j .. dim + j];
1673
1674 for (int i = 0; i < dim; i++)
1675 { a[i] = cast(T)i;
1676 b[i] = cast(T)(i + 7);
1677 c[i] = cast(T)(i * 2);
1678 }
1679
1680 a[] = c[];
1681 c[] -= 6;
1682
1683 for (int i = 0; i < dim; i++)
1684 {
1685 if (c[i] != cast(T)(a[i] - 6))
1686 {
1687 printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
1688 assert(0);
1689 }
1690 }
1691 }
1692 }
1693 }
1694
1695
1696 /* ======================================================================== */
1697
1698 /***********************
1699 * Computes:
1700 * a[] -= b[]
1701 */
1702
1703 T[] _arraySliceSliceMinass_a(T[] a, T[] b)
1704 {
1705 return _arraySliceSliceMinass_g(a, b);
1706 }
1707
1708 T[] _arraySliceSliceMinass_h(T[] a, T[] b)
1709 {
1710 return _arraySliceSliceMinass_g(a, b);
1711 }
1712
1713 T[] _arraySliceSliceMinass_g(T[] a, T[] b)
1714 in
1715 {
1716 assert (a.length == b.length);
1717 assert (disjoint(a, b));
1718 }
1719 body
1720 {
1721 //printf("_arraySliceSliceMinass_g()\n");
1722 auto aptr = a.ptr;
1723 auto aend = aptr + a.length;
1724 auto bptr = b.ptr;
1725
1726 version (D_InlineAsm_X86)
1727 {
1728 // SSE2 aligned version is 4800% faster
1729 if (sse2() && a.length >= 64)
1730 {
1731 auto n = aptr + (a.length & ~63);
1732
1733 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1734 {
1735 asm // unaligned case
1736 {
1737 mov ESI, aptr;
1738 mov EDI, n;
1739 mov ECX, bptr;
1740
1741 align 8;
1742 startsubasslsse2u:
1743 movdqu XMM0, [ESI];
1744 movdqu XMM1, [ESI+16];
1745 movdqu XMM2, [ESI+32];
1746 movdqu XMM3, [ESI+48];
1747 add ESI, 64;
1748 movdqu XMM4, [ECX];
1749 movdqu XMM5, [ECX+16];
1750 movdqu XMM6, [ECX+32];
1751 movdqu XMM7, [ECX+48];
1752 add ECX, 64;
1753 psubb XMM0, XMM4;
1754 psubb XMM1, XMM5;
1755 psubb XMM2, XMM6;
1756 psubb XMM3, XMM7;
1757 movdqu [ESI -64], XMM0;
1758 movdqu [ESI+16-64], XMM1;
1759 movdqu [ESI+32-64], XMM2;
1760 movdqu [ESI+48-64], XMM3;
1761 cmp ESI, EDI;
1762 jb startsubasslsse2u;
1763
1764 mov aptr, ESI;
1765 mov bptr, ECX;
1766 }
1767 }
1768 else
1769 {
1770 asm // aligned case
1771 {
1772 mov ESI, aptr;
1773 mov EDI, n;
1774 mov ECX, bptr;
1775
1776 align 8;
1777 startsubasslsse2a:
1778 movdqa XMM0, [ESI];
1779 movdqa XMM1, [ESI+16];
1780 movdqa XMM2, [ESI+32];
1781 movdqa XMM3, [ESI+48];
1782 add ESI, 64;
1783 movdqa XMM4, [ECX];
1784 movdqa XMM5, [ECX+16];
1785 movdqa XMM6, [ECX+32];
1786 movdqa XMM7, [ECX+48];
1787 add ECX, 64;
1788 psubb XMM0, XMM4;
1789 psubb XMM1, XMM5;
1790 psubb XMM2, XMM6;
1791 psubb XMM3, XMM7;
1792 movdqa [ESI -64], XMM0;
1793 movdqa [ESI+16-64], XMM1;
1794 movdqa [ESI+32-64], XMM2;
1795 movdqa [ESI+48-64], XMM3;
1796 cmp ESI, EDI;
1797 jb startsubasslsse2a;
1798
1799 mov aptr, ESI;
1800 mov bptr, ECX;
1801 }
1802 }
1803 }
1804 else
1805 // MMX version is 3107% faster
1806 if (mmx() && a.length >= 32)
1807 {
1808
1809 auto n = aptr + (a.length & ~31);
1810
1811 asm
1812 {
1813 mov ESI, aptr;
1814 mov EDI, n;
1815 mov ECX, bptr;
1816
1817 align 8;
1818 startsubasslmmx:
1819 movq MM0, [ESI];
1820 movq MM1, [ESI+8];
1821 movq MM2, [ESI+16];
1822 movq MM3, [ESI+24];
1823 add ESI, 32;
1824 movq MM4, [ECX];
1825 movq MM5, [ECX+8];
1826 movq MM6, [ECX+16];
1827 movq MM7, [ECX+24];
1828 add ECX, 32;
1829 psubb MM0, MM4;
1830 psubb MM1, MM5;
1831 psubb MM2, MM6;
1832 psubb MM3, MM7;
1833 movq [ESI -32], MM0;
1834 movq [ESI+8 -32], MM1;
1835 movq [ESI+16-32], MM2;
1836 movq [ESI+24-32], MM3;
1837 cmp ESI, EDI;
1838 jb startsubasslmmx;
1839
1840 emms;
1841 mov aptr, ESI;
1842 mov bptr, ECX;
1843 }
1844 }
1845 }
1846
1847 while (aptr < aend)
1848 *aptr++ -= *bptr++;
1849
1850 return a;
1851 }
1852
1853 unittest
1854 {
1855 printf("_arraySliceSliceMinass_g unittest\n");
1856
1857 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1858 {
1859 version (log) printf(" cpuid %d\n", cpuid);
1860
1861 for (int j = 0; j < 2; j++)
1862 {
1863 const int dim = 67;
1864 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1865 a = a[j .. dim + j]; // misalign for second iteration
1866 T[] b = new T[dim + j];
1867 b = b[j .. dim + j];
1868 T[] c = new T[dim + j];
1869 c = c[j .. dim + j];
1870
1871 for (int i = 0; i < dim; i++)
1872 { a[i] = cast(T)i;
1873 b[i] = cast(T)(i + 7);
1874 c[i] = cast(T)(i * 2);
1875 }
1876
1877 a[] = c[];
1878 c[] -= b[];
1879
1880 for (int i = 0; i < dim; i++)
1881 {
1882 if (c[i] != cast(T)(a[i] - b[i]))
1883 {
1884 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1885 assert(0);
1886 }
1887 }
1888 }
1889 }
1890 }