druntime/src/compiler/dmd/arrayint.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.

author  Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date    Tue, 11 Nov 2008 01:52:37 +0100
/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for integer array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for dchar, int,
 * and uint ('w', 'i' and 'k' suffixes).
 */

module rt.arrayint;

private import util.cpuid;

version (Unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}
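
/* Note on the harness above (editorial summary): with version(Unittest)
 * each predicate is true for exactly one value of `cpuid` (and only when
 * the host CPU really has the feature), so the unit tests' loops over
 * 0 .. CPUID_MAX exercise one code path per iteration: scalar (0),
 * MMX (1), SSE (2) and SSE2 (3). As written, slot 4 (amd3dnow) is never
 * reached, since the loops stop at CPUID_MAX - 1.
 */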

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
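
/* Illustrative check of disjoint() (added for clarity, not part of the
 * upstream DMD sources): it is true only when the slices share no
 * elements, which is the property the in-contracts below rely on.
 */
unittest
{
    int[8] buf;
    assert(disjoint(buf[0 .. 4], buf[4 .. 8]));  // adjacent, no overlap
    assert(!disjoint(buf[0 .. 5], buf[3 .. 8])); // elements 3 and 4 are shared
}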

alias int T;

extern (C):

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_i(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_i(a, value, b);
}
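
/* How these entry points are reached (illustrative sketch; the hook
 * selection is done by the compiler, not by user code): for int[] a, b
 * the vector expression
 *
 *      a[] = b[] + 5;
 *
 * is lowered to a runtime call equivalent to
 *
 *      _arraySliceExpAddSliceAssign_i(a, 5, b);
 *
 * The '_w' and '_k' wrappers cover dchar[] and uint[], which share the
 * same wraparound 32-bit arithmetic.
 */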

T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 380% faster
        if (sse2() && a.length >= 8)
        {
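            // n marks the end of the largest prefix that is a multiple of
            // 8 ints (each loop iteration below stores 32 bytes); the
            // remainder is finished by the scalar loop at the bottom.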
            auto n = aptr + (a.length & ~7);

            uint l = value;

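            // movd/pshufd below splat the 32-bit value into all four lanes
            // of XMM2. OR-ing the two pointers and testing the low four
            // bits selects unaligned (movdqu) or aligned (movdqa) moves;
            // movdqa faults on a misaligned operand.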
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 298% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

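            // Replicate value into both 32-bit halves of a 64-bit operand,
            // so a single paddd adds it to two ints at a time.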
            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM2, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                paddd MM0, MM2;
                paddd MM1, MM2;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov EDX, value;

                align 4;
            start386:
                add ESI, 8;
                mov EBX, [EAX];
                mov ECX, [EAX+4];
                add EAX, 8;
                add EBX, EDX;
                add ECX, EDX;
                mov [ESI  -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];
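
            // j == 0 leaves the arrays 16-byte aligned; j == 1 slices one
            // element off the front so the unaligned SIMD paths run too.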

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1710% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 995% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                paddd MM0, MM2;
                paddd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

normal:
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_w(T[] a, T value)
{
    return _arrayExpSliceAddass_i(a, value);
}

T[] _arrayExpSliceAddass_k(T[] a, T value)
{
    return _arrayExpSliceAddass_i(a, value);
}

T[] _arrayExpSliceAddass_i(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 83% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 81% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movq MM2, l;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                paddd MM0, MM2;
                paddd MM1, MM2;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EDX, value;

                align 4;
            start386:
                mov EBX, [ESI];
                mov ECX, [ESI+4];
                add ESI, 8;
                add EBX, EDX;
                add ECX, EDX;
                mov [ESI  -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_w(T[] a, T[] b)
{
    return _arraySliceSliceAddass_i(a, b);
}

T[] _arraySliceSliceAddass_k(T[] a, T[] b)
{
    return _arraySliceSliceAddass_i(a, b);
}

T[] _arraySliceSliceAddass_i(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 695% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 471% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                paddd MM0, MM2;
                paddd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

normal:
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 400% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 315% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM2, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                psubd MM0, MM2;
                psubd MM1, MM2;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov EDX, value;

                align 4;
            start386:
                add ESI, 8;
                mov EBX, [EAX];
                mov ECX, [EAX+4];
                add EAX, 8;
                sub EBX, EDX;
                sub ECX, EDX;
                mov [ESI  -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_i(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_i(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1812% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

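            // psubd computes dst -= src, so the broadcast value is kept in
            // XMM4 and copied into XMM0/XMM1 each iteration, giving the
            // operand order value - b[] rather than b[] - value.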
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM2, [EAX];
                    movdqu XMM3, [EAX+16];
                    movdqa XMM0, XMM4;
                    movdqa XMM1, XMM4;
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM2, [EAX];
                    movdqa XMM3, [EAX+16];
                    movdqa XMM0, XMM4;
                    movdqa XMM1, XMM4;
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1077% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM2, [EAX];
                movq MM3, [EAX+8];
                movq MM0, MM4;
                movq MM1, MM4;
                add EAX, 16;
                psubd MM0, MM2;
                psubd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1172 | |
1173 | |
1174 /* ======================================================================== */ | |
1175 | |
1176 /*********************** | |
1177 * Computes: | |
1178 * a[] = b[] - c[] | |
1179 */ | |
1180 | |
1181 T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b) | |
1182 { | |
1183 return _arraySliceSliceMinSliceAssign_i(a, c, b); | |
1184 } | |
1185 | |
1186 T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b) | |
1187 { | |
1188 return _arraySliceSliceMinSliceAssign_i(a, c, b); | |
1189 } | |
1190 | |
1191 T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b) | |
1192 in | |
1193 { | |
1194 assert(a.length == b.length && b.length == c.length); | |
1195 assert(disjoint(a, b)); | |
1196 assert(disjoint(a, c)); | |
1197 assert(disjoint(b, c)); | |
1198 } | |
1199 body | |
1200 { | |
1201 auto aptr = a.ptr; | |
1202 auto aend = aptr + a.length; | |
1203 auto bptr = b.ptr; | |
1204 auto cptr = c.ptr; | |
1205 | |
1206 version (D_InlineAsm_X86) | |
1207 { | |
1208 // SSE2 aligned version is 1721% faster | |
1209 if (sse2() && a.length >= 8) | |
1210 { | |
1211 auto n = aptr + (a.length & ~7); | |
1212 | |
1213 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) | |
1214 { | |
1215 asm // unaligned case | |
1216 { | |
1217 mov ESI, aptr; | |
1218 mov EDI, n; | |
1219 mov EAX, bptr; | |
1220 mov ECX, cptr; | |
1221 | |
1222 align 4; | |
1223 startsse2u: | |
1224 add ESI, 32; | |
1225 movdqu XMM0, [EAX]; | |
1226 movdqu XMM2, [ECX]; | |
1227 movdqu XMM1, [EAX+16]; | |
1228 movdqu XMM3, [ECX+16]; | |
1229 add EAX, 32; | |
1230 add ECX, 32; | |
1231 psubd XMM0, XMM2; | |
1232 psubd XMM1, XMM3; | |
1233 movdqu [ESI -32], XMM0; | |
1234 movdqu [ESI+16-32], XMM1; | |
1235 cmp ESI, EDI; | |
1236 jb startsse2u; | |
1237 | |
1238 mov aptr, ESI; | |
1239 mov bptr, EAX; | |
1240 mov cptr, ECX; | |
1241 } | |
1242 } | |
1243 else | |
1244 { | |
1245 asm // aligned case | |
1246 { | |
1247 mov ESI, aptr; | |
1248 mov EDI, n; | |
1249 mov EAX, bptr; | |
1250 mov ECX, cptr; | |
1251 | |
1252 align 4; | |
1253 startsse2a: | |
1254 add ESI, 32; | |
1255 movdqa XMM0, [EAX]; | |
1256 movdqa XMM2, [ECX]; | |
1257 movdqa XMM1, [EAX+16]; | |
1258 movdqa XMM3, [ECX+16]; | |
1259 add EAX, 32; | |
1260 add ECX, 32; | |
1261 psubd XMM0, XMM2; | |
1262 psubd XMM1, XMM3; | |
1263 movdqa [ESI -32], XMM0; | |
1264 movdqa [ESI+16-32], XMM1; | |
1265 cmp ESI, EDI; | |
1266 jb startsse2a; | |
1267 | |
1268 mov aptr, ESI; | |
1269 mov bptr, EAX; | |
1270 mov cptr, ECX; | |
1271 } | |
1272 } | |
1273 } | |
1274 else | |
1275 // MMX version is 1002% faster | |
1276 if (mmx() && a.length >= 4) | |
1277 { | |
1278 auto n = aptr + (a.length & ~3); | |
1279 | |
1280 asm | |
1281 { | |
1282 mov ESI, aptr; | |
1283 mov EDI, n; | |
1284 mov EAX, bptr; | |
1285 mov ECX, cptr; | |
1286 | |
1287 align 4; | |
1288 startmmx: | |
1289 add ESI, 16; | |
1290 movq MM0, [EAX]; | |
1291 movq MM2, [ECX]; | |
1292 movq MM1, [EAX+8]; | |
1293 movq MM3, [ECX+8]; | |
1294 add EAX, 16; | |
1295 add ECX, 16; | |
1296 psubd MM0, MM2; | |
1297 psubd MM1, MM3; | |
1298 movq [ESI -16], MM0; | |
1299 movq [ESI+8-16], MM1; | |
1300 cmp ESI, EDI; | |
1301 jb startmmx; | |
1302 | |
1303 emms; | |
1304 mov aptr, ESI; | |
1305 mov bptr, EAX; | |
1306 mov cptr, ECX; | |
1307 } | |
1308 } | |
1309 } | |
1310 | |
1311 while (aptr < aend) | |
1312 *aptr++ = *bptr++ - *cptr++; | |
1313 | |
1314 return a; | |
1315 } | |
1316 | |
1317 unittest | |
1318 { | |
1319 printf("_arraySliceSliceMinSliceAssign_i unittest\n"); | |
1320 | |
1321 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) | |
1322 { | |
1323 version (log) printf(" cpuid %d\n", cpuid); | |
1324 | |
1325 for (int j = 0; j < 2; j++) | |
1326 { | |
1327 const int dim = 67; | |
1328 T[] a = new T[dim + j]; // aligned on 16 byte boundary | |
1329 a = a[j .. dim + j]; // misalign for second iteration | |
1330 T[] b = new T[dim + j]; | |
1331 b = b[j .. dim + j]; | |
1332 T[] c = new T[dim + j]; | |
1333 c = c[j .. dim + j]; | |
1334 | |
1335 for (int i = 0; i < dim; i++) | |
1336 { a[i] = cast(T)i; | |
1337 b[i] = cast(T)(i + 7); | |
1338 c[i] = cast(T)(i * 2); | |
1339 } | |
1340 | |
1341 c[] = a[] - b[]; | |
1342 | |
1343 for (int i = 0; i < dim; i++) | |
1344 { | |
1345 if (c[i] != cast(T)(a[i] - b[i])) | |
1346 { | |
1347 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); | |
1348 assert(0); | |
1349 } | |
1350 } | |
1351 } | |
1352 } | |
1353 } | |
1354 | |
1355 | |
1356 /* ======================================================================== */ | |
1357 | |
1358 /*********************** | |
1359 * Computes: | |
1360 * a[] -= value | |
1361 */ | |
1362 | |
1363 T[] _arrayExpSliceMinass_w(T[] a, T value) | |
1364 { | |
1365 return _arrayExpSliceMinass_i(a, value); | |
1366 } | |
1367 | |
1368 T[] _arrayExpSliceMinass_k(T[] a, T value) | |
1369 { | |
1370 return _arrayExpSliceMinass_i(a, value); | |
1371 } | |
1372 | |
1373 T[] _arrayExpSliceMinass_i(T[] a, T value) | |
1374 { | |
1375 //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); | |
1376 auto aptr = a.ptr; | |
1377 auto aend = aptr + a.length; | |
1378 | |
1379 version (D_InlineAsm_X86) | |
1380 { | |
1381 // SSE2 aligned version is 81% faster | |
1382 if (sse2() && a.length >= 8) | |
1383 { | |
1384 auto n = aptr + (a.length & ~7); | |
1385 | |
1386 uint l = value; | |
1387 | |
1388 if (((cast(uint) aptr) & 15) != 0) | |
1389 { | |
1390 asm // unaligned case | |
1391 { | |
1392 mov ESI, aptr; | |
1393 mov EDI, n; | |
1394 movd XMM2, l; | |
1395 pshufd XMM2, XMM2, 0; | |
1396 | |
1397 align 4; | |
1398 startaddsse2u: | |
1399 movdqu XMM0, [ESI]; | |
1400 movdqu XMM1, [ESI+16]; | |
1401 add ESI, 32; | |
1402 psubd XMM0, XMM2; | |
1403 psubd XMM1, XMM2; | |
1404 movdqu [ESI -32], XMM0; | |
1405 movdqu [ESI+16-32], XMM1; | |
1406 cmp ESI, EDI; | |
1407 jb startaddsse2u; | |
1408 | |
1409 mov aptr, ESI; | |
1410 } | |
1411 } | |
1412 else | |
1413 { | |
1414 asm // aligned case | |
1415 { | |
1416 mov ESI, aptr; | |
1417 mov EDI, n; | |
1418 movd XMM2, l; | |
1419 pshufd XMM2, XMM2, 0; | |
1420 | |
1421 align 4; | |
1422 startaddsse2a: | |
1423 movdqa XMM0, [ESI]; | |
1424 movdqa XMM1, [ESI+16]; | |
1425 add ESI, 32; | |
1426 psubd XMM0, XMM2; | |
1427 psubd XMM1, XMM2; | |
1428 movdqa [ESI -32], XMM0; | |
1429 movdqa [ESI+16-32], XMM1; | |
1430 cmp ESI, EDI; | |
1431 jb startaddsse2a; | |
1432 | |
1433 mov aptr, ESI; | |
1434 } | |
1435 } | |
1436 } | |
1437 else | |
1438 // MMX version is 81% faster | |
1439 if (mmx() && a.length >= 4) | |
1440 { | |
1441 auto n = aptr + (a.length & ~3); | |
1442 | |
1443 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); | |
1444 | |
1445 asm | |
1446 { | |
1447 mov ESI, aptr; | |
1448 mov EDI, n; | |
1449 movq MM2, l; | |
1450 | |
1451 align 4; | |
1452 startmmx: | |
1453 movq MM0, [ESI]; | |
1454 movq MM1, [ESI+8]; | |
1455 add ESI, 16; | |
1456 psubd MM0, MM2; | |
1457 psubd MM1, MM2; | |
1458 movq [ESI -16], MM0; | |
1459 movq [ESI+8-16], MM1; | |
1460 cmp ESI, EDI; | |
1461 jb startmmx; | |
1462 | |
1463 emms; | |
1464 mov aptr, ESI; | |
1465 } | |
1466 } | |
1467 else | |
1468 if (a.length >= 2) | |
1469 { | |
1470 auto n = aptr + (a.length & ~1); | |
1471 | |
1472 asm | |
1473 { | |
1474 mov ESI, aptr; | |
1475 mov EDI, n; | |
1476 mov EDX, value; | |
1477 | |
1478 align 4; | |
1479 start386: | |
1480 mov EBX, [ESI]; | |
1481 mov ECX, [ESI+4]; | |
1482 add ESI, 8; | |
1483 sub EBX, EDX; | |
1484 sub ECX, EDX; | |
1485 mov [ESI -8], EBX; | |
1486 mov [ESI+4-8], ECX; | |
1487 cmp ESI, EDI; | |
1488 jb start386; | |
1489 | |
1490 mov aptr, ESI; | |
1491 } | |
1492 } | |
1493 } | |
1494 | |
1495 while (aptr < aend) | |
1496 *aptr++ -= value; | |
1497 | |
1498 return a; | |
1499 } | |
1500 | |
1501 unittest | |
1502 { | |
1503 printf("_arrayExpSliceMinass_i unittest\n"); | |
1504 | |
1505 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) | |
1506 { | |
1507 version (log) printf(" cpuid %d\n", cpuid); | |
1508 | |
1509 for (int j = 0; j < 2; j++) | |
1510 { | |
1511 const int dim = 67; | |
1512 T[] a = new T[dim + j]; // aligned on 16 byte boundary | |
1513 a = a[j .. dim + j]; // misalign for second iteration | |
1514 T[] b = new T[dim + j]; | |
1515 b = b[j .. dim + j]; | |
1516 T[] c = new T[dim + j]; | |
1517 c = c[j .. dim + j]; | |
1518 | |
1519 for (int i = 0; i < dim; i++) | |
1520 { a[i] = cast(T)i; | |
1521 b[i] = cast(T)(i + 7); | |
1522 c[i] = cast(T)(i * 2); | |
1523 } | |
1524 | |
1525 a[] = c[]; | |
1526 a[] -= 6; | |
1527 | |
1528 for (int i = 0; i < dim; i++) | |
1529 { | |
1530 if (a[i] != cast(T)(c[i] - 6)) | |
1531 { | |
1532 printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); | |
1533 assert(0); | |
1534 } | |
1535 } | |
1536 } | |
1537 } | |
1538 } | |
1539 | |
1540 | |
1541 /* ======================================================================== */ | |
1542 | |
1543 /*********************** | |
1544 * Computes: | |
1545 * a[] -= b[] | |
1546 */ | |
1547 | |
1548 T[] _arraySliceSliceMinass_w(T[] a, T[] b) | |
1549 { | |
1550 return _arraySliceSliceMinass_i(a, b); | |
1551 } | |
1552 | |
1553 T[] _arraySliceSliceMinass_k(T[] a, T[] b) | |
1554 { | |
1555 return _arraySliceSliceMinass_i(a, b); | |
1556 } | |
1557 | |
1558 T[] _arraySliceSliceMinass_i(T[] a, T[] b) | |
1559 in | |
1560 { | |
1561 assert (a.length == b.length); | |
1562 assert (disjoint(a, b)); | |
1563 } | |
1564 body | |
1565 { | |
1566 //printf("_arraySliceSliceMinass_i()\n"); | |
1567 auto aptr = a.ptr; | |
1568 auto aend = aptr + a.length; | |
1569 auto bptr = b.ptr; | |
1570 | |
1571 version (D_InlineAsm_X86) | |
1572 { | |
1573 // SSE2 aligned version is 731% faster | |
1574 if (sse2() && a.length >= 8) | |
1575 { | |
1576 auto n = aptr + (a.length & ~7); | |
1577 | |
1578 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) | |
1579 { | |
1580 asm // unaligned case | |
1581 { | |
1582 mov ESI, aptr; | |
1583 mov EDI, n; | |
1584 mov ECX, bptr; | |
1585 | |
1586 align 4; | |
1587 startsse2u: | |
1588 movdqu XMM0, [ESI]; | |
1589 movdqu XMM2, [ECX]; | |
1590 movdqu XMM1, [ESI+16]; | |
1591 movdqu XMM3, [ECX+16]; | |
1592 add ESI, 32; | |
1593 add ECX, 32; | |
1594 psubd XMM0, XMM2; | |
1595 psubd XMM1, XMM3; | |
1596 movdqu [ESI -32], XMM0; | |
1597 movdqu [ESI+16-32], XMM1; | |
1598 cmp ESI, EDI; | |
1599 jb startsse2u; | |
1600 | |
1601 mov aptr, ESI; | |
1602 mov bptr, ECX; | |
1603 } | |
1604 } | |
1605 else | |
1606 { | |
1607 asm // aligned case | |
1608 { | |
1609 mov ESI, aptr; | |
1610 mov EDI, n; | |
1611 mov ECX, bptr; | |
1612 | |
1613 align 4; | |
1614 startsse2a: | |
1615 movdqa XMM0, [ESI]; | |
1616 movdqa XMM2, [ECX]; | |
1617 movdqa XMM1, [ESI+16]; | |
1618 movdqa XMM3, [ECX+16]; | |
1619 add ESI, 32; | |
1620 add ECX, 32; | |
1621 psubd XMM0, XMM2; | |
1622 psubd XMM1, XMM3; | |
1623 movdqa [ESI -32], XMM0; | |
1624 movdqa [ESI+16-32], XMM1; | |
1625 cmp ESI, EDI; | |
1626 jb startsse2a; | |
1627 | |
1628 mov aptr, ESI; | |
1629 mov bptr, ECX; | |
1630 } | |
1631 } | |
1632 } | |
1633 else | |
1634 // MMX version is 441% faster | |
1635 if (mmx() && a.length >= 4) | |
1636 { | |
1637 auto n = aptr + (a.length & ~3); | |
1638 | |
1639 asm | |
1640 { | |
1641 mov ESI, aptr; | |
1642 mov EDI, n; | |
1643 mov ECX, bptr; | |
1644 | |
1645 align 4; | |
1646 startmmx: | |
1647 movq MM0, [ESI]; | |
1648 movq MM2, [ECX]; | |
1649 movq MM1, [ESI+8]; | |
1650 movq MM3, [ECX+8]; | |
1651 add ESI, 16; | |
1652 add ECX, 16; | |
1653 psubd MM0, MM2; | |
1654 psubd MM1, MM3; | |
1655 movq [ESI -16], MM0; | |
1656 movq [ESI+8-16], MM1; | |
1657 cmp ESI, EDI; | |
1658 jb startmmx; | |
1659 | |
1660 emms; | |
1661 mov aptr, ESI; | |
1662 mov bptr, ECX; | |
1663 } | |
1664 } | |
1665 } | |
1666 | |
1667 while (aptr < aend) | |
1668 *aptr++ -= *bptr++; | |
1669 | |
1670 return a; | |
1671 } | |
1672 | |
1673 unittest | |
1674 { | |
1675 printf("_arraySliceSliceMinass_i unittest\n"); | |
1676 | |
1677 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) | |
1678 { | |
1679 version (log) printf(" cpuid %d\n", cpuid); | |
1680 | |
1681 for (int j = 0; j < 2; j++) | |
1682 { | |
1683 const int dim = 67; | |
1684 T[] a = new T[dim + j]; // aligned on 16 byte boundary | |
1685 a = a[j .. dim + j]; // misalign for second iteration | |
1686 T[] b = new T[dim + j]; | |
1687 b = b[j .. dim + j]; | |
1688 T[] c = new T[dim + j]; | |
1689 c = c[j .. dim + j]; | |
1690 | |
1691 for (int i = 0; i < dim; i++) | |
1692 { a[i] = cast(T)i; | |
1693 b[i] = cast(T)(i + 7); | |
1694 c[i] = cast(T)(i * 2); | |
1695 } | |
1696 | |
1697 b[] = c[]; | |
1698 c[] -= a[]; | |
1699 | |
1700 for (int i = 0; i < dim; i++) | |
1701 { | |
1702 if (c[i] != cast(T)(b[i] - a[i])) | |
1703 { | |
1704 printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); | |
1705 assert(0); | |
1706 } | |
1707 } | |
1708 } | |
1709 } | |
1710 } | |


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

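    /* Why the SIMD path below is compiled out with version (none):
     * pmuludq is not a four-lane 32-bit multiply; it multiplies only
     * dword lanes 0 and 2 of its operands and widens the products to
     * 64 bits, so the stores would be wrong for a general int multiply.
     * A packed 32-bit multiply needs extra shuffling (or SSE4.1's
     * pmulld), so the scalar loop at the bottom does the work instead.
     */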
    version (none) // multiplying a pair is not supported by MMX
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 1380% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                uint l = value;

                if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2u:
                        add ESI, 32;
                        movdqu XMM0, [EAX];
                        movdqu XMM1, [EAX+16];
                        add EAX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2a:
                        add ESI, 32;
                        movdqa XMM0, [EAX];
                        movdqa XMM1, [EAX+16];
                        add EAX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
            }
            else
            {
                // MMX version is 1380% faster
                if (mmx() && a.length >= 4)
                {
                    auto n = aptr + (a.length & ~3);

                    ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movq MM2, l;

                        align 4;
                    startmmx:
                        add ESI, 16;
                        movq MM0, [EAX];
                        movq MM1, [EAX+8];
                        add EAX, 16;
                        pmuludq MM0, MM2; // only multiplies low 32 bits
                        pmuludq MM1, MM2;
                        movq [ESI  -16], MM0;
                        movq [ESI+8-16], MM1;
                        cmp ESI, EDI;
                        jb startmmx;

                        emms;
                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 1407% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        mov ECX, cptr;

                        align 4;
                    startsse2u:
                        add ESI, 32;
                        movdqu XMM0, [EAX];
                        movdqu XMM2, [ECX];
                        movdqu XMM1, [EAX+16];
                        movdqu XMM3, [ECX+16];
                        add EAX, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, EAX;
                        mov cptr, ECX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        mov ECX, cptr;

                        align 4;
                    startsse2a:
                        add ESI, 32;
                        movdqa XMM0, [EAX];
                        movdqa XMM2, [ECX];
                        movdqa XMM1, [EAX+16];
                        movdqa XMM3, [ECX+16];
                        add EAX, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, EAX;
                        mov cptr, ECX;
                    }
                }
            }
            else
            // MMX version is 1029% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startmmx:
                    add ESI, 16;
                    movq MM0, [EAX];
                    movq MM2, [ECX];
                    movq MM1, [EAX+8];
                    movq MM3, [ECX+8];
                    add EAX, 16;
                    add ECX, 16;
                    pmuludq MM0, MM2;
                    pmuludq MM1, MM3;
                    movq [ESI  -16], MM0;
                    movq [ESI+8-16], MM1;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_w(T[] a, T value)
{
    return _arrayExpSliceMulass_i(a, value);
}

T[] _arrayExpSliceMulass_k(T[] a, T value)
{
    return _arrayExpSliceMulass_i(a, value);
}

T[] _arrayExpSliceMulass_i(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 400% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                uint l = value;

                if (((cast(uint) aptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2u:
                        movdqu XMM0, [ESI];
                        movdqu XMM1, [ESI+16];
                        add ESI, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2a:
                        movdqa XMM0, [ESI];
                        movdqa XMM1, [ESI+16];
                        add ESI, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                    }
                }
            }
            else
            // MMX version is 402% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movq MM2, l;

                    align 4;
                startmmx:
                    movq MM0, [ESI];
                    movq MM1, [ESI+8];
                    add ESI, 16;
                    pmuludq MM0, MM2;
                    pmuludq MM1, MM2;
                    movq [ESI  -16], MM0;
                    movq [ESI+8-16], MM1;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_w(T[] a, T[] b)
{
    return _arraySliceSliceMulass_i(a, b);
}

T[] _arraySliceSliceMulass_k(T[] a, T[] b)
{
    return _arraySliceSliceMulass_i(a, b);
}

T[] _arraySliceSliceMulass_i(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 873% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov ECX, bptr;

                        align 4;
                    startsse2u:
                        movdqu XMM0, [ESI];
                        movdqu XMM2, [ECX];
                        movdqu XMM1, [ESI+16];
                        movdqu XMM3, [ECX+16];
                        add ESI, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, ECX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov ECX, bptr;

                        align 4;
                    startsse2a:
                        movdqa XMM0, [ESI];
                        movdqa XMM2, [ECX];
                        movdqa XMM1, [ESI+16];
                        movdqa XMM3, [ECX+16];
                        add ESI, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, ECX;
                    }
                }
            }
            /+ BUG: comment out this section until we figure out what is going
               wrong with the invalid pshufd instructions.

            else
            // MMX version is 573% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startmmx:
                    movq MM0, [ESI];
                    movq MM2, [ECX];
                    movq MM1, [ESI+8];
                    movq MM3, [ECX+8];
                    pxor MM4, MM4;
                    pxor MM5, MM5;
                    punpckldq MM4, MM0;
                    punpckldq MM5, MM2;
                    add ESI, 16;
                    add ECX, 16;
                    pmuludq MM4, MM5;
                    pshufd MM4, MM4, 8;         // ?
                    movq [ESI  -16], MM4;
                    pxor MM4, MM4;
                    pxor MM5, MM5;
                    punpckldq MM4, MM1;
                    punpckldq MM5, MM3;
                    pmuludq MM4, MM5;
                    pshufd MM4, MM4, 8;         // ?
                    movq [ESI+8-16], MM4;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            +/
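
            /* Note on the block commented out above: pshufd only accepts
             * XMM registers (it is an SSE2 instruction), so using it on
             * MM registers does not assemble, which matches the "invalid
             * pshufd instructions" remark. pshufw, or routing the data
             * through XMM registers, would be needed instead.
             */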
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= c[];

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}