Mercurial > projects > ldc
comparison druntime/src/compiler/dmd/arrayshort.d @ 759:d3eb054172f9
Added copy of druntime from DMD 2.020 modified for LDC.
author | Tomas Lindquist Olsen <tomas.l.olsen@gmail.com> |
---|---|
date | Tue, 11 Nov 2008 01:52:37 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
758:f04dde6e882c | 759:d3eb054172f9 |
---|---|
1 /*************************** | |
2 * D programming language http://www.digitalmars.com/d/ | |
3 * Runtime support for byte array operations. | |
4 * Based on code originally written by Burton Radons. | |
5 * Placed in public domain. | |
6 */ | |
7 | |
8 /* Contains SSE2 and MMX versions of certain operations for wchar, short, | |
9 * and ushort ('u', 's' and 't' suffixes). | |
10 */ | |
11 | |
12 module rt.arrayshort; | |
13 | |
14 private import util.cpuid; | |
15 | |
version (Unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    // Fixed: this line was a duplicate `alias util.cpuid.sse2 sse2;`,
    // leaving amd3dnow un-aliased in non-unittest builds even though the
    // version (Unittest) branch above defines it.
    alias util.cpuid.amd3dnow amd3dnow;
}
34 | |
35 //version = log; | |
36 | |
/// True when slices a and b share no memory: one must end at or before
/// the start of the other.
bool disjoint(T)(T[] a, T[] b)
{
    auto aEnd = a.ptr + a.length;
    auto bEnd = b.ptr + b.length;
    return (aEnd <= b.ptr) || (bEnd <= a.ptr);
}
41 | |
// Element type for this whole translation unit: all _s/_t/_u entry points
// below operate on 16-bit elements (short/ushort/wchar share the code).
42 alias short T; | |
43 | |
// C linkage: these are runtime hooks the compiler emits calls to by name.
44 extern (C): | |
45 | |
46 /* ======================================================================== */ | |
47 | |
48 /*********************** | |
49 * Computes: | |
50 * a[] = b[] + value | |
51 */ | |
52 | |
/// wchar variant of a[] = b[] + value; the bit pattern is identical for all
/// 16-bit element types, so forward to the short implementation.
T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}
57 | |
/// ushort variant of a[] = b[] + value; forwards to the short implementation.
T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}
62 | |
/***********************
 * Computes a[] = b[] + value for 16-bit elements.
 * Uses SSE2 (16-at-a-time) or MMX (8-at-a-time) when available; the scalar
 * tail loop below always finishes whatever the vector loops did not cover.
 */
T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3343% faster
        if (sse2() && a.length >= 16)
        {
            // Vector loop stops at the largest multiple of 16 elements.
            auto n = aptr + (a.length & ~15);

            // Splat value into all 16-bit lanes of a 32-bit scratch.
            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3343% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // pshufw below replicates the low word, so no manual doubling.
            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar cleanup for the remaining tail (or the whole array without asm).
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}
182 | |
unittest
{
    printf("_arraySliceExpAddSliceAssign_s unittest\n");

    // Exercise every CPU dispatch path, aligned and misaligned.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
220 | |
221 | |
222 /* ======================================================================== */ | |
223 | |
224 /*********************** | |
225 * Computes: | |
226 * a[] = b[] + c[] | |
227 */ | |
228 | |
/// wchar variant of a[] = b[] + c[]; forwards to the short implementation.
T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}
233 | |
/// ushort variant of a[] = b[] + c[]; forwards to the short implementation.
T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}
238 | |
/***********************
 * Computes a[] = b[] + c[] for 16-bit elements.
 * Note the parameter order (a, c, b) is fixed by the compiler's calling
 * convention for this hook.
 */
T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3777% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Any of the three pointers being misaligned forces movdqu.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2068% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar tail.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}
365 | |
unittest
{
    printf("_arraySliceSliceAddSliceAssign_s unittest\n");

    // Exercise every CPU dispatch path, aligned and misaligned.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
403 | |
404 | |
405 /* ======================================================================== */ | |
406 | |
407 /*********************** | |
408 * Computes: | |
409 * a[] += value | |
410 */ | |
411 | |
/// wchar variant of a[] += value; forwards to the short implementation.
T[] _arrayExpSliceAddass_u(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}
416 | |
/// ushort variant of a[] += value; forwards to the short implementation.
T[] _arrayExpSliceAddass_t(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}
421 | |
/***********************
 * Computes a[] += value for 16-bit elements, in place.
 */
T[] _arrayExpSliceAddass_s(T[] a, T value)
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 832% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Splat value into all 16-bit lanes of a 32-bit scratch.
            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 826% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // pshufw below replicates the low word, so no manual doubling.
            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail.
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
525 | |
unittest
{
    printf("_arrayExpSliceAddass_s unittest\n");

    // Exercise every CPU dispatch path, aligned and misaligned.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
564 | |
565 | |
566 /* ======================================================================== */ | |
567 | |
568 /*********************** | |
569 * Computes: | |
570 * a[] += b[] | |
571 */ | |
572 | |
/// wchar variant of a[] += b[]; forwards to the short implementation.
T[] _arraySliceSliceAddass_u(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}
577 | |
/// ushort variant of a[] += b[]; forwards to the short implementation.
T[] _arraySliceSliceAddass_t(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}
582 | |
/***********************
 * Computes a[] += b[] for 16-bit elements, in place.
 */
T[] _arraySliceSliceAddass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2085% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1022% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar tail.
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
697 | |
unittest
{
    printf("_arraySliceSliceAddass_s unittest\n");

    // Exercise every CPU dispatch path, aligned and misaligned.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
736 | |
737 | |
738 /* ======================================================================== */ | |
739 | |
740 /*********************** | |
741 * Computes: | |
742 * a[] = b[] - value | |
743 */ | |
744 | |
/// wchar variant of a[] = b[] - value; forwards to the short implementation.
T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}
749 | |
/// ushort variant of a[] = b[] - value; forwards to the short implementation.
T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}
754 | |
/***********************
 * Computes a[] = b[] - value for 16-bit elements.
 */
T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3695% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Splat value into all 16-bit lanes of a 32-bit scratch.
            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3049% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // pshufw below replicates the low word, so no manual doubling.
            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar tail.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}
874 | |
unittest
{
    printf("_arraySliceExpMinSliceAssign_s unittest\n");

    // Exercise every CPU dispatch path, aligned and misaligned.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
912 | |
913 | |
914 /* ======================================================================== */ | |
915 | |
916 /*********************** | |
917 * Computes: | |
918 * a[] = value - b[] | |
919 */ | |
920 | |
/// wchar variant of a[] = value - b[]; forwards to the short implementation.
T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}
925 | |
/// ushort variant of a[] = value - b[]; forwards to the short implementation.
T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}
930 | |
/***********************
 * Computes a[] = value - b[] for 16-bit elements.
 *
 * Performance fix: the original SSE2 loops re-executed the loop-invariant
 * movd/pshufd splat of `value` on every iteration. The splat is now hoisted
 * into XMM4 before the loop and copied into the working registers each pass,
 * mirroring what the MMX path below already does with MM4.
 */
T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4995% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Splat value into all 16-bit lanes of a 32-bit scratch.
            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;           // hoisted: splat once, not per iteration
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2u:
                    movdqa XMM2, XMM4;      // psubw destroys the minuend, so copy
                    movdqa XMM3, XMM4;
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqu [ESI -32], XMM2;
                    movdqu [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;           // hoisted: splat once, not per iteration
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM2, XMM4;      // psubw destroys the minuend, so copy
                    movdqa XMM3, XMM4;
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqa [ESI -32], XMM2;
                    movdqa [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 4562% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // pshufw below replicates the low word, so no manual doubling.
            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM2, [EAX];
                movq MM3, [EAX+8];
                movq MM0, MM4;
                movq MM1, MM4;
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar tail.
    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}
1056 | |
unittest
{
    printf("_arrayExpSliceMinSliceAssign_s unittest\n");

    // Exercise every CPU dispatch path, aligned and misaligned.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1094 | |
1095 | |
1096 /* ======================================================================== */ | |
1097 | |
1098 /*********************** | |
1099 * Computes: | |
1100 * a[] = b[] - c[] | |
1101 */ | |
1102 | |
/// wchar variant of a[] = b[] - c[]; forwards to the short implementation.
T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}
1107 | |
/// ushort variant of a[] = b[] - c[]; forwards to the short implementation.
T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}
1112 | |
/***********************
 * Computes a[] = b[] - c[] for 16-bit elements.
 * Note the parameter order (a, c, b) is fixed by the compiler's calling
 * convention for this hook.
 */
T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4129% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Any of the three pointers being misaligned forces movdqu.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2018% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar tail.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}
1238 | |
unittest
{
    printf("_arraySliceSliceMinSliceAssign_s unittest\n");

    // Exercise every CPU dispatch path, aligned and misaligned.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1276 | |
1277 | |
1278 /* ======================================================================== */ | |
1279 | |
1280 /*********************** | |
1281 * Computes: | |
1282 * a[] -= value | |
1283 */ | |
1284 | |
/// wchar variant of a[] -= value; forwards to the short implementation.
T[] _arrayExpSliceMinass_u(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}
1289 | |
/// ushort variant of a[] -= value; forwards to the short implementation.
T[] _arrayExpSliceMinass_t(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}
1294 | |
/***********************
 * Computes a[] -= value for 16-bit elements, in place.
 */
T[] _arrayExpSliceMinass_s(T[] a, T value)
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 835% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Splat value into all 16-bit lanes of a 32-bit scratch.
            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 835% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // pshufw below replicates the low word, so no manual doubling.
            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail.
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
1398 | |
unittest
{
    printf("_arrayExpSliceMinass_s unittest\n");

    // Exercise every CPU dispatch path the Unittest build supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        // j == 0: arrays land 16-byte aligned; j == 1: deliberately misaligned.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] -= 6;

            // Verify against the scalar reference result element by element.
            foreach (int i; 0 .. dim)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
1437 | |
1438 | |
1439 /* ======================================================================== */ | |
1440 | |
1441 /*********************** | |
1442 * Computes: | |
1443 * a[] -= b[] | |
1444 */ | |
1445 | |
/***********************
 * wchar ('u') variant of a[] -= b[].
 * 16-bit subtraction is signedness-agnostic, so delegate to the short version.
 */
T[] _arraySliceSliceMinass_u(T[] a, T[] b)
{
    auto result = _arraySliceSliceMinass_s(a, b);
    return result;
}
1450 | |
/***********************
 * ushort ('t') variant of a[] -= b[].
 * Shares the short implementation since the bit patterns are identical.
 */
T[] _arraySliceSliceMinass_t(T[] a, T[] b)
{
    auto result = _arraySliceSliceMinass_s(a, b);
    return result;
}
1455 | |
/***********************
 * Core implementation of a[] -= b[] for 16-bit elements.
 * Requires equal lengths and non-overlapping slices (checked in the
 * in-contract); dispatches to SSE2, MMX, or scalar code and returns a.
 */
T[] _arraySliceSliceMinass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element to update
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2121% faster
        if (sse2() && a.length >= 16)
        {
            // Vectorized span: whole multiple of 16 elements (32 bytes).
            auto n = aptr + (a.length & ~15);

            // The aligned path requires BOTH pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    // Publish progress for the scalar tail loop.
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1116% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);    // multiple of 8 elements

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;               // restore FPU state after MMX usage
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar tail: leftovers, and the whole array on non-x86 builds.
    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
1570 | |
unittest
{
    printf("_arraySliceSliceMinass_s unittest\n");

    // Exercise every CPU dispatch path the Unittest build supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        // j == 0: arrays land 16-byte aligned; j == 1: deliberately misaligned.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];      // keep a copy of c's original contents
            c[] -= a[];

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1609 | |
1610 | |
1611 /* ======================================================================== */ | |
1612 | |
1613 /*********************** | |
1614 * Computes: | |
1615 * a[] = b[] * value | |
1616 */ | |
1617 | |
/***********************
 * wchar ('u') variant of a[] = b[] * value.
 * Low 16 bits of a 16x16 multiply are signedness-agnostic, so delegate
 * to the short implementation.
 */
T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
{
    auto result = _arraySliceExpMulSliceAssign_s(a, value, b);
    return result;
}
1622 | |
/***********************
 * ushort ('t') variant of a[] = b[] * value.
 * Shares the short implementation since the truncated product is identical.
 */
T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
{
    auto result = _arraySliceExpMulSliceAssign_s(a, value, b);
    return result;
}
1627 | |
/***********************
 * Core implementation of a[] = b[] * value for 16-bit elements.
 * Requires equal lengths and non-overlapping slices (in-contract);
 * dispatches to SSE2, MMX, or scalar code and returns a.
 */
T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element to write
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3733% faster
        if (sse2() && a.length >= 16)
        {
            // Vectorized span: whole multiple of 16 elements (32 bytes).
            auto n = aptr + (a.length & ~15);

            // Pack two copies of value into a dword; pshufd below
            // broadcasts it so XMM2 holds 8 copies of value.
            uint l = cast(ushort) value;
            l |= l << 16;

            // The aligned path requires BOTH pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    // ESI is advanced up front; stores below compensate
                    // with negative displacements.
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;      // keeps the low 16 bits of each product
                    pmullw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    // Publish progress for the scalar tail loop.
                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3733% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);    // multiple of 8 elements

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;     // broadcast value into all 4 words of MM2

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // restore FPU state after MMX usage
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar tail: leftovers, and the whole array on non-x86 builds.
    // cast(T) truncates the int product, matching pmullw's low-half result.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * value);

    return a;
}
1747 | |
unittest
{
    printf("_arraySliceExpMulSliceAssign_s unittest\n");

    // Exercise every CPU dispatch path the Unittest build supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        // j == 0: arrays land 16-byte aligned; j == 1: deliberately misaligned.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1785 | |
1786 | |
1787 /* ======================================================================== */ | |
1788 | |
1789 /*********************** | |
1790 * Computes: | |
1791 * a[] = b[] * c[] | |
1792 */ | |
1793 | |
/***********************
 * wchar ('u') variant of a[] = b[] * c[].
 * Delegates to the short implementation; the truncated 16-bit product
 * does not depend on signedness.
 */
T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
{
    auto result = _arraySliceSliceMulSliceAssign_s(a, c, b);
    return result;
}
1798 | |
/***********************
 * ushort ('t') variant of a[] = b[] * c[].
 * Delegates to the short implementation.
 */
T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
{
    auto result = _arraySliceSliceMulSliceAssign_s(a, c, b);
    return result;
}
1803 | |
/***********************
 * Core implementation of a[] = b[] * c[] for 16-bit elements.
 * All three slices must be equal length and mutually disjoint
 * (in-contract); dispatches to SSE2, MMX, or scalar code and returns a.
 */
T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element to write
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2515% faster
        if (sse2() && a.length >= 16)
        {
            // Vectorized span: whole multiple of 16 elements (32 bytes).
            auto n = aptr + (a.length & ~15);

            // The aligned path requires ALL THREE pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    // ESI advanced up front; stores compensate with
                    // negative displacements.
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;      // low 16 bits of each product
                    pmullw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    // Publish progress for the scalar tail loop.
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2515% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);    // multiple of 8 elements

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;               // restore FPU state after MMX usage
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar tail: leftovers, and the whole array on non-x86 builds.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * *cptr++);

    return a;
}
1930 | |
unittest
{
    printf("_arraySliceSliceMulSliceAssign_s unittest\n");

    // Exercise every CPU dispatch path the Unittest build supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        // j == 0: arrays land 16-byte aligned; j == 1: deliberately misaligned.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1968 | |
1969 | |
1970 /* ======================================================================== */ | |
1971 | |
1972 /*********************** | |
1973 * Computes: | |
1974 * a[] *= value | |
1975 */ | |
1976 | |
/***********************
 * wchar ('u') variant of a[] *= value.
 * Delegates to the short implementation; truncated 16-bit products
 * are signedness-agnostic.
 */
T[] _arrayExpSliceMulass_u(T[] a, T value)
{
    auto result = _arrayExpSliceMulass_s(a, value);
    return result;
}
1981 | |
/***********************
 * ushort ('t') variant of a[] *= value.
 * Delegates to the short implementation.
 */
T[] _arrayExpSliceMulass_t(T[] a, T value)
{
    auto result = _arrayExpSliceMulass_s(a, value);
    return result;
}
1986 | |
/***********************
 * Core implementation of a[] *= value for 16-bit elements
 * (the _u and _t entry points forward here).
 * Dispatches at run time to an SSE2, MMX, or scalar loop and returns a.
 */
T[] _arrayExpSliceMulass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element to update

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2044% faster
        if (sse2() && a.length >= 16)
        {
            // Vectorized span: whole multiple of 16 elements (32 bytes).
            auto n = aptr + (a.length & ~15);

            // Pack two copies of value into a dword; pshufd below
            // broadcasts it so XMM2 holds 8 copies of value.
            uint l = cast(ushort) value;
            l |= l << 16;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;      // low 16 bits of each product
                    pmullw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    // Publish progress for the scalar tail loop.
                    mov aptr, ESI;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 2056% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);    // multiple of 8 elements

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;     // broadcast value into all 4 words of MM2

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // restore FPU state after MMX usage
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail: leftovers, and the whole array on non-x86 builds.
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
2090 | |
unittest
{
    printf("_arrayExpSliceMulass_s unittest\n");

    // Exercise every CPU dispatch path the Unittest build supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        // j == 0: arrays land 16-byte aligned; j == 1: deliberately misaligned.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];      // keep a copy of a's original contents
            a[] *= 6;

            foreach (int i; 0 .. dim)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
2129 | |
2130 | |
2131 /* ======================================================================== */ | |
2132 | |
2133 /*********************** | |
2134 * Computes: | |
2135 * a[] *= b[] | |
2136 */ | |
2137 | |
/***********************
 * wchar ('u') variant of a[] *= b[].
 * Delegates to the short implementation; truncated 16-bit products
 * are signedness-agnostic.
 */
T[] _arraySliceSliceMulass_u(T[] a, T[] b)
{
    auto result = _arraySliceSliceMulass_s(a, b);
    return result;
}
2142 | |
/***********************
 * ushort ('t') variant of a[] *= b[].
 * Delegates to the short implementation.
 */
T[] _arraySliceSliceMulass_t(T[] a, T[] b)
{
    auto result = _arraySliceSliceMulass_s(a, b);
    return result;
}
2147 | |
/***********************
 * Core implementation of a[] *= b[] for 16-bit elements.
 * Requires equal lengths and non-overlapping slices (in-contract);
 * dispatches to SSE2, MMX, or scalar code and returns a.
 */
T[] _arraySliceSliceMulass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;    // one past the last element to update
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2519% faster
        if (sse2() && a.length >= 16)
        {
            // Vectorized span: whole multiple of 16 elements (32 bytes).
            auto n = aptr + (a.length & ~15);

            // The aligned path requires BOTH pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;      // low 16 bits of each product
                    pmullw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    // Publish progress for the scalar tail loop.
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1712% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);    // multiple of 8 elements

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;               // restore FPU state after MMX usage
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar tail: leftovers, and the whole array on non-x86 builds.
    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
2262 | |
unittest
{
    printf("_arraySliceSliceMulass_s unittest\n");

    // Exercise every CPU dispatch path the Unittest build supports.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        // j == 0: arrays land 16-byte aligned; j == 1: deliberately misaligned.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];      // keep a copy of a's original contents
            a[] *= c[];

            foreach (int i; 0 .. dim)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}