ldc: druntime/src/compiler/dmd/arraybyte.d
changeset 759:d3eb054172f9 (compared with 758:f04dde6e882c)

Added copy of druntime from DMD 2.020 modified for LDC.

author: Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date:   Tue, 11 Nov 2008 01:52:37 +0100
/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for byte array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for char, byte,
 * and ubyte ('a', 'g' and 'h' suffixes).
 */
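
/* The compiler lowers whole-array expressions to these routines; a rough
 * sketch (assuming byte[] slices a and b of equal length that do not
 * overlap):
 *
 *      a[] = b[] + 3;      // becomes _arraySliceExpAddSliceAssign_g(a, 3, b)
 *      a[] += b[];         // becomes _arraySliceSliceAddass_g(a, b)
 *
 * The suffix is the mangle character of the element type: 'a' = char,
 * 'g' = byte, 'h' = ubyte.
 */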

module rt.arraybyte;

import util.cpuid;

version (Unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
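/* Note: the unittests below loop cpuid over [0, CPUID_MAX), i.e. 0 through 3:
 * cpuid == 0 exercises the plain fallback loop, and the 3DNow! variant
 * (cpuid == 4) is never selected.
 */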
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
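
/* disjoint() is true when slices a and b share no memory; e.g. for a
 * byte[8] x, disjoint(x[0 .. 4], x[4 .. 8]) holds but
 * disjoint(x[0 .. 5], x[4 .. 8]) does not.  The in-contracts below demand
 * this because the SIMD loops read the source and write the destination in
 * 32- or 64-byte blocks, which only matches the element-by-element semantics
 * when the operands do not overlap.
 */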

alias byte T;

extern (C):

/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1088% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);
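            // l now holds value replicated into each of its 4 bytes;
            // movd/pshufd below splat it across all 16 byte lanes of XMM4.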

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1000% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

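            // The low 16 bits of l hold value twice; pshufw below broadcasts
            // that word across all four word lanes of MM4.  (pshufw is an
            // SSE/extended-MMX instruction, not base MMX.)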
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startaddmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* Trying to be fair and treat the normal 32-bit CPU the same way as
         * we do the SIMD units, with unrolled asm.  There aren't enough
         * registers, really.
         */
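        // The 386 path below processes 4 bytes per iteration: DX and BX each
        // hold two source bytes, and CL (the scalar operand) is added to each
        // 8-bit half (DL/DH, BL/BH) separately, mimicking paddb without SIMD.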
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startadd386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                add BL, CL;
                add BH, CL;
                add DL, CL;
                add DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startadd386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

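    // Scalar cleanup: handles the remainder left over from the blocked asm
    // loops above, and the whole array when no asm path was taken.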
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */
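
// In the slice-slice routines below, each 64-byte SSE2 block loads b into
// XMM0-XMM3 and c into XMM4-XMM7, using all eight XMM registers available
// in 32-bit mode; the MMX blocks do the same with MM0-MM7.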

T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5739% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                version (log) printf("\tsse2 aligned\n");
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startaddlmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddlmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */
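
// The op-assign (+=, -=) variants work in place: only a single pointer
// advances, so the SSE2 paths below test the alignment of aptr alone and
// ESI serves as both load and store pointer.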

T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_g(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1578% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1721% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startaddassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4727% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3059% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startaddasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1189% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1079% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* Trying to be fair and treat the normal 32-bit CPU the same way as
         * we do the SIMD units, with unrolled asm.  There aren't enough
         * registers, really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startsub386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                sub BL, CL;
                sub BH, CL;
                sub DL, CL;
                sub DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startsub386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = b[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */
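
// Reverse subtraction: psubb subtracts its source operand from its
// destination, so the loops below re-materialize the splatted value
// (XMM4 / MM4) into scratch registers (XMM5, XMM6 / MM5, MM6) for each
// half-block and subtract the loaded data from those copies.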

T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 8748% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2u:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqu [ESI   -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2a:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqa [ESI   -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 7397% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubrmmx:
                add ESI, 32;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                psubb MM5, MM0;
                psubb MM6, MM1;
                movq [ESI   -32], MM5;
                movq [ESI+8 -32], MM6;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM5, MM2;
                psubb MM6, MM3;
                movq [ESI+16-32], MM5;
                movq [ESI+24-32], MM6;
                cmp ESI, EDI;
                jb startsubrmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = 6 - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - b[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5756% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 8;
            startsublmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3107% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startsubasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}