Mercurial > projects > ldc
comparison druntime/src/compiler/dmd/arraybyte.d @ 1458:e0b2d67cfe7c
Added druntime (this should be removed once it works).
author | Robert Clipsham <robert@octarineparrot.com> |
---|---|
date | Tue, 02 Jun 2009 17:43:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1456:7b218ec1044f | 1458:e0b2d67cfe7c |
---|---|
1 /** | |
2 * Contains SSE2 and MMX versions of certain operations for char, byte, and | |
3 * ubyte ('a', 'g' and 'h' suffixes). | |
4 * | |
5 * Copyright: Copyright Digital Mars 2008 - 2009. | |
6 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>. | |
7 * Authors: Walter Bright, based on code originally written by Burton Radons | |
8 * | |
9 * Copyright Digital Mars 2008 - 2009. | |
10 * Distributed under the Boost Software License, Version 1.0. | |
11 * (See accompanying file LICENSE_1_0.txt or copy at | |
12 * http://www.boost.org/LICENSE_1_0.txt) | |
13 */ | |
14 module rt.arraybyte; | |
15 | |
16 import rt.util.cpuid; | |
17 | |
version (unittest)
{
    private import core.stdc.stdio : printf;
    /* This is so unit tests will test every CPU variant
     */
    // Which simulated capability level is active; the unittests below
    // sweep this from 0 (plain scalar) upward.
    int cpuid;
    const int CPUID_MAX = 4;
    // Each shim reports its feature only at its own cpuid level, and only
    // if the host CPU really supports it (so SIMD paths never run on
    // hardware that would fault).
    // NOTE(review): the unittests loop `cpuid < CPUID_MAX`, i.e. 0..3, so
    // the amd3dnow shim (cpuid == 4) is never enabled — confirm whether
    // CPUID_MAX was meant to be 5 (amd3dnow is unused in this module).
    bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); }
    bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
    bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
}
else
{
    // Outside of unittests, query the real CPU feature flags directly.
    alias rt.util.cpuid.mmx mmx;
    alias rt.util.cpuid.sse sse;
    alias rt.util.cpuid.sse2 sse2;
    alias rt.util.cpuid.amd3dnow amd3dnow;
}
37 | |
38 //version = log; | |
39 | |
/**
 * Tells whether two slices occupy non-overlapping memory.
 *
 * Params:
 *  a = first slice
 *  b = second slice
 * Returns: true if no element of a aliases an element of b.
 */
bool disjoint(T)(T[] a, T[] b)
{
    // Non-overlapping iff one slice ends at or before the other begins.
    auto aEnd = a.ptr + a.length;
    auto bEnd = b.ptr + b.length;
    return aEnd <= b.ptr || bEnd <= a.ptr;
}
44 | |
// Element type for every routine in this module. The 'a' (char) and
// 'h' (ubyte) entry points forward to the 'g' (byte) implementations,
// since byte-wise add/subtract is identical for all three.
alias byte T;

// All array-op runtime hooks use C linkage so the compiler can emit
// direct calls to them by name.
extern (C):
48 | |
49 /* ======================================================================== */ | |
50 | |
51 | |
52 /*********************** | |
53 * Computes: | |
54 * a[] = b[] + value | |
55 */ | |
56 | |
/// char ('a') entry point for a[] = b[] + value; byte-wise add is the
/// same for char/byte/ubyte, so forward to the byte implementation.
T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}
61 | |
/// ubyte ('h') entry point for a[] = b[] + value; forwards to the
/// byte ('g') implementation, which is bit-identical for ubyte.
T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}
66 | |
/**
 * Computes a[] = b[] + value for byte elements.
 *
 * Params:
 *  a = destination slice; must be the same length as b and disjoint from it
 *  value = scalar added to every element
 *  b = source slice
 * Returns: a
 *
 * On x86 this dispatches to SSE2 (64-byte chunks), MMX (32-byte chunks),
 * or unrolled 386 code (4-byte chunks) when the CPU and slice length
 * allow; the scalar loop at the end always handles the remainder.
 */
T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1088% faster
        if (sse2() && a.length >= 64)
        {
            // End of the 64-byte-multiple prefix handled by SIMD.
            auto n = aptr + (a.length & ~63);

            // Replicate value into all four bytes of l; movd + pshufd
            // below then broadcast it across the 16 bytes of XMM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            // Use movdqu if either pointer is not 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1000% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            // Broadcast value into the low 2 bytes; pshufw fills MM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);

            // NOTE(review): pshufw is an SSE-era extension to MMX, but this
            // path is guarded only by mmx() — confirm minimum CPU support.
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startaddmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat normal 32-bit cpu the same way as we do
         * the SIMD units, with unrolled asm.  There's not enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {

            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startadd386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                // Add value to each of the four bytes held in BL/BH/DL/DH.
                add BL, CL;
                add BH, CL;
                add DL, CL;
                add DH, CL;
                mov [ESI -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startadd386;

                mov aptr, ESI;
                mov bptr, EAX;
            }

        }
    }

    // Scalar tail: finish whatever the SIMD/unrolled paths left over.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}
242 | |
unittest
{
    printf("_arraySliceExpAddSliceAssign_g unittest\n");

    // Sweep every simulated CPU capability level (see shims at module top).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte-aligned slices; j == 1: misaligned by one byte,
        // exercising both the aligned and unaligned asm paths.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 64/32/4: exercises the scalar tail
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            // Verify element-wise against the scalar definition.
            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
280 | |
281 | |
282 /* ======================================================================== */ | |
283 | |
284 /*********************** | |
285 * Computes: | |
286 * a[] = b[] + c[] | |
287 */ | |
288 | |
/// char ('a') entry point for a[] = b[] + c[]; forwards to the byte
/// ('g') implementation, which is bit-identical for char.
T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}
293 | |
/// ubyte ('h') entry point for a[] = b[] + c[]; forwards to the byte
/// ('g') implementation, which is bit-identical for ubyte.
T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}
298 | |
/**
 * Computes a[] = b[] + c[] for byte elements.
 *
 * Params:
 *  a = destination slice (note parameter order: a, c, b)
 *  c = second source slice
 *  b = first source slice
 * Returns: a
 *
 * All three slices must be the same length and pairwise disjoint.
 * Dispatches to SSE2 (64-byte chunks) or MMX (32-byte chunks) when
 * available; the scalar loop at the end handles the remainder.
 */
T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5739% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            // Use movdqu if any of the three pointers is misaligned.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                version (log) printf("\tsse2 aligned\n");
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startaddlmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddlmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
                mov cptr, ECX;
            }
        }
    }

    version (log) if (aptr < aend) printf("\tbase\n");
    // Scalar tail: finish whatever the SIMD paths left over.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}
453 | |
unittest
{
    printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    // Sweep every simulated CPU capability level (see shims at module top).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte-aligned slices; j == 1: misaligned by one byte,
        // exercising both the aligned and unaligned asm paths.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 64/32: exercises the scalar tail
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            // Verify element-wise against the scalar definition.
            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
491 | |
492 | |
493 /* ======================================================================== */ | |
494 | |
495 /*********************** | |
496 * Computes: | |
497 * a[] += value | |
498 */ | |
499 | |
/// char ('a') entry point for a[] += value; forwards to the byte
/// ('g') implementation, which is bit-identical for char.
T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}
504 | |
/// ubyte ('h') entry point for a[] += value; forwards to the byte
/// ('g') implementation, which is bit-identical for ubyte.
T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}
509 | |
/**
 * Computes a[] += value for byte elements, in place.
 *
 * Params:
 *  a = slice updated in place
 *  value = scalar added to every element
 * Returns: a
 *
 * Dispatches to SSE2 (64-byte chunks) or MMX (32-byte chunks) when
 * available; the scalar loop at the end handles the remainder.
 */
T[] _arrayExpSliceAddass_g(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1578% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            // Replicate value into all four bytes of l; movd + pshufd
            // below then broadcast it across the 16 bytes of XMM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            // Only aptr matters for alignment here (single operand array).
            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1721% faster
        if (mmx() && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            // Broadcast value into the low 2 bytes; pshufw fills MM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);

            // NOTE(review): pshufw is an SSE-era extension to MMX, but this
            // path is guarded only by mmx() — confirm minimum CPU support.
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startaddassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail: finish whatever the SIMD paths left over.
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
634 | |
unittest
{
    printf("_arrayExpSliceAddass_g unittest\n");

    // Sweep every simulated CPU capability level (see shims at module top).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte-aligned slices; j == 1: misaligned by one byte,
        // exercising both the aligned and unaligned asm paths.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 64/32: exercises the scalar tail
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into a, then mutate c in place; a holds the
            // pre-update values for the check below.
            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
673 | |
674 | |
675 /* ======================================================================== */ | |
676 | |
677 /*********************** | |
678 * Computes: | |
679 * a[] += b[] | |
680 */ | |
681 | |
/// char ('a') entry point for a[] += b[]; forwards to the byte
/// ('g') implementation, which is bit-identical for char.
T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}
686 | |
/// ubyte ('h') entry point for a[] += b[]; forwards to the byte
/// ('g') implementation, which is bit-identical for ubyte.
T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}
691 | |
/**
 * Computes a[] += b[] for byte elements, in place.
 *
 * Params:
 *  a = slice updated in place; must be the same length as b and disjoint
 *  b = slice whose elements are added to a
 * Returns: a
 *
 * Dispatches to SSE2 (64-byte chunks) or MMX (32-byte chunks) when
 * available; the scalar loop at the end handles the remainder.
 */
T[] _arraySliceSliceAddass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4727% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            // Use movdqu if either pointer is not 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3059% faster
        if (mmx() && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startaddasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar tail: finish whatever the SIMD paths left over.
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
831 | |
unittest
{
    printf("_arraySliceSliceAddass_g unittest\n");

    // Sweep every simulated CPU capability level (see shims at module top).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte-aligned slices; j == 1: misaligned by one byte,
        // exercising both the aligned and unaligned asm paths.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 64/32: exercises the scalar tail
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into a, then mutate c in place; a holds the
            // pre-update values for the check below.
            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
870 | |
871 | |
872 /* ======================================================================== */ | |
873 | |
874 | |
875 /*********************** | |
876 * Computes: | |
877 * a[] = b[] - value | |
878 */ | |
879 | |
/// char ('a') entry point for a[] = b[] - value; forwards to the byte
/// ('g') implementation, which is bit-identical for char.
T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}
884 | |
/// ubyte ('h') entry point for a[] = b[] - value; forwards to the byte
/// ('g') implementation, which is bit-identical for ubyte.
T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}
889 | |
/**
 * Computes a[] = b[] - value for byte elements.
 *
 * Params:
 *  a = destination slice; must be the same length as b and disjoint from it
 *  value = scalar subtracted from every element of b
 *  b = source slice
 * Returns: a
 *
 * Dispatches to SSE2 (64-byte chunks), MMX (32-byte chunks), or unrolled
 * 386 code (4-byte chunks) when available; the scalar loop at the end
 * handles the remainder.
 */
T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1189% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            // Replicate value into all four bytes of l; movd + pshufd
            // below then broadcast it across the 16 bytes of XMM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            // Use movdqu if either pointer is not 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1079% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            // Broadcast value into the low 2 bytes; pshufw fills MM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);

            // NOTE(review): pshufw is an SSE-era extension to MMX, but this
            // path is guarded only by mmx() — confirm minimum CPU support.
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm.  There's not enough registers, really.
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startsub386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                // Subtract value from each of the four bytes in BL/BH/DL/DH.
                sub BL, CL;
                sub BH, CL;
                sub DL, CL;
                sub DH, CL;
                mov [ESI -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startsub386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar tail: finish whatever the SIMD/unrolled paths left over.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}
1060 | |
unittest
{
    printf("_arraySliceExpMinSliceAssign_g unittest\n");

    // Sweep every simulated CPU capability level (see shims at module top).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte-aligned slices; j == 1: misaligned by one byte,
        // exercising both the aligned and unaligned asm paths.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 64/32/4: exercises the scalar tail
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = b[] - 6;

            // Verify element-wise against the scalar definition.
            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1099 | |
1100 | |
1101 /* ======================================================================== */ | |
1102 | |
1103 /*********************** | |
1104 * Computes: | |
1105 * a[] = value - b[] | |
1106 */ | |
1107 | |
/// char ('a') entry point for a[] = value - b[]; forwards to the byte
/// ('g') implementation, which is bit-identical for char.
T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}
1112 | |
/// ubyte ('h') entry point for a[] = value - b[]; forwards to the byte
/// ('g') implementation, which is bit-identical for ubyte.
T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}
1117 | |
/**
 * Computes a[] = value - b[] for byte elements (reversed subtraction).
 *
 * Params:
 *  a = destination slice; must be the same length as b and disjoint from it
 *  b = source slice subtracted from the scalar
 *  value = minuend applied to every element
 * Returns: a
 *
 * Because psubb subtracts the source operand from the destination, the
 * SIMD paths copy the broadcast value into scratch registers (XMM5/XMM6,
 * MM5/MM6) each iteration and subtract the loaded data from those.
 */
T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 8748% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            // Replicate value into all four bytes of l; movd + pshufd
            // below then broadcast it across the 16 bytes of XMM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            // Use movdqu if either pointer is not 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2u:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqu [ESI -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2a:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqa [ESI -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 7397% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            // Broadcast value into the low 2 bytes; pshufw fills MM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);

            // NOTE(review): pshufw is an SSE-era extension to MMX, but this
            // path is guarded only by mmx() — confirm minimum CPU support.
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubrmmx:
                add ESI, 32;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                psubb MM5, MM0;
                psubb MM6, MM1;
                movq [ESI -32], MM5;
                movq [ESI+8 -32], MM6;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM5, MM2;
                psubb MM6, MM3;
                movq [ESI+16-32], MM5;
                movq [ESI+24-32], MM6;
                cmp ESI, EDI;
                jb startsubrmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }

    }

    // Scalar tail: finish whatever the SIMD paths left over.
    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}
1270 | |
unittest
{
    printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    // Sweep every simulated CPU capability level (see shims at module top).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte-aligned slices; j == 1: misaligned by one byte,
        // exercising both the aligned and unaligned asm paths.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 64/32: exercises the scalar tail
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = 6 - b[];

            // Verify element-wise against the scalar definition.
            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - b[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1309 | |
1310 | |
1311 /* ======================================================================== */ | |
1312 | |
1313 /*********************** | |
1314 * Computes: | |
1315 * a[] = b[] - c[] | |
1316 */ | |
1317 | |
/// char[] ('a' suffix) variant: subtraction is identical at the bit level
/// for all byte-sized types, so forward to the byte implementation.
T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}
1322 | |
/// ubyte[] ('h' suffix) variant: subtraction is identical at the bit level
/// for all byte-sized types, so forward to the byte implementation.
T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}
1327 | |
/***********************
 * Computes a[] = b[] - c[] for byte-sized elements.
 *
 * Note the parameter order: the middle parameter `c` is the subtrahend and
 * the last parameter `b` is the minuend (the compiler passes the operands
 * in reverse), so each element becomes cast(T)(b[i] - c[i]).
 *
 * Returns: a
 */
T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    // Slices must match in length and must not overlap; the SIMD loops
    // below read and write large runs without checking for aliasing.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5756% faster
        if (sse2() && a.length >= 64)
        {
            // n marks the end of the 64-bytes-per-iteration region; the
            // scalar loop at the bottom finishes the remainder.
            auto n = aptr + (a.length & ~63);

            // movdqa needs all three pointers 16-byte aligned; otherwise
            // use the slower unaligned (movdqu) loop.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    // ESI was pre-incremented, hence the -64 store offsets.
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2u;

                    // Publish advanced pointers for the scalar cleanup loop.
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            // MMX path works in 32-byte chunks (four 8-byte movq's).
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 8;
            startsublmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;

                emms;   // exit MMX state so later FP code is safe
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar tail: finishes the remainder (or does everything when no
    // SIMD path was taken).
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}
1477 | |
unittest
{
    printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    // Exercise every CPU-dispatch path (see the version (unittest) cpuid hack).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 runs 16-byte aligned, pass 1 runs deliberately misaligned.
        for (int misalign = 0; misalign < 2; misalign++)
        {
            const int dim = 67;

            T[] a = new T[dim + misalign];
            T[] b = new T[dim + misalign];
            T[] c = new T[dim + misalign];
            a = a[misalign .. dim + misalign];
            b = b[misalign .. dim + misalign];
            c = c[misalign .. dim + misalign];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] - b[];

            // Verify the vectorized result against the scalar definition.
            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] - b[k]))
                    continue;
                printf("[%d]: %d != %d - %d\n", k, c[k], a[k], b[k]);
                assert(0);
            }
        }
    }
}
1515 | |
1516 | |
1517 /* ======================================================================== */ | |
1518 | |
1519 /*********************** | |
1520 * Computes: | |
1521 * a[] -= value | |
1522 */ | |
1523 | |
/// char[] ('a' suffix) variant: subtraction is identical at the bit level
/// for all byte-sized types, so forward to the byte implementation.
T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}
1528 | |
/// ubyte[] ('h' suffix) variant: subtraction is identical at the bit level
/// for all byte-sized types, so forward to the byte implementation.
T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}
1533 | |
/***********************
 * Computes a[] -= value for byte-sized elements, i.e. each element becomes
 * cast(T)(a[i] - value).
 *
 * Returns: a
 */
T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2() && a.length >= 64)
        {
            // n marks the end of the 64-bytes-per-iteration region; the
            // scalar loop at the bottom finishes the remainder.
            auto n = aptr + (a.length & ~63);

            // Replicate the byte value into all 4 bytes of l; pshufd below
            // then splats it across all 16 bytes of XMM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            // movdqa needs a 16-byte aligned destination; otherwise use the
            // slower unaligned (movdqu) loop.
            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    // ESI was pre-incremented, hence the -64 store offsets.
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;

                    // Publish advanced pointer for the scalar cleanup loop.
                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx() && a.length >= 32)
        {

            // MMX path works in 32-byte chunks (four 8-byte movq's).
            auto n = aptr + (a.length & ~31);

            // Replicate the byte into both bytes of the low word; pshufw
            // then splats that word across all 8 bytes of MM4.
            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;

                emms;   // exit MMX state so later FP code is safe
                mov aptr, ESI;
            }
        }
    }

    // Scalar tail: finishes the remainder (or does everything when no
    // SIMD path was taken).
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
1658 | |
unittest
{
    printf("_arrayExpSliceMinass_g unittest\n");

    // Exercise every CPU-dispatch path (see the version (unittest) cpuid hack).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 runs 16-byte aligned, pass 1 runs deliberately misaligned.
        for (int misalign = 0; misalign < 2; misalign++)
        {
            const int dim = 67;

            T[] a = new T[dim + misalign];
            T[] b = new T[dim + misalign];
            T[] c = new T[dim + misalign];
            a = a[misalign .. dim + misalign];
            b = b[misalign .. dim + misalign];
            c = c[misalign .. dim + misalign];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            a[] = c[];
            c[] -= 6;

            // Verify the vectorized result against the scalar definition.
            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] - 6))
                    continue;
                printf("[%d]: %d != %d - 6\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
1697 | |
1698 | |
1699 /* ======================================================================== */ | |
1700 | |
1701 /*********************** | |
1702 * Computes: | |
1703 * a[] -= b[] | |
1704 */ | |
1705 | |
/// char[] ('a' suffix) variant: subtraction is identical at the bit level
/// for all byte-sized types, so forward to the byte implementation.
T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}
1710 | |
/// ubyte[] ('h' suffix) variant: subtraction is identical at the bit level
/// for all byte-sized types, so forward to the byte implementation.
T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}
1715 | |
/***********************
 * Computes a[] -= b[] for byte-sized elements, i.e. each element becomes
 * cast(T)(a[i] - b[i]).
 *
 * Returns: a
 */
T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    // Slices must match in length and must not overlap; the SIMD loops
    // below read and write large runs without checking for aliasing.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2() && a.length >= 64)
        {
            // n marks the end of the 64-bytes-per-iteration region; the
            // scalar loop at the bottom finishes the remainder.
            auto n = aptr + (a.length & ~63);

            // movdqa needs both pointers 16-byte aligned; otherwise use
            // the slower unaligned (movdqu) loop.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    // ESI was pre-incremented, hence the -64 store offsets.
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2u;

                    // Publish advanced pointers for the scalar cleanup loop.
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3107% faster
        if (mmx() && a.length >= 32)
        {

            // MMX path works in 32-byte chunks (four 8-byte movq's).
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startsubasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubasslmmx;

                emms;   // exit MMX state so later FP code is safe
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar tail: finishes the remainder (or does everything when no
    // SIMD path was taken).
    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
1855 | |
unittest
{
    printf("_arraySliceSliceMinass_g unittest\n");

    // Exercise every CPU-dispatch path (see the version (unittest) cpuid hack).
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 runs 16-byte aligned, pass 1 runs deliberately misaligned.
        for (int misalign = 0; misalign < 2; misalign++)
        {
            const int dim = 67;

            T[] a = new T[dim + misalign];
            T[] b = new T[dim + misalign];
            T[] c = new T[dim + misalign];
            a = a[misalign .. dim + misalign];
            b = b[misalign .. dim + misalign];
            c = c[misalign .. dim + misalign];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            a[] = c[];
            c[] -= b[];

            // Verify the vectorized result against the scalar definition.
            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] - b[k]))
                    continue;
                printf("[%d]: %d != %d - %d\n", k, c[k], a[k], b[k]);
                assert(0);
            }
        }
    }
}