Mercurial > projects > ldc
comparison druntime/src/compiler/ldc/arrayshort.d @ 1458:e0b2d67cfe7c
Added druntime (this should be removed once it works).
author | Robert Clipsham <robert@octarineparrot.com> |
---|---|
date | Tue, 02 Jun 2009 17:43:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1456:7b218ec1044f | 1458:e0b2d67cfe7c |
---|---|
1 /** | |
2 * Contains SSE2 and MMX versions of certain operations for wchar, short, | |
3 * and ushort ('u', 's' and 't' suffixes). | |
4 * | |
5 * Copyright: Copyright Digital Mars 2008 - 2009. | |
 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
7 * Authors: Walter Bright, based on code originally written by Burton Radons | |
8 * | |
9 * Copyright Digital Mars 2008 - 2009. | |
10 * Distributed under the Boost Software License, Version 1.0. | |
11 * (See accompanying file LICENSE_1_0.txt or copy at | |
12 * http://www.boost.org/LICENSE_1_0.txt) | |
13 */ | |
14 module rt.arrayshort; | |
15 | |
16 private import rt.util.cpuid; | |
17 | |
version (unittest)
{
    private import core.stdc.stdio : printf;
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); }
    bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
    bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
}
else
{
    // Outside of unittest builds the capability checks come straight from
    // the runtime CPU detection in rt.util.cpuid.
    alias rt.util.cpuid.mmx mmx;
    alias rt.util.cpuid.sse sse;
    alias rt.util.cpuid.sse2 sse2;
    // BUG FIX: this line previously read `alias rt.util.cpuid.sse2 sse2;`
    // a second time (copy/paste error), leaving amd3dnow undefined in
    // non-unittest builds even though the unittest branch defines it.
    alias rt.util.cpuid.amd3dnow amd3dnow;
}
37 | |
38 //version = log; | |
39 | |
/**
 * Returns true when slices a and b share no memory.
 */
bool disjoint(T)(T[] a, T[] b)
{
    auto aEnd = a.ptr + a.length;
    auto bEnd = b.ptr + b.length;
    return aEnd <= b.ptr || bEnd <= a.ptr;
}
44 | |
45 alias short T; | |
46 | |
47 extern (C): | |
48 | |
49 /* ======================================================================== */ | |
50 | |
51 /*********************** | |
52 * Computes: | |
53 * a[] = b[] + value | |
54 */ | |
55 | |
/// a[] = b[] + value for wchar ('u' suffix). 16-bit addition produces the
/// same bit pattern for signed and unsigned operands, so the short
/// implementation is shared.
T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

/// a[] = b[] + value for ushort ('t' suffix); forwards to the short version.
T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}
65 | |
/**
 * a[] = b[] + value for short.
 *
 * Params:
 *  a = destination slice
 *  value = scalar added to every element
 *  b = source slice; same length as a, non-overlapping
 * Returns: a
 */
T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3343% faster
        if (sse2() && a.length >= 16)
        {
            // Vector loop covers a multiple of 16 elements (32 bytes per
            // iteration); the scalar loop below finishes the tail.
            auto n = aptr + (a.length & ~15);

            // value in both 16-bit halves of a dword; pshufd below then
            // broadcasts it to all 8 word lanes of XMM2.
            uint l = cast(ushort) value;
            l |= (l << 16);

            // movdqa needs 16-byte alignment of both pointers.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;            // advance dest first; stores use -32 offsets
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;          // publish progress for the scalar tail
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3343% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;     // broadcast low word of l to all 4 lanes

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}
185 | |
unittest
{
    printf("_arraySliceExpAddSliceAssign_s unittest\n");

    // One pass per simulated CPU level so every code path
    // (scalar, MMX, SSE, SSE2) is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte aligned slices; j == 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
223 | |
224 | |
225 /* ======================================================================== */ | |
226 | |
227 /*********************** | |
228 * Computes: | |
229 * a[] = b[] + c[] | |
230 */ | |
231 | |
/// a[] = b[] + c[] for wchar ('u' suffix); 16-bit addition is identical for
/// signed and unsigned operands, so the short implementation is shared.
T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

/// a[] = b[] + c[] for ushort ('t' suffix); forwards to the short version.
T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}
241 | |
/**
 * a[] = b[] + c[] for short.
 *
 * Params:
 *  a = destination slice
 *  c = second source slice (note the (a, c, b) parameter order)
 *  b = first source slice
 * Returns: a
 * All three slices must have equal length and be pairwise non-overlapping.
 */
T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3777% faster
        if (sse2() && a.length >= 16)
        {
            // 16 elements (32 bytes) per iteration; scalar loop takes the tail.
            auto n = aptr + (a.length & ~15);

            // movdqa needs all three pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;            // advance dest first; stores use -32 offsets
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;          // publish progress for the scalar tail
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2068% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}
368 | |
unittest
{
    printf("_arraySliceSliceAddSliceAssign_s unittest\n");

    // One pass per simulated CPU level so every code path
    // (scalar, MMX, SSE, SSE2) is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte aligned slices; j == 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
406 | |
407 | |
408 /* ======================================================================== */ | |
409 | |
410 /*********************** | |
411 * Computes: | |
412 * a[] += value | |
413 */ | |
414 | |
/// a[] += value for wchar ('u' suffix); 16-bit addition is identical for
/// signed and unsigned operands, so the short implementation is shared.
T[] _arrayExpSliceAddass_u(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

/// a[] += value for ushort ('t' suffix); forwards to the short version.
T[] _arrayExpSliceAddass_t(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}
424 | |
/**
 * a[] += value for short (in-place; a is both source and destination).
 *
 * Params:
 *  a = slice updated in place
 *  value = scalar added to every element
 * Returns: a
 */
T[] _arrayExpSliceAddass_s(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 832% faster
        if (sse2() && a.length >= 16)
        {
            // 16 elements (32 bytes) per iteration; scalar loop takes the tail.
            auto n = aptr + (a.length & ~15);

            // value in both halves of a dword; pshufd broadcasts to 8 lanes.
            uint l = cast(ushort) value;
            l |= (l << 16);

            // movdqa needs aptr 16-byte aligned.
            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;     // store back to the loads' addresses
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;              // publish progress for the scalar tail
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 826% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;     // broadcast low word of l to all 4 lanes

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
528 | |
unittest
{
    printf("_arrayExpSliceAddass_s unittest\n");

    // One pass per simulated CPU level so every code path
    // (scalar, MMX, SSE, SSE2) is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte aligned slices; j == 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Keep a pristine copy in c so the in-place result can be checked.
            a[] = c[];
            a[] += 6;

            foreach (int i; 0 .. dim)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
567 | |
568 | |
569 /* ======================================================================== */ | |
570 | |
571 /*********************** | |
572 * Computes: | |
573 * a[] += b[] | |
574 */ | |
575 | |
/// a[] += b[] for wchar ('u' suffix); 16-bit addition is identical for
/// signed and unsigned operands, so the short implementation is shared.
T[] _arraySliceSliceAddass_u(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

/// a[] += b[] for ushort ('t' suffix); forwards to the short version.
T[] _arraySliceSliceAddass_t(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}
585 | |
/**
 * a[] += b[] for short (in-place; a is both source and destination).
 *
 * Params:
 *  a = slice updated in place
 *  b = source slice; same length as a, non-overlapping
 * Returns: a
 */
T[] _arraySliceSliceAddass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2085% faster
        if (sse2() && a.length >= 16)
        {
            // 16 elements (32 bytes) per iteration; scalar loop takes the tail.
            auto n = aptr + (a.length & ~15);

            // movdqa needs both pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;     // store back to the loads' addresses
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;              // publish progress for the scalar tail
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1022% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
700 | |
unittest
{
    printf("_arraySliceSliceAddass_s unittest\n");

    // One pass per simulated CPU level so every code path
    // (scalar, MMX, SSE, SSE2) is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte aligned slices; j == 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Snapshot c into b, then update c in place so the result
            // can be checked against the snapshot.
            b[] = c[];
            c[] += a[];

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
739 | |
740 | |
741 /* ======================================================================== */ | |
742 | |
743 /*********************** | |
744 * Computes: | |
745 * a[] = b[] - value | |
746 */ | |
747 | |
/// a[] = b[] - value for wchar ('u' suffix); 16-bit subtraction is identical
/// for signed and unsigned operands, so the short implementation is shared.
T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

/// a[] = b[] - value for ushort ('t' suffix); forwards to the short version.
T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}
757 | |
/**
 * a[] = b[] - value for short.
 *
 * Params:
 *  a = destination slice
 *  value = scalar subtracted from every element
 *  b = source slice; same length as a, non-overlapping
 * Returns: a
 */
T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3695% faster
        if (sse2() && a.length >= 16)
        {
            // 16 elements (32 bytes) per iteration; scalar loop takes the tail.
            auto n = aptr + (a.length & ~15);

            // value in both halves of a dword; pshufd broadcasts to 8 lanes.
            uint l = cast(ushort) value;
            l |= (l << 16);

            // movdqa needs both pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;            // advance dest first; stores use -32 offsets
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;          // publish progress for the scalar tail
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3049% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;     // broadcast low word of l to all 4 lanes

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}
877 | |
unittest
{
    printf("_arraySliceExpMinSliceAssign_s unittest\n");

    // One pass per simulated CPU level so every code path
    // (scalar, MMX, SSE, SSE2) is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte aligned slices; j == 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
915 | |
916 | |
917 /* ======================================================================== */ | |
918 | |
919 /*********************** | |
920 * Computes: | |
921 * a[] = value - b[] | |
922 */ | |
923 | |
/// a[] = value - b[] for wchar ('u' suffix); 16-bit subtraction is identical
/// for signed and unsigned operands, so the short implementation is shared.
T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

/// a[] = value - b[] for ushort ('t' suffix); forwards to the short version.
T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}
933 | |
/**
 * a[] = value - b[] for short.
 *
 * Params:
 *  a = destination slice
 *  b = source slice; same length as a, non-overlapping
 *  value = scalar minuend
 * Returns: a
 */
T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4995% faster
        if (sse2() && a.length >= 16)
        {
            // 16 elements (32 bytes) per iteration; scalar loop takes the tail.
            auto n = aptr + (a.length & ~15);

            // value in both halves of a dword; pshufd broadcasts to 8 lanes.
            uint l = cast(ushort) value;
            l |= (l << 16);

            // movdqa needs both pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;

                    align 4;
                startaddsse2u:
                    // The broadcast constant is re-materialized every
                    // iteration because psubw overwrites XMM2/XMM3 with
                    // the result (value - b[]).
                    // NOTE(review): could be hoisted into a spare register,
                    // as the MMX path does with MM4.
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;
                    movd XMM3, l;
                    pshufd XMM3, XMM3, 0;
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqu [ESI -32], XMM2;
                    movdqu [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;      // publish progress for the scalar tail
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;

                    align 4;
                startaddsse2a:
                    // Constant reloaded per iteration; see unaligned case.
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;
                    movd XMM3, l;
                    pshufd XMM3, XMM3, 0;
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqa [ESI -32], XMM2;
                    movdqa [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 4562% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;     // MM4 holds the broadcast constant

                align 4;
            startmmx:
                add ESI, 16;
                movq MM2, [EAX];
                movq MM3, [EAX+8];
                movq MM0, MM4;          // copy constant; psubw destroys MM0/MM1
                movq MM1, MM4;
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}
1059 | |
unittest
{
    printf("_arrayExpSliceMinSliceAssign_s unittest\n");

    // One pass per simulated CPU level so every code path
    // (scalar, MMX, SSE, SSE2) is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte aligned slices; j == 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1097 | |
1098 | |
1099 /* ======================================================================== */ | |
1100 | |
1101 /*********************** | |
1102 * Computes: | |
1103 * a[] = b[] - c[] | |
1104 */ | |
1105 | |
/// a[] = b[] - c[] for wchar ('u' suffix); 16-bit subtraction is identical
/// for signed and unsigned operands, so the short implementation is shared.
T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

/// a[] = b[] - c[] for ushort ('t' suffix); forwards to the short version.
T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}
1115 | |
/**
 * a[] = b[] - c[] for short.
 *
 * Params:
 *  a = destination slice
 *  c = subtrahend slice (note the (a, c, b) parameter order)
 *  b = minuend slice
 * Returns: a
 * All three slices must have equal length and be pairwise non-overlapping.
 */
T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4129% faster
        if (sse2() && a.length >= 16)
        {
            // 16 elements (32 bytes) per iteration; scalar loop takes the tail.
            auto n = aptr + (a.length & ~15);

            // movdqa needs all three pointers 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;            // advance dest first; stores use -32 offsets
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;          // publish progress for the scalar tail
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2018% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}
1241 | |
unittest
{
    printf("_arraySliceSliceMinSliceAssign_s unittest\n");

    // One pass per simulated CPU level so every code path
    // (scalar, MMX, SSE, SSE2) is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: 16-byte aligned slices; j == 1: misaligned by one element.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1279 | |
1280 | |
1281 /* ======================================================================== */ | |
1282 | |
1283 /*********************** | |
1284 * Computes: | |
1285 * a[] -= value | |
1286 */ | |
1287 | |
/// a[] -= value for wchar ('u' suffix); 16-bit subtraction is identical for
/// signed and unsigned operands, so the short implementation is shared.
T[] _arrayExpSliceMinass_u(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

/// a[] -= value for ushort ('t' suffix); forwards to the short version.
T[] _arrayExpSliceMinass_t(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}
1297 | |
/**
 * a[] -= value for short (in-place; a is both source and destination).
 *
 * Params:
 *  a = slice updated in place
 *  value = scalar subtracted from every element
 * Returns: a
 */
T[] _arrayExpSliceMinass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 835% faster
        if (sse2() && a.length >= 16)
        {
            // 16 elements (32 bytes) per iteration; scalar loop takes the tail.
            auto n = aptr + (a.length & ~15);

            // value in both halves of a dword; pshufd broadcasts to 8 lanes.
            uint l = cast(ushort) value;
            l |= (l << 16);

            // movdqa needs aptr 16-byte aligned.
            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;     // store back to the loads' addresses
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;              // publish progress for the scalar tail
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 835% faster
        if (mmx() && a.length >= 8)
        {
            // 8 elements (16 bytes) per iteration.
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;     // broadcast low word of l to all 4 lanes

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;                   // make the FPU usable again after MMX
                mov aptr, ESI;
            }
        }
    }

    // Scalar loop: tail elements, or everything when no vector path ran.
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
1401 | |
unittest
{
    printf("_arrayExpSliceMinass_s unittest\n");

    // One pass per simulated CPU capability level so every dispatch path
    // (scalar, MMX, SSE, SSE2) in the implementation is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0 tests 16-byte aligned slices, j == 1 misaligned ones.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + j])[j .. dim + j];
            T[] b = (new T[dim + j])[j .. dim + j];
            T[] c = (new T[dim + j])[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Copy c into a, subtract in place, then compare against the
            // scalar reference computation element by element.
            a[] = c[];
            a[] -= 6;

            foreach (i; 0 .. dim)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
1440 | |
1441 | |
1442 /* ======================================================================== */ | |
1443 | |
1444 /*********************** | |
1445 * Computes: | |
1446 * a[] -= b[] | |
1447 */ | |
1448 | |
/// wchar ('u') variant: 16-bit subtraction is sign-agnostic, so this
/// simply forwards to the short implementation.
T[] _arraySliceSliceMinass_u(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}
1453 | |
/// ushort ('t') variant: 16-bit subtraction is sign-agnostic, so this
/// simply forwards to the short implementation.
T[] _arraySliceSliceMinass_t(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}
1458 | |
/**
 * Computes:
 *      a[] -= b[]
 * for short elements, using an SSE2 loop (16 elements per iteration) or an
 * MMX loop (8 elements per iteration) when available; leftover elements,
 * and the whole slice on other CPUs, go through the scalar tail loop.
 *
 * Params:
 *      a = slice updated in place
 *      b = slice of subtrahends; must have the same length as a and not
 *          overlap it (enforced by the in-contract)
 * Returns:
 *      a
 */
T[] _arraySliceSliceMinass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2121% faster
        if (sse2() && a.length >= 16)
        {
            // End of the portion processed 16 elements (32 bytes) at a time.
            auto n = aptr + (a.length & ~15);

            // The aligned loop requires BOTH pointers to be 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1116% faster
        if (mmx() && a.length >= 8)
        {
            // End of the portion processed 8 elements (16 bytes) at a time.
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar loop: remainder elements, or the whole slice when no SIMD path ran.
    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
1573 | |
unittest
{
    printf("_arraySliceSliceMinass_s unittest\n");

    // One pass per simulated CPU capability level so every dispatch path
    // (scalar, MMX, SSE, SSE2) in the implementation is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0 tests 16-byte aligned slices, j == 1 misaligned ones.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + j])[j .. dim + j];
            T[] b = (new T[dim + j])[j .. dim + j];
            T[] c = (new T[dim + j])[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Save c's original contents in b, subtract in place, then
            // compare against the scalar reference element by element.
            b[] = c[];
            c[] -= a[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1612 | |
1613 | |
1614 /* ======================================================================== */ | |
1615 | |
1616 /*********************** | |
1617 * Computes: | |
1618 * a[] = b[] * value | |
1619 */ | |
1620 | |
/// wchar ('u') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}
1625 | |
/// ushort ('t') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}
1630 | |
/**
 * Computes:
 *      a[] = b[] * value
 * for short elements, using an SSE2 loop (16 elements per iteration) or an
 * MMX loop (8 elements per iteration) when available; leftover elements,
 * and the whole slice on other CPUs, go through the scalar tail loop.
 * pmullw keeps only the low 16 bits of each product, matching the scalar
 * cast(T)(...) truncation.
 *
 * Params:
 *      a     = destination slice
 *      value = scalar multiplier
 *      b     = source slice; same length as a and disjoint from it
 * Returns:
 *      a
 */
T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3733% faster
        if (sse2() && a.length >= 16)
        {
            // End of the portion processed 16 elements (32 bytes) at a time.
            auto n = aptr + (a.length & ~15);

            // Duplicate the 16-bit value into both halves of a dword;
            // pshufd below then replicates it across all 8 word lanes of XMM2.
            uint l = cast(ushort) value;
            l |= l << 16;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3733% faster
        if (mmx() && a.length >= 8)
        {
            // End of the portion processed 8 elements (16 bytes) at a time.
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                // Broadcast word 0 of MM2 into all four word lanes.
                // NOTE(review): pshufw requires SSE (or AMD extended MMX),
                // but the guard above only checks mmx() -- verify this path
                // is not reached on plain-MMX CPUs.
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop: remainder elements, or the whole slice when no SIMD path ran.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * value);

    return a;
}
1750 | |
unittest
{
    printf("_arraySliceExpMulSliceAssign_s unittest\n");

    // One pass per simulated CPU capability level so every dispatch path
    // (scalar, MMX, SSE, SSE2) in the implementation is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0 tests 16-byte aligned slices, j == 1 misaligned ones.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + j])[j .. dim + j];
            T[] b = (new T[dim + j])[j .. dim + j];
            T[] c = (new T[dim + j])[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Run the vector op, then compare against the scalar reference
            // computation element by element.
            c[] = a[] * 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1788 | |
1789 | |
1790 /* ======================================================================== */ | |
1791 | |
1792 /*********************** | |
1793 * Computes: | |
1794 * a[] = b[] * c[] | |
1795 */ | |
1796 | |
/// wchar ('u') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
/// (Parameter order is (result, c, b) -- as passed by the compiler.)
T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}
1801 | |
/// ushort ('t') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
/// (Parameter order is (result, c, b) -- as passed by the compiler.)
T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}
1806 | |
/**
 * Computes:
 *      a[] = b[] * c[]
 * for short elements. Note the parameter order (a, c, b) -- this matches
 * how the compiler passes the operands. Uses an SSE2 loop (16 elements per
 * iteration) or an MMX loop (8 elements per iteration) when available;
 * leftover elements, and the whole slice on other CPUs, go through the
 * scalar tail loop. pmullw keeps only the low 16 bits of each product,
 * matching the scalar cast(T)(...) truncation.
 *
 * Params:
 *      a = destination slice
 *      c = second operand slice
 *      b = first operand slice
 * Returns:
 *      a
 */
T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2515% faster
        if (sse2() && a.length >= 16)
        {
            // End of the portion processed 16 elements (32 bytes) at a time.
            auto n = aptr + (a.length & ~15);

            // The aligned loop requires ALL three pointers to be 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2515% faster
        if (mmx() && a.length >= 8)
        {
            // End of the portion processed 8 elements (16 bytes) at a time.
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar loop: remainder elements, or the whole slice when no SIMD path ran.
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * *cptr++);

    return a;
}
1933 | |
unittest
{
    printf("_arraySliceSliceMulSliceAssign_s unittest\n");

    // One pass per simulated CPU capability level so every dispatch path
    // (scalar, MMX, SSE, SSE2) in the implementation is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0 tests 16-byte aligned slices, j == 1 misaligned ones.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + j])[j .. dim + j];
            T[] b = (new T[dim + j])[j .. dim + j];
            T[] c = (new T[dim + j])[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Run the vector op, then compare against the scalar reference
            // computation element by element.
            c[] = a[] * b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1971 | |
1972 | |
1973 /* ======================================================================== */ | |
1974 | |
1975 /*********************** | |
1976 * Computes: | |
1977 * a[] *= value | |
1978 */ | |
1979 | |
/// wchar ('u') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
T[] _arrayExpSliceMulass_u(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}
1984 | |
/// ushort ('t') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
T[] _arrayExpSliceMulass_t(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}
1989 | |
/**
 * Computes:
 *      a[] *= value
 * in place for short elements, using an SSE2 loop (16 elements per
 * iteration) or an MMX loop (8 elements per iteration) when available;
 * leftover elements, and the whole slice on other CPUs, go through the
 * scalar tail loop. pmullw keeps only the low 16 bits of each product,
 * matching 16-bit wraparound of the scalar *= .
 *
 * Params:
 *      a     = slice updated in place
 *      value = scalar multiplier
 * Returns:
 *      a
 */
T[] _arrayExpSliceMulass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2044% faster
        if (sse2() && a.length >= 16)
        {
            // End of the portion processed 16 elements (32 bytes) at a time.
            auto n = aptr + (a.length & ~15);

            // Duplicate the 16-bit value into both halves of a dword;
            // pshufd below then replicates it across all 8 word lanes of XMM2.
            uint l = cast(ushort) value;
            l |= l << 16;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 2056% faster
        if (mmx() && a.length >= 8)
        {
            // End of the portion processed 8 elements (16 bytes) at a time.
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                // Broadcast word 0 of MM2 into all four word lanes.
                // NOTE(review): pshufw requires SSE (or AMD extended MMX),
                // but the guard above only checks mmx() -- verify this path
                // is not reached on plain-MMX CPUs.
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    // Scalar loop: remainder elements, or the whole slice when no SIMD path ran.
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
2093 | |
unittest
{
    printf("_arrayExpSliceMulass_s unittest\n");

    // One pass per simulated CPU capability level so every dispatch path
    // (scalar, MMX, SSE, SSE2) in the implementation is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0 tests 16-byte aligned slices, j == 1 misaligned ones.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + j])[j .. dim + j];
            T[] b = (new T[dim + j])[j .. dim + j];
            T[] c = (new T[dim + j])[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Save a's original contents in b, multiply in place, then
            // compare against the scalar reference element by element.
            b[] = a[];
            a[] *= 6;

            foreach (i; 0 .. dim)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
2132 | |
2133 | |
2134 /* ======================================================================== */ | |
2135 | |
2136 /*********************** | |
2137 * Computes: | |
2138 * a[] *= b[] | |
2139 */ | |
2140 | |
/// wchar ('u') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
T[] _arraySliceSliceMulass_u(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}
2145 | |
/// ushort ('t') variant: the low 16 bits of a product are sign-agnostic,
/// so this simply forwards to the short implementation.
T[] _arraySliceSliceMulass_t(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}
2150 | |
/**
 * Computes:
 *      a[] *= b[]
 * for short elements, using an SSE2 loop (16 elements per iteration) or an
 * MMX loop (8 elements per iteration) when available; leftover elements,
 * and the whole slice on other CPUs, go through the scalar tail loop.
 * pmullw keeps only the low 16 bits of each product, matching 16-bit
 * wraparound of the scalar *= .
 *
 * Params:
 *      a = slice updated in place
 *      b = slice of multipliers; same length as a and disjoint from it
 * Returns:
 *      a
 */
T[] _arraySliceSliceMulass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2519% faster
        if (sse2() && a.length >= 16)
        {
            // End of the portion processed 16 elements (32 bytes) at a time.
            auto n = aptr + (a.length & ~15);

            // The aligned loop requires BOTH pointers to be 16-byte aligned.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqu [ESI -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1712% faster
        if (mmx() && a.length >= 8)
        {
            // End of the portion processed 8 elements (16 bytes) at a time.
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar loop: remainder elements, or the whole slice when no SIMD path ran.
    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
2265 | |
unittest
{
    printf("_arraySliceSliceMulass_s unittest\n");

    // One pass per simulated CPU capability level so every dispatch path
    // (scalar, MMX, SSE, SSE2) in the implementation is exercised.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0 tests 16-byte aligned slices, j == 1 misaligned ones.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = (new T[dim + j])[j .. dim + j];
            T[] b = (new T[dim + j])[j .. dim + j];
            T[] c = (new T[dim + j])[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Save a's original contents in b, multiply in place, then
            // compare against the scalar reference element by element.
            b[] = a[];
            a[] *= c[];

            foreach (i; 0 .. dim)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}