Mercurial > projects > ldc
comparison druntime/src/compiler/dmd/arraydouble.d @ 759:d3eb054172f9
Added copy of druntime from DMD 2.020 modified for LDC.
author | Tomas Lindquist Olsen <tomas.l.olsen@gmail.com> |
---|---|
date | Tue, 11 Nov 2008 01:52:37 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
758:f04dde6e882c | 759:d3eb054172f9 |
---|---|
/***************************
 * D programming language - http://www.digitalmars.com/d/
 * Runtime support for double[] array operations.
 * Based on code originally written by Burton Radons.
 * Placed in the public domain.
 */
7 | |
module rt.arraydouble;

private import util.cpuid;

version (Unittest)
{
    /* For unit tests: pretend each CPU variant in turn is the best
     * available, so every code path gets exercised.
     */
    int cpuid;
    const int CPUID_MAX = 5;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}

//version = log;

/// True when slices a and b share no memory.
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

/* Performance figures measured by Burton Radons
 */

alias double T;

extern (C):
44 | |
45 /* ======================================================================== */ | |
46 | |
/***********************
 * Elementwise add of two slices:
 *      a[] = b[] + c[]
 * (Parameter order follows the compiler's array-op calling convention.)
 */

T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 333% faster
        if (sse2() && b.length >= 16)
        {
            // Stop pointer: largest multiple of 16 elements.
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++, cptr++)
        *aptr = *bptr + *cptr;

    return a;
}
119 | |
120 | |
unittest
{
    printf("_arraySliceSliceAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
157 | |
158 /* ======================================================================== */ | |
159 | |
/***********************
 * Elementwise subtract of two slices:
 *      a[] = b[] - c[]
 * (Parameter order follows the compiler's array-op calling convention.)
 */

T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 324% faster
        if (sse2() && b.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (b.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++, cptr++)
        *aptr = *bptr - *cptr;

    return a;
}
232 | |
233 | |
unittest
{
    printf("_arraySliceSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
270 | |
271 | |
272 /* ======================================================================== */ | |
273 | |
/***********************
 * Slice plus scalar:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++)
        *aptr = *bptr + value;

    return a;
}
338 | |
unittest
{
    printf("_arraySliceExpAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
375 | |
376 /* ======================================================================== */ | |
377 | |
/***********************
 * In-place add of a scalar:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_d(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 114% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: the old code computed
            //     auto n = cast(T*)((cast(uint)aend) & ~7);
            // which rounds the *address* down to an 8-byte boundary - a
            // no-op for 8-byte-aligned doubles.  Since the asm loop below
            // advances 64 bytes (8 elements) per iteration and only tests
            // "jb n", it could store up to 7 doubles past the end of the
            // array whenever a.length was not a multiple of 8.  Compute
            // the stop pointer from the element count instead, exactly as
            // the other routines in this module do.  With a.length >= 8
            // the loop is guaranteed at least one iteration, so the old
            // "if (aptr < n)" guard is no longer needed.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
433 | |
unittest
{
    printf("_arrayExpSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];  // keep a copy of the original values
            c[] += 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
471 | |
472 /* ======================================================================== */ | |
473 | |
/***********************
 * In-place elementwise add:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++)
        *aptr += *bptr;

    return a;
}
540 | |
unittest
{
    printf("_arraySliceSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];  // keep a copy of the original values
            c[] += b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
578 | |
579 /* ======================================================================== */ | |
580 | |
/***********************
 * Slice minus scalar:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++)
        *aptr = *bptr - value;

    return a;
}
645 | |
unittest
{
    printf("_arraySliceExpMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
682 | |
683 /* ======================================================================== */ | |
684 | |
/***********************
 * Scalar minus slice:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 66% faster
        if (sse2() && a.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (a.length & ~7);

            // Unaligned case.  Subtraction is not commutative, so the
            // broadcast value in XMM4 is copied to scratch registers and
            // the array elements are subtracted from the copies.
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM5, XMM0;
                subpd XMM6, XMM1;
                movupd [ESI+ 0-64], XMM5;
                movupd [ESI+16-64], XMM6;
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                subpd XMM5, XMM2;
                subpd XMM6, XMM3;
                movupd [ESI+32-64], XMM5;
                movupd [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++)
        *aptr = value - *bptr;

    return a;
}
753 | |
unittest
{
    printf("_arrayExpSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
790 | |
791 /* ======================================================================== */ | |
792 | |
/***********************
 * In-place subtract of a scalar:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 115% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: the old code computed
            //     auto n = cast(T*)((cast(uint)aend) & ~7);
            // which rounds the *address* down to an 8-byte boundary - a
            // no-op for 8-byte-aligned doubles.  The asm loop below
            // advances 64 bytes (8 elements) per iteration and only tests
            // "jb n", so it could store up to 7 doubles past the end of
            // the array whenever a.length was not a multiple of 8.
            // Compute the stop pointer from the element count instead,
            // as the other routines in this module do; a.length >= 8
            // guarantees at least one iteration, so no extra guard is
            // needed.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
848 | |
unittest
{
    printf("_arrayExpSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];  // keep a copy of the original values
            c[] -= 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
886 | |
887 /* ======================================================================== */ | |
888 | |
/***********************
 * In-place elementwise subtract:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++)
        *aptr -= *bptr;

    return a;
}
955 | |
unittest
{
    // BUG FIX: this test previously printed "_arrayExpSliceMinass_d
    // unittest" and ran "c[] -= 6", which exercises the scalar
    // _arrayExpSliceMinass_d routine (already tested above) and never
    // called _arraySliceSliceMinass_d at all.  It now subtracts a slice
    // so the slice-slice routine is actually exercised.
    printf("_arraySliceSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the original values
            c[] -= b[];     // exercises _arraySliceSliceMinass_d

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
993 | |
994 /* ======================================================================== */ | |
995 | |
/***********************
 * Slice times scalar:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 304% faster
        if (sse2() && a.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++)
        *aptr = *bptr * value;

    return a;
}
1060 | |
unittest
{
    printf("_arraySliceExpMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1097 | |
1098 /* ======================================================================== */ | |
1099 | |
/***********************
 * Elementwise multiply of two slices:
 *      a[] = b[] * c[]
 * (Parameter order follows the compiler's array-op calling convention.)
 */

T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 329% faster
        if (sse2() && a.length >= 8)
        {
            // Stop pointer: largest multiple of 8 elements.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    for (; aptr < aend; aptr++, bptr++, cptr++)
        *aptr = *bptr * *cptr;

    return a;
}
1172 | |
unittest
{
    printf("_arraySliceSliceMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // j == 0: slices as allocated; j == 1: deliberately misaligned by one.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // not a multiple of 8, so the scalar tail runs too
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1209 | |
1210 /* ======================================================================== */ | |
1211 | |
/***********************
 * In-place multiply by a scalar:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 109% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: the old code computed
            //     auto n = cast(T*)((cast(uint)aend) & ~7);
            // which rounds the *address* down to an 8-byte boundary - a
            // no-op for 8-byte-aligned doubles.  The asm loop below
            // advances 64 bytes (8 elements) per iteration and only tests
            // "jb n", so it could store up to 7 doubles past the end of
            // the array whenever a.length was not a multiple of 8.
            // Compute the stop pointer from the element count instead,
            // as the other routines in this module do; a.length >= 8
            // guarantees at least one iteration, so no extra guard is
            // needed.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    // Scalar loop for whatever the SIMD path did not cover.
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
1267 | |
unittest
{
    printf("_arrayExpSliceMulass_d unittest\n");

    // Run the test once per CPU dispatch variant; within each variant,
    // j == 0 uses naturally aligned arrays and j == 1 shifts every slice
    // by one element to exercise the unaligned path.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];    // misaligned when j == 1
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            a[] = c[];      // keep a copy of the original operand
            c[] *= 6;       // exercises _arrayExpSliceMulass_d

            for (int k = 0; k < dim; k++)
            {
                if (c[k] != cast(T)(a[k] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
1305 | |
1306 /* ======================================================================== */ | |
1307 | |
1308 /*********************** | |
1309 * Computes: | |
1310 * a[] *= b[] | |
1311 */ | |
1312 | |
T[] _arraySliceSliceMulass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_d()\n");
    // Element-wise in-place multiply: a[i] *= b[i] for all i.
    // Returns a.  The in-contract requires equal lengths and
    // non-overlapping slices.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 205% faster
        if (sse2() && a.length >= 8)
        {
            // n = end of the region handled 8 doubles (64 bytes) per
            // iteration; the remaining a.length % 8 elements fall through
            // to the scalar loop below.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                // Load 8 doubles from a[] and 8 from b[], multiply
                // pairwise, store back into a[].  movupd is used so
                // neither pointer needs 16-byte alignment.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Publish both cursors so the scalar tail loop resumes
                // exactly where the vector loop stopped.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar loop: handles the tail, and the whole array when SSE2 is
    // unavailable or a.length < 8.
    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
1374 | |
unittest
{
    // BUGFIX: this unittest was a copy-paste of the _arrayExpSliceMulass_d
    // test — it printed the wrong name and tested the scalar form
    // `c[] *= 6`, so _arraySliceSliceMulass_d was never exercised.
    // Now it tests `c[] *= a[]` against a saved copy of c.
    printf("_arraySliceSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];      // save original c so results can be checked
            c[] *= a[];     // invokes _arraySliceSliceMulass_d

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] * a[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1412 | |
1413 /* ======================================================================== */ | |
1414 | |
1415 /*********************** | |
1416 * Computes: | |
1417 * a[] = b[] / value | |
1418 */ | |
1419 | |
T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_d()\n");
    // Computes a[i] = b[i] / value for all i and returns a.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 299% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a.length rounded down to a multiple of
            // 8 doubles; the scalar loop below handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast recip into both lanes

                align 8;
            startsseloop:
                // ESI is pre-incremented; stores use negative offsets.
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // NOTE(review): the SSE2 path above multiplies by recip while this
    // scalar tail divides by value, so tail elements may differ from
    // vector-processed elements in the last bit for the same input.
    while (aptr < aend)
    {
        *aptr++ = *bptr++ / value;
        //*aptr++ = *bptr++ * recip;
    }

    return a;
}
1493 | |
unittest
{
    printf("_arraySliceExpDivSliceAssign_d unittest\n");

    // Exercise every CPU dispatch variant with an aligned (j == 0) and a
    // misaligned (j == 1) slice.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];    // misaligned when j == 1
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // 1/8 is exactly representable, so the reciprocal-multiply
            // fast path and plain division agree bit-for-bit here.
            c[] = a[] / 8;

            for (int k = 0; k < dim; k++)
            {
                if (c[k] != cast(T)(a[k] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
1531 | |
1532 /* ======================================================================== */ | |
1533 | |
1534 /*********************** | |
1535 * Computes: | |
1536 * a[] /= value | |
1537 */ | |
1538 | |
T[] _arrayExpSliceDivass_d(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    // In-place scalar divide: a[i] /= value for every element; returns a.
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 65% faster
        if (sse2() && a.length >= 8)
        {
            // Vector loop covers a.length rounded down to a multiple of
            // 8 doubles; the scalar loop below handles the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast recip into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;      // publish progress for the tail loop
            }
        }
    }

    // Scalar tail also multiplies by recip, so it is numerically
    // consistent with the SSE2 path above.
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}
1599 | |
1600 | |
unittest
{
    printf("_arrayExpSliceDivass_d unittest\n");

    // Exercise every CPU dispatch variant with an aligned (j == 0) and a
    // misaligned (j == 1) slice.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;

            T[] a = new T[dim + j];
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];
            a = a[j .. dim + j];    // misaligned when j == 1
            b = b[j .. dim + j];
            c = c[j .. dim + j];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            a[] = c[];      // keep a copy of the original operand
            c[] /= 8;       // exercises _arrayExpSliceDivass_d

            for (int k = 0; k < dim; k++)
            {
                if (c[k] != cast(T)(a[k] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
1638 | |
1639 | |
1640 /* ======================================================================== */ | |
1641 | |
1642 /*********************** | |
1643 * Computes: | |
1644 * a[] -= b[] * value | |
1645 */ | |
1646 | |
T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
{
    // a[] -= b[] * v  is exactly  a[] += b[] * (-v), so delegate to the
    // add-assign kernel with the scalar negated.
    T negated = -value;
    return _arraySliceExpMulSliceAddass_d(a, negated, b);
}
1651 | |
1652 /*********************** | |
1653 * Computes: | |
1654 * a[] += b[] * value | |
1655 */ | |
1656 | |
T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    // Fused multiply-add into the destination: a[i] += b[i] * value.
    // Plain scalar loop — no vectorized fast path exists for this
    // operation in this module.  Returns a.
    for (size_t i = 0; i < a.length; i++)
        a[i] += b[i] * value;

    return a;
}
1675 | |
unittest
{
    printf("_arraySliceExpMulSliceAddass_d unittest\n");

    // CONSISTENCY FIX: this test used to pin cpuid to 1 and run only the
    // aligned (j == 0) case, even though its comments claimed a second,
    // misaligned iteration.  Loop over all CPU variants and both
    // alignments like every other unittest in this module.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("  cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];          // save original c for checking
            c[] += a[] * 6;     // invokes _arraySliceExpMulSliceAddass_d

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                {
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}