Mercurial > projects > ldc
comparison druntime/src/compiler/ldc/arraydouble.d @ 1458:e0b2d67cfe7c
Added druntime (this should be removed once it works).
author | Robert Clipsham <robert@octarineparrot.com> |
---|---|
date | Tue, 02 Jun 2009 17:43:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1456:7b218ec1044f | 1458:e0b2d67cfe7c |
---|---|
1 /** | |
2 * Contains SSE2 and MMX versions of certain operations for double. | |
3 * | |
4 * Copyright: Copyright Digital Mars 2008 - 2009. | |
5 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>. | |
6 * Authors: Walter Bright, based on code originally written by Burton Radons | |
7 * | |
8 * Copyright Digital Mars 2008 - 2009. | |
9 * Distributed under the Boost Software License, Version 1.0. | |
10 * (See accompanying file LICENSE_1_0.txt or copy at | |
11 * http://www.boost.org/LICENSE_1_0.txt) | |
12 */ | |
13 module rt.arraydouble; | |
14 | |
15 private import rt.util.cpuid; | |
16 | |
version (unittest)
{
    private import core.stdc.stdio : printf;
    /* This is so unit tests will test every CPU variant
     */
    // cpuid is stepped 0 .. CPUID_MAX-1 by each unittest; each feature
    // predicate below fires only on its own step AND only when the host
    // CPU really has the feature, so every dispatch path gets exercised
    // (step 0 always falls back to the plain D loop).
    int cpuid;
    const int CPUID_MAX = 5;
    bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); }
    bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); }
    bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); }
}
else
{
    // Production build: query the real CPU feature flags directly.
    alias rt.util.cpuid.mmx mmx;
    alias rt.util.cpuid.sse sse;
    alias rt.util.cpuid.sse2 sse2;
    alias rt.util.cpuid.amd3dnow amd3dnow;
}
36 | |
37 //version = log; | |
38 | |
/**
 * Returns true when slices a and b share no memory
 * (one ends at or before the other begins).
 */
bool disjoint(T)(T[] a, T[] b)
{
    auto aEnd = a.ptr + a.length;
    auto bEnd = b.ptr + b.length;
    return aEnd <= b.ptr || bEnd <= a.ptr;
}
43 | |
44 /* Performance figures measured by Burton Radons | |
45 */ | |
46 | |
47 alias double T; | |
48 | |
49 extern (C): | |
50 | |
51 /* ======================================================================== */ | |
52 | |
53 /*********************** | |
54 * Computes: | |
55 * a[] = b[] + c[] | |
56 */ | |
57 | |
T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    // Computes a[] = b[] + c[] (note the c/b parameter order in the signature).
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 333% faster
        if (sse2() && b.length >= 16)
        {
            // Vector loop stops at the last multiple of 16 elements; each
            // asm iteration consumes 8 doubles (64 bytes), so the bound is
            // a whole number of iterations past aptr.
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                add ECX, 64;
                // ESI was advanced before the stores, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}
125 | |
126 | |
unittest
{
    printf("_arraySliceSliceAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
163 | |
164 /* ======================================================================== */ | |
165 | |
166 /*********************** | |
167 * Computes: | |
168 * a[] = b[] - c[] | |
169 */ | |
170 | |
T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    // Computes a[] = b[] - c[] (note the c/b parameter order in the signature).
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 324% faster
        if (sse2() && b.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (b.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                add ECX, 64;
                // ESI was advanced before the stores, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}
238 | |
239 | |
unittest
{
    printf("_arraySliceSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
276 | |
277 | |
278 /* ======================================================================== */ | |
279 | |
280 /*********************** | |
281 * Computes: | |
282 * a[] = b[] + value | |
283 */ | |
284 | |
T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    // Computes a[] = b[] + value.
    //printf("_arraySliceExpAddSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0; // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                // ESI was advanced at loop top, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}
344 | |
unittest
{
    printf("_arraySliceExpAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
381 | |
382 /* ======================================================================== */ | |
383 | |
384 /*********************** | |
385 * Computes: | |
386 * a[] += value | |
387 */ | |
388 | |
/**
 * Computes a[] += value.
 * Returns a for chaining; the scalar tail loop handles any elements the
 * vector loop did not cover.
 */
T[] _arrayExpSliceAddass_d(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 114% faster
        if (sse2() && a.length >= 8)
        {
            // The asm loop consumes 8 doubles (64 bytes) per iteration, so
            // stop at the last whole multiple of 8 elements past aptr.
            // BUG FIX: the old bound `cast(T*)((cast(uint)aend) & ~7)` only
            // aligned the end ADDRESS to 8 bytes, so for lengths that are
            // not a multiple of 8 the loop read and wrote up to 56 bytes
            // past the end of the array (and cast(uint) truncated pointers).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0; // broadcast value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                // ESI was advanced above, hence the -64 bias on the stores.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI; // tail loop resumes where the vector loop stopped
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
439 | |
unittest
{
    printf("_arrayExpSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[]; // keep the pre-image for checking
            c[] += 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
477 | |
478 /* ======================================================================== */ | |
479 | |
480 /*********************** | |
481 * Computes: | |
482 * a[] += b[] | |
483 */ | |
484 | |
T[] _arraySliceSliceAddass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    // Computes a[] += b[].
    //printf("_arraySliceSliceAddass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                // ESI was advanced before the stores, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
546 | |
unittest
{
    printf("_arraySliceSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[]; // keep the pre-image for checking
            c[] += b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
584 | |
585 /* ======================================================================== */ | |
586 | |
587 /*********************** | |
588 * Computes: | |
589 * a[] = b[] - value | |
590 */ | |
591 | |
T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    // Computes a[] = b[] - value.
    //printf("_arraySliceExpMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0; // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                // ESI was advanced at loop top, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}
651 | |
unittest
{
    printf("_arraySliceExpMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
688 | |
689 /* ======================================================================== */ | |
690 | |
691 /*********************** | |
692 * Computes: | |
693 * a[] = value - b[] | |
694 */ | |
695 | |
T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    // Computes a[] = value - b[].
    //printf("_arrayExpSliceMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 66% faster
        if (sse2() && a.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0; // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                // subpd overwrites its first operand, so fresh copies of the
                // broadcast value are made each time it is consumed.
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM5, XMM0;
                subpd XMM6, XMM1;
                // ESI was advanced at loop top, hence the -64 bias.
                movupd [ESI+ 0-64], XMM5;
                movupd [ESI+16-64], XMM6;
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                subpd XMM5, XMM2;
                subpd XMM6, XMM3;
                movupd [ESI+32-64], XMM5;
                movupd [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}
759 | |
unittest
{
    printf("_arrayExpSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
796 | |
797 /* ======================================================================== */ | |
798 | |
799 /*********************** | |
800 * Computes: | |
801 * a[] -= value | |
802 */ | |
803 | |
/**
 * Computes a[] -= value.
 * Returns a for chaining; the scalar tail loop handles any elements the
 * vector loop did not cover.
 */
T[] _arrayExpSliceMinass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 115% faster
        if (sse2() && a.length >= 8)
        {
            // The asm loop consumes 8 doubles (64 bytes) per iteration, so
            // stop at the last whole multiple of 8 elements past aptr.
            // BUG FIX: the old bound `cast(T*)((cast(uint)aend) & ~7)` only
            // aligned the end ADDRESS to 8 bytes, so for lengths that are
            // not a multiple of 8 the loop read and wrote up to 56 bytes
            // past the end of the array (and cast(uint) truncated pointers).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0; // broadcast value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                // ESI was advanced above, hence the -64 bias on the stores.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI; // tail loop resumes where the vector loop stopped
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
854 | |
unittest
{
    printf("_arrayExpSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[]; // keep the pre-image for checking
            c[] -= 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
892 | |
893 /* ======================================================================== */ | |
894 | |
895 /*********************** | |
896 * Computes: | |
897 * a[] -= b[] | |
898 */ | |
899 | |
T[] _arraySliceSliceMinass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    // Computes a[] -= b[].
    //printf("_arraySliceSliceMinass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                // ESI was advanced before the stores, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
961 | |
unittest
{
    // BUG FIX: this test previously announced itself as
    // "_arrayExpSliceMinass_d" and ran `c[] -= 6`, which exercises the
    // scalar -= overload again and never tests _arraySliceSliceMinass_d.
    // It now exercises the slice -= slice path it was written for.
    printf("_arraySliceSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j]; // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[]; // keep the pre-image for checking
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
999 | |
1000 /* ======================================================================== */ | |
1001 | |
1002 /*********************** | |
1003 * Computes: | |
1004 * a[] = b[] * value | |
1005 */ | |
1006 | |
T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    // Computes a[] = b[] * value.
    //printf("_arraySliceExpMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 304% faster
        if (sse2() && a.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0; // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                // ESI was advanced at loop top, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}
1066 | |
unittest
{
    printf("_arraySliceExpMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1103 | |
1104 /* ======================================================================== */ | |
1105 | |
1106 /*********************** | |
1107 * Computes: | |
1108 * a[] = b[] * c[] | |
1109 */ | |
1110 | |
T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    // Computes a[] = b[] * c[] (note the c/b parameter order in the signature).
    //printf("_arraySliceSliceMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 329% faster
        if (sse2() && a.length >= 8)
        {
            // Stop at the last multiple of 8 elements; each asm iteration
            // consumes 8 doubles (64 bytes).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                add ECX, 64;
                // ESI was advanced before the stores, hence the -64 bias.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                // Write the advanced pointers back for the scalar tail loop.
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}
1178 | |
unittest
{
    printf("_arraySliceSliceMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Pass 0 uses the allocator's aligned arrays; pass 1 slices one
        // element in to force the unaligned code path.
        foreach (j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            foreach (i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
1215 | |
1216 /* ======================================================================== */ | |
1217 | |
1218 /*********************** | |
1219 * Computes: | |
1220 * a[] *= value | |
1221 */ | |
1222 | |
/**
 * Computes a[] *= value.
 * Returns a for chaining; the scalar tail loop handles any elements the
 * vector loop did not cover.
 */
T[] _arrayExpSliceMulass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 109% faster
        if (sse2() && a.length >= 8)
        {
            // The asm loop consumes 8 doubles (64 bytes) per iteration, so
            // stop at the last whole multiple of 8 elements past aptr.
            // BUG FIX: the old bound `cast(T*)((cast(uint)aend) & ~7)` only
            // aligned the end ADDRESS to 8 bytes, so for lengths that are
            // not a multiple of 8 the loop read and wrote up to 56 bytes
            // past the end of the array (and cast(uint) truncated pointers).
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0; // broadcast value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                // ESI was advanced above, hence the -64 bias on the stores.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI; // tail loop resumes where the vector loop stopped
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
1273 | |
unittest
{
    printf("_arrayExpSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned data; pass 1 slices at an offset to
        // deliberately misalign it.
        foreach (int j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            a = a[j .. dim + j];
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // snapshot of c before the in-place multiply
            c[] *= 6;       // exercises _arrayExpSliceMulass_d

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1311 | |
/* ======================================================================== */
1313 | |
/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 205% faster
        if (sse2() && a.length >= 8)
        {
            // n = end of the largest multiple of 8 elements; the scalar
            // loop at the bottom finishes the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                // 8 doubles (64 bytes) per pass, two per XMM register.
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                // ESI was advanced up front, hence the -64 offsets.
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    // Scalar loop for the remainder (or everything when SSE2 is unavailable).
    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
1380 | |
unittest
{
    // BUG FIX: this unittest previously announced itself as
    // "_arrayExpSliceMulass_d unittest" and exercised c[] *= 6, duplicating
    // the previous test and never calling the slice *= slice routine it
    // follows.  It now actually exercises _arraySliceSliceMulass_d.
    printf("_arraySliceSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];      // keep original values of c for the check
            c[] *= a[];     // exercises _arraySliceSliceMulass_d(c, a)

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] * a[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1418 | |
/* ======================================================================== */
1420 | |
/***********************
 * Computes:
 *      a[] = b[] / value
 */

T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 299% faster
        if (sse2() && a.length >= 8)
        {
            // n = end of the largest multiple of 8 elements; the scalar
            // loop at the bottom finishes the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast recip into both lanes

                align 8;
            startsseloop:
                // ESI is advanced up front; the stores compensate with -64.
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // NOTE(review): the SSE2 path above multiplies by the reciprocal while
    // this remainder loop performs a true division, so the trailing
    // (length % 8) elements may round differently from the rest of the
    // array — confirm this mixed precision is intended.
    while (aptr < aend)
    {
        *aptr++ = *bptr++ / value;
        //*aptr++ = *bptr++ * recip;
    }

    return a;
}
1499 | |
unittest
{
    printf("_arraySliceExpDivSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // Pass 0 uses 16-byte-aligned data; pass 1 slices at an offset to
        // deliberately misalign it.
        foreach (int j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            a = a[j .. dim + j];
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Division by 8 is exact in binary floating point, so the
            // reciprocal-based SSE2 path and the scalar path must agree.
            c[] = a[] / 8;

            foreach (int i; 0 .. dim)
            {
                //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1537 | |
/* ======================================================================== */
1539 | |
/***********************
 * Computes:
 *      a[] /= value
 */

T[] _arrayExpSliceDivass_d(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 65% faster
        if (sse2() && a.length >= 8)
        {
            // n = end of the largest multiple of 8 elements; the scalar
            // loop at the bottom finishes the remainder.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast recip into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    // Remainder loop; multiplies by the same reciprocal as the SSE2 path,
    // so every element rounds identically.
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}
1605 | |
1606 | |
unittest
{
    printf("_arrayExpSliceDivass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // One spare element lets pass 1 slice at an offset, making the
        // data deliberately misaligned.
        foreach (int j; 0 .. 2)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            a = a[j .. dim + j];
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // snapshot before the in-place divide
            c[] /= 8;       // exercises _arrayExpSliceDivass_d

            foreach (int i; 0 .. dim)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
1644 | |
1645 | |
/* ======================================================================== */
1647 | |
/***********************
 * Computes:
 *      a[] -= b[] * value
 */

T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
{
    // a[] -= b[] * value  is equivalent to  a[] += b[] * (-value),
    // so forward to the add-assign implementation with the sign flipped.
    return _arraySliceExpMulSliceAddass_d(a, -value, b);
}
1657 | |
/***********************
 * Computes:
 *      a[] += b[] * value
 */

T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    // No vectorized (SSE2/MMX) path has been implemented for this
    // operation; the whole array is processed by this scalar loop.
    // (The previous "Handle remainder" comment here was misleading —
    // there is no SIMD part for it to be the remainder of.)
    while (aptr < aend)
        *aptr++ += *bptr++ * value;

    return a;
}
1681 | |
unittest
{
    printf("_arraySliceExpMulSliceAddass_d unittest\n");

    // Only a single cpuid setting and a single aligned pass are exercised
    // here, matching the original test's coverage.
    cpuid = 1;
    {
        version (log) printf("    cpuid %d\n", cpuid);

        foreach (int j; 0 .. 1)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            a = a[j .. dim + j];
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            foreach (int i; 0 .. dim)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];          // snapshot before the fused multiply-add
            c[] += a[] * 6;     // exercises _arraySliceExpMulSliceAddass_d

            foreach (int i; 0 .. dim)
            {
                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                {
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}