Mercurial > projects > ldc
view druntime/src/compiler/dmd/arraydouble.d @ 759:d3eb054172f9
Added copy of druntime from DMD 2.020 modified for LDC.
author | Tomas Lindquist Olsen <tomas.l.olsen@gmail.com> |
---|---|
date | Tue, 11 Nov 2008 01:52:37 +0100 |
parents | |
children |
line wrap: on
line source
/***************************
 * D programming language
 * http://www.digitalmars.com/d/
 * Runtime support for double array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */
module rt.arraydouble;

private import util.cpuid;

version (Unittest)
{
    /* This is so unit tests will test every CPU variant */
    int cpuid;
    const int CPUID_MAX = 5;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}

//version = log;

/* True when slices a and b do not overlap in memory.
 * Used by the in-contracts below: the vector loops assume no aliasing. */
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

/* Performance figures measured by Burton Radons */

alias double T;

extern (C):

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 * Note the argument order (a, c, b): the compiler passes the rightmost
 * operand first per the array-op runtime ABI; the remainder loop below
 * shows the actual pairing (*aptr = *bptr + *cptr).
 */

T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 333% faster
        if (sse2() && b.length >= 16)
        {
            // n = end of the vectorized region: length rounded down to a
            // multiple of 16 doubles (the loop consumes 8 doubles = 64 bytes
            // per iteration, so any multiple of 8 would do).
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}


unittest
{
    printf("_arraySliceSliceAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 324% faster
        if (sse2() && b.length >= 8)
        {
            auto n = aptr + (b.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}


unittest
{
    printf("_arraySliceSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;      // broadcast the scalar operand
                shufpd XMM4, XMM4, 0;   // into both lanes of XMM4

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_d(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 114% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: the previous code computed the loop bound by masking
            // the END ADDRESS (cast(uint)aend & ~7), which only byte-aligns
            // the pointer.  The asm loop advances 64 bytes per iteration and
            // runs while ESI < n, so for lengths that are not a multiple of
            // 8 doubles it stored up to 56 bytes past the end of the array.
            // Round the element COUNT down to a multiple of 8 doubles
            // (64 bytes) instead, as the other routines in this module do.
            // a.length >= 8 guarantees n > aptr, so no extra guard is needed.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 66% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                // value - b[] is not commutative, so copy the broadcast
                // value into scratch registers and subtract the loads.
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM5, XMM0;
                subpd XMM6, XMM1;
                movupd [ESI+ 0-64], XMM5;
                movupd [ESI+16-64], XMM6;
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                subpd XMM5, XMM2;
                subpd XMM6, XMM3;
                movupd [ESI+32-64], XMM5;
                movupd [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 115% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: was `cast(T*)((cast(uint)aend) & ~7)`, which masks the
            // end address rather than rounding the element count, letting the
            // 64-byte-per-iteration loop store past the end of the array.
            // See _arrayExpSliceAddass_d for details.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    // BUG FIX: this unittest previously printed "_arrayExpSliceMinass_d"
    // and exercised `c[] -= 6`, so it never called the slice-slice routine
    // it belongs to.  It now actually tests a[] -= b[].
    printf("_arraySliceSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 304% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 329% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 109% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: was `cast(T*)((cast(uint)aend) & ~7)`, which masks the
            // end address rather than rounding the element count, letting the
            // 64-byte-per-iteration loop store past the end of the array.
            // See _arrayExpSliceAddass_d for details.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 205% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    // BUG FIX: this unittest previously printed "_arrayExpSliceMulass_d"
    // and exercised `c[] *= 6`, so it never called the slice-slice routine
    // it belongs to.  It now actually tests a[] *= b[].
    printf("_arraySliceSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] *= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] / value
 */

T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 299% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // NOTE(review): the SSE2 path multiplies by the reciprocal while this
    // scalar remainder divides, so for divisors whose reciprocal is inexact
    // the last few elements can differ by 1 ulp from the vectorized ones.
    // Preserved as-is from the original; the unittest divides by 8, which
    // is exact either way.
    while (aptr < aend)
    {
        *aptr++ = *bptr++ / value;
        //*aptr++ = *bptr++ * recip;
    }

    return a;
}

unittest
{
    printf("_arraySliceExpDivSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] / 8;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] /= value
 */

T[] _arrayExpSliceDivass_d(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 65% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    // Remainder multiplies by recip, consistent with the SSE2 path above.
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}

unittest
{
    printf("_arrayExpSliceDivass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] /= 8;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[] * value
 * Implemented as an add of the negated scalar.
 */

T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAddass_d(a, -value, b);
}

/***********************
 * Computes:
 *      a[] += b[] * value
 */

T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    // No vectorized path for this operation; scalar loop only.
    // Handle remainder
    while (aptr < aend)
        *aptr++ += *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAddass_d unittest\n");

    cpuid = 1;
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 1; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                {
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}