diff druntime/src/compiler/dmd/arraydouble.d @ 759:d3eb054172f9
Added copy of druntime from DMD 2.020 modified for LDC.
author: Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date: Tue, 11 Nov 2008 01:52:37 +0100
parents: (none)
children: (none)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/arraydouble.d Tue Nov 11 01:52:37 2008 +0100 @@ -0,0 +1,1714 @@ +/*************************** + * D programming language http://www.digitalmars.com/d/ + * Runtime support for double array operations. + * Based on code originally written by Burton Radons. + * Placed in public domain. + */ + +module rt.arraydouble; + +private import util.cpuid; + +version (Unittest) +{ + /* This is so unit tests will test every CPU variant + */ + int cpuid; + const int CPUID_MAX = 5; + bool mmx() { return cpuid == 1 && util.cpuid.mmx(); } + bool sse() { return cpuid == 2 && util.cpuid.sse(); } + bool sse2() { return cpuid == 3 && util.cpuid.sse2(); } + bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); } +} +else +{ + alias util.cpuid.mmx mmx; + alias util.cpuid.sse sse; + alias util.cpuid.sse2 sse2; + alias util.cpuid.amd3dnow amd3dnow; +} + +//version = log; + +bool disjoint(T)(T[] a, T[] b) +{ + return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); +} + +/* Performance figures measured by Burton Radons + */ + +alias double T; + +extern (C): + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + c[] + */ + +T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 333% faster + if (sse2() && b.length >= 16) + { + auto n = aptr + (b.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ESI, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM5; + addpd XMM2, XMM6; + addpd XMM3, XMM7; + add ECX, 64; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + while (aptr < aend) + *aptr++ = *bptr++ + *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 324% faster + if (sse2() && b.length >= 8) + { + auto n = aptr + (b.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ESI, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM5; + subpd XMM2, XMM6; + subpd XMM3, XMM7; + add ECX, 64; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + while (aptr < aend) + *aptr++ = *bptr++ - *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + value + */ + +T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpAddSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 305% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM4; + addpd XMM2, XMM4; + addpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ + value; + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value + */ + +T[] _arrayExpSliceAddass_d(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 version is 114% faster + if (sse2() && a.length >= 8) + { + auto n = cast(T*)((cast(uint)aend) & ~7); + if (aptr < n) + + // Unaligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloopa: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM4; + addpd XMM2, XMM4; + addpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_d(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 183% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ECX, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM5; + addpd XMM2, XMM6; + addpd XMM3, XMM7; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 305% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM4; + subpd XMM2, XMM4; + subpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 66% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movapd XMM5, XMM4; + movapd XMM6, XMM4; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + subpd XMM5, XMM0; + subpd XMM6, XMM1; + movupd [ESI+ 0-64], XMM5; + movupd [ESI+16-64], XMM6; + movapd XMM5, XMM4; + movapd XMM6, XMM4; + subpd XMM5, XMM2; + subpd XMM6, XMM3; + movupd [ESI+32-64], XMM5; + movupd [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = value - *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = 6 - a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - a[i])) + { + printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value + */ + +T[] _arrayExpSliceMinass_d(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 version is 115% faster + if (sse2() && a.length >= 8) + { + auto n = cast(T*)((cast(uint)aend) & ~7); + if (aptr < n) + + // Unaligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloopa: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM4; + subpd XMM2, XMM4; + subpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_d(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 183% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ECX, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM5; + subpd XMM2, XMM6; + subpd XMM3, XMM7; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * value + */ + +T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMulSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 304% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM4; + mulpd XMM2, XMM4; + mulpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 329% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add EAX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM5; + mulpd XMM2, XMM6; + mulpd XMM3, XMM7; + add ECX, 64; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= value + */ + +T[] _arrayExpSliceMulass_d(T[] a, T value) +{ + //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 version is 109% faster + if (sse2() && a.length >= 8) + { + auto n = cast(T*)((cast(uint)aend) & ~7); + if (aptr < n) + + // Unaligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloopa: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM4; + mulpd XMM2, XMM4; + mulpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMulass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_d(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 205% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ECX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM5; + mulpd XMM2, XMM6; + mulpd XMM3, XMM7; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMulass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] / value + */ + +T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpDivSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. 
+ */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE2 version is 299% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, recip; + //movsd XMM4, value + //rcpsd XMM4, XMM4 + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM4; + mulpd XMM2, XMM4; + mulpd XMM3, XMM4; + //divpd XMM0, XMM4; + //divpd XMM1, XMM4; + //divpd XMM2, XMM4; + //divpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + { + *aptr++ = *bptr++ / value; + //*aptr++ = *bptr++ * recip; + } + + return a; +} + +unittest +{ + printf("_arraySliceExpDivSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] / 8; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]); + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] /= value + */ + +T[] _arrayExpSliceDivass_d(T[] a, T value) +{ + //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. + */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE2 version is 65% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movsd XMM4, recip; + //movsd XMM4, value + //rcpsd XMM4, XMM4 + shufpd XMM4, XMM4, 0; + + align 8; + startsseloopa: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM4; + mulpd XMM2, XMM4; + mulpd XMM3, XMM4; + //divpd XMM0, XMM4; + //divpd XMM1, XMM4; + //divpd XMM2, XMM4; + //divpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= recip; + + return a; +} + + +unittest +{ + printf("_arrayExpSliceDivass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] /= 8; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] * value + */ + +T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAddass_d(a, -value, b); +} + +/*********************** + * Computes: + * a[] += b[] * value + */ + +T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + // Handle remainder + while (aptr < aend) + *aptr++ += *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAddass_d unittest\n"); + + cpuid = 1; + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 1; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[] * 6; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + if (c[i] != cast(T)(b[i] + a[i] * 6)) + { + printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +}
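
For context, user code never calls these helpers directly: the compiler lowers array-wise expressions on double[] into calls to the entry points defined above (for example, a[] = b[] + c[] becomes a call to _arraySliceSliceAddSliceAssign_d). The following is a minimal sketch of ordinary D code that would exercise several of these paths; the mapping in the comments follows the helper names in this file, but the exact lowering is left to the compiler.

// Hedged sketch: array-wise expressions that the compiler rewrites into the
// runtime helpers defined in rt.arraydouble (helper names per this file).
import std.stdio;

void main()
{
    auto a = new double[67];
    auto b = new double[67];
    auto c = new double[67];

    foreach (i, ref x; b) x = i;        // b = 0, 1, 2, ...
    foreach (i, ref x; c) x = i * 2.0;  // c = 0, 2, 4, ...

    a[] = b[] + c[];   // slice + slice   -> _arraySliceSliceAddSliceAssign_d
    a[] += 6.0;        // slice += value  -> _arrayExpSliceAddass_d
    a[] = b[] * 3.0;   // slice * value   -> _arraySliceExpMulSliceAssign_d
    a[] /= 8.0;        // slice /= value  -> _arrayExpSliceDivass_d

    writefln("%s", a[0 .. 4]);
}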