diff druntime/src/compiler/dmd/arraybyte.d @ 759:d3eb054172f9
Added copy of druntime from DMD 2.020 modified for LDC.
author      Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date        Tue, 11 Nov 2008 01:52:37 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/dmd/arraybyte.d	Tue Nov 11 01:52:37 2008 +0100
@@ -0,0 +1,1890 @@

/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for byte array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for char, byte,
 * and ubyte ('a', 'g' and 'h' suffixes).
 */

module rt.arraybyte;

import util.cpuid;

version (Unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

alias byte T;

extern (C):
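/* A minimal sketch of how these entry points are reached, assuming the
 * compiler's usual lowering of byte-array vector expressions onto the
 * '_g' (byte) hooks below; illustration only, kept under version (none):
 */
version (none) unittest
{
    byte[] dst = new byte[67];
    byte[] src = new byte[67];

    // The compiler rewrites this vector expression into a call roughly
    // equivalent to: _arraySliceExpAddSliceAssign_g(dst, 3, src);
    dst[] = src[] + 3;
}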
/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1088% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1000% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startaddmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat normal 32-bit cpu the same way as we do
         * the SIMD units, with unrolled asm.  There aren't enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startadd386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                add BL, CL;
                add BH, CL;
                add DL, CL;
                add DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startadd386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}
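/* The scalar operand above is splatted into a vector register before the
 * unrolled loops: the byte is widened to a 32-bit pattern, then pshufd
 * (pshufw for the 16-bit MMX pattern) replicates it through XMM4 (MM4),
 * so a single paddb adds the value to 16 (8) bytes at a time.  A sanity
 * check of that widening arithmetic, as an illustration under
 * version (none):
 */
version (none) unittest
{
    byte value = 5;
    uint l = cast(ubyte) value;
    l |= (l << 8);      // 0x0505
    l |= (l << 16);     // 0x05050505
    assert(l == 0x05050505);
}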
unittest
{
    printf("_arraySliceExpAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5739% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                version (log) printf("\tsse2 aligned\n");
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startaddlmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddlmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_g(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1578% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1721% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startaddassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}
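/* Note on the dispatch pattern used throughout this module: movdqa traps
 * unless its memory operand is 16-byte aligned, so each SSE2 path first
 * tests the low four bits of every array pointer involved and falls back
 * to the movdqu variant otherwise.  For the in-place ('addass'/'minass')
 * operations only aptr needs the test, since it is the sole array operand.
 */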
unittest
{
    printf("_arrayExpSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4727% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3059% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startaddasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
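/* paddb and psubb wrap modulo 256 in each byte lane, which matches D's
 * byte/ubyte arithmetic, e.g. cast(ubyte)(200 + 100) is 44; the SSE2,
 * MMX, x86 and scalar tail paths therefore agree bit for bit.  The
 * saturating forms (paddsb/psubsb) would give different results.
 */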
/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1189% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1079% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat normal 32-bit cpu the same way as we do
         * the SIMD units, with unrolled asm.  There aren't enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startsub386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                sub BL, CL;
                sub BH, CL;
                sub DL, CL;
                sub DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startsub386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = b[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 8748% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2u:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqu [ESI   -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2a:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqa [ESI   -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 7397% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubrmmx:
                add ESI, 32;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                psubb MM5, MM0;
                psubb MM6, MM1;
                movq [ESI   -32], MM5;
                movq [ESI+8 -32], MM6;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM5, MM2;
                psubb MM6, MM3;
                movq [ESI+16-32], MM5;
                movq [ESI+24-32], MM6;
                cmp ESI, EDI;
                jb startsubrmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = 6 - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - b[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
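/* In the reversed form above (a[] = value - b[]) the array is the
 * subtrahend, so the splatted constant in XMM4/MM4 is copied into scratch
 * registers (XMM5/XMM6, MM5/MM6) on every iteration: psubb overwrites its
 * first operand with the result, and the splat must survive for the next
 * block.
 */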
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5756% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 8;
            startsublmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
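/* Each MMX loop ends with emms because the MM registers alias the x87
 * floating point stack; skipping it would leave the FPU in an unusable
 * state for any floating point code that runs afterwards.  The SSE2
 * paths need no such epilogue.
 */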
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3107% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startsubasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}