diff druntime/src/compiler/dmd/arraydouble.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Tue, 11 Nov 2008 01:52:37 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/dmd/arraydouble.d	Tue Nov 11 01:52:37 2008 +0100
@@ -0,0 +1,1714 @@
+/***************************
+ * D programming language http://www.digitalmars.com/d/
+ * Runtime support for double array operations.
+ * Based on code originally written by Burton Radons.
+ * Placed in public domain.
+ */
+
+module rt.arraydouble;
+
+private import util.cpuid;
+
+version (Unittest)
+{
+    /* This is so unit tests will test every CPU variant
+     */
+    int cpuid;
+    const int CPUID_MAX = 5;
+    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
+    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
+    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
+    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
+}
+else
+{
+    alias util.cpuid.mmx mmx;
+    alias util.cpuid.sse sse;
+    alias util.cpuid.sse2 sse2;
+    alias util.cpuid.amd3dnow amd3dnow;
+}
+
+//version = log;
+
+bool disjoint(T)(T[] a, T[] b)
+{
+    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
+}
+
+/* Performance figures measured by Burton Radons
+ */
+
+alias double T;
+
+extern (C):
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] + c[]
+ */
+
+T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
+in
+{
+        assert(a.length == b.length && b.length == c.length);
+        assert(disjoint(a, b));
+        assert(disjoint(a, c));
+        assert(disjoint(b, c));
+}
+body
+{
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+    auto cptr = c.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 333% faster
+        if (sse2() && b.length >= 16)
+        {
+            auto n = aptr + (b.length & ~15);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr; // left operand
+                mov ECX, cptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n;    // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ESI, 64;
+                addpd XMM0, XMM4;
+                addpd XMM1, XMM5;
+                addpd XMM2, XMM6;
+                addpd XMM3, XMM7;
+                add ECX, 64;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+                mov cptr, ECX;
+            }
+        }
+    }
+
+    // Handle remainder
+    while (aptr < aend)
+        *aptr++ = *bptr++ + *cptr++;
+
+    return a;
+}
+
+
+unittest
+{
+    printf("_arraySliceSliceAddSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] + b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] + b[i]))
+                {
+                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] - c[]
+ */
+
+T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
+in
+{
+        assert(a.length == b.length && b.length == c.length);
+        assert(disjoint(a, b));
+        assert(disjoint(a, c));
+        assert(disjoint(b, c));
+}
+body
+{
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+    auto cptr = c.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 324% faster
+        if (sse2() && b.length >= 8)
+        {
+            auto n = aptr + (b.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr; // left operand
+                mov ECX, cptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n;    // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ESI, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM5;
+                subpd XMM2, XMM6;
+                subpd XMM3, XMM7;
+                add ECX, 64;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+                mov cptr, ECX;
+            }
+        }
+    }
+
+    // Handle remainder
+    while (aptr < aend)
+        *aptr++ = *bptr++ - *cptr++;
+
+    return a;
+}
+
+
+unittest
+{
+    printf("_arraySliceSliceMinSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] - b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - b[i]))
+                {
+                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] + value
+ */
+
+T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
+in
+{
+    assert(a.length == b.length);
+    assert(disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceExpAddSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 305% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                addpd XMM0, XMM4;
+                addpd XMM1, XMM4;
+                addpd XMM2, XMM4;
+                addpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ = *bptr++ + value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceExpAddSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] + 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] + 6))
+                {
+                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] += value
+ */
+
+T[] _arrayExpSliceAddass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 114% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = cast(T*)((cast(uint)aend) & ~7);
+            if (aptr < n)
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                addpd XMM0, XMM4;
+                addpd XMM1, XMM4;
+                addpd XMM2, XMM4;
+                addpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ += value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceAddass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] += 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] + 6))
+                {
+                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] += b[]
+ */
+
+T[] _arraySliceSliceAddass_d(T[] a, T[] b)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceSliceAddass_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 183% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ECX, bptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n; // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ECX, 64;
+                addpd XMM0, XMM4;
+                addpd XMM1, XMM5;
+                addpd XMM2, XMM6;
+                addpd XMM3, XMM7;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, ECX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ += *bptr++;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceSliceAddass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] += b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] + b[i]))
+                {
+                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] - value
+ */
+
+T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceExpMinSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 305% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM4;
+                subpd XMM2, XMM4;
+                subpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ = *bptr++ - value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceExpMinSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] - 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - 6))
+                {
+                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = value - b[]
+ */
+
+T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arrayExpSliceMinSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 66% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movapd XMM5, XMM4;
+                movapd XMM6, XMM4;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                subpd XMM5, XMM0;
+                subpd XMM6, XMM1;
+                movupd [ESI+ 0-64], XMM5;
+                movupd [ESI+16-64], XMM6;
+                movapd XMM5, XMM4;
+                movapd XMM6, XMM4;
+                subpd XMM5, XMM2;
+                subpd XMM6, XMM3;
+                movupd [ESI+32-64], XMM5;
+                movupd [ESI+48-64], XMM6;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ = value - *bptr++;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMinSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = 6 - a[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(6 - a[i]))
+                {
+                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= value
+ */
+
+T[] _arrayExpSliceMinass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 115% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = cast(T*)((cast(uint)aend) & ~7);
+            if (aptr < n)
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM4;
+                subpd XMM2, XMM4;
+                subpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ -= value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMinass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] -= 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - 6))
+                {
+                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= b[]
+ */
+
+T[] _arraySliceSliceMinass_d(T[] a, T[] b)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceSliceMinass_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 183% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ECX, bptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n; // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ECX, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM5;
+                subpd XMM2, XMM6;
+                subpd XMM3, XMM7;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, ECX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ -= *bptr++;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMinass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] -= 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - 6))
+                {
+                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] * value
+ */
+
+T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
+in
+{
+    assert(a.length == b.length);
+    assert(disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceExpMulSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 304% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ = *bptr++ * value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceExpMulSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] * 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] * 6))
+                {
+                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] * c[]
+ */
+
+T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
+in
+{
+        assert(a.length == b.length && b.length == c.length);
+        assert(disjoint(a, b));
+        assert(disjoint(a, c));
+        assert(disjoint(b, c));
+}
+body
+{
+    //printf("_arraySliceSliceMulSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+    auto cptr = c.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 329% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr; // left operand
+                mov ECX, cptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n; // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add ESI, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add EAX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM5;
+                mulpd XMM2, XMM6;
+                mulpd XMM3, XMM7;
+                add ECX, 64;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+                mov cptr, ECX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ = *bptr++ * *cptr++;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceSliceMulSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] * b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] * b[i]))
+                {
+                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] *= value
+ */
+
+T[] _arrayExpSliceMulass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 109% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = cast(T*)((cast(uint)aend) & ~7);
+            if (aptr < n)
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ *= value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMulass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] *= 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] * 6))
+                {
+                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] *= b[]
+ */
+
+T[] _arraySliceSliceMulass_d(T[] a, T[] b)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceSliceMulass_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 205% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ECX, bptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n; // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ECX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM5;
+                mulpd XMM2, XMM6;
+                mulpd XMM3, XMM7;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, ECX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ *= *bptr++;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMulass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] *= 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] * 6))
+                {
+                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] / value
+ */
+
+T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
+in
+{
+    assert(a.length == b.length);
+    assert(disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceExpDivSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    /* Multiplying by the reciprocal is faster, but does
+     * not produce as accurate an answer.
+     */
+    T recip = cast(T)1 / value;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 299% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, recip;
+                //movsd XMM4, value
+                //rcpsd XMM4, XMM4
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                //divpd XMM0, XMM4;
+                //divpd XMM1, XMM4;
+                //divpd XMM2, XMM4;
+                //divpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+    {
+        *aptr++ = *bptr++ / value;
+        //*aptr++ = *bptr++ * recip;
+    }
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceExpDivSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] / 8;
+
+            for (int i = 0; i < dim; i++)
+            {
+                //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
+                if (c[i] != cast(T)(a[i] / 8))
+                {
+                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] /= value
+ */
+
+T[] _arrayExpSliceDivass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    /* Multiplying by the reciprocal is faster, but does
+     * not produce as accurate an answer.
+     */
+    T recip = cast(T)1 / value;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 65% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, recip;
+                //movsd XMM4, value
+                //rcpsd XMM4, XMM4
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                //divpd XMM0, XMM4;
+                //divpd XMM1, XMM4;
+                //divpd XMM2, XMM4;
+                //divpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ *= recip;
+
+    return a;
+}
+
+
+unittest
+{
+    printf("_arrayExpSliceDivass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] /= 8;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] / 8))
+                {
+                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= b[] * value
+ */
+
+T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
+{
+    return _arraySliceExpMulSliceAddass_d(a, -value, b);
+}
+
+/***********************
+ * Computes:
+ *      a[] += b[] * value
+ */
+
+T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
+in
+{
+        assert(a.length == b.length);
+        assert(disjoint(a, b));
+}
+body
+{
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    // Handle remainder
+    while (aptr < aend)
+        *aptr++ += *bptr++ * value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceExpMulSliceAddass_d unittest\n");
+
+    cpuid = 1;
+    {
+        version (log) printf("    cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 1; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            b[] = c[];
+            c[] += a[] * 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
+                if (c[i] != cast(T)(b[i] + a[i] * 6))
+                {
+                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}