# HG changeset patch # User Robert Clipsham # Date 1243960986 -3600 # Node ID e0b2d67cfe7cf9e62f2490f962e0ab06c3ebf915 # Parent 7b218ec1044fee98ae2133248980824da31ae8d7 Added druntime (this should be removed once it works). diff -r 7b218ec1044f -r e0b2d67cfe7c .hgignore --- a/.hgignore Tue Jun 02 17:12:37 2009 +0100 +++ b/.hgignore Tue Jun 02 17:43:06 2009 +0100 @@ -21,7 +21,6 @@ ^tests/dstress/ ^tests/reference/ ^tango/ -^druntime/ ^import/ ^bin/ldc2?$ ^bin/ldc2?\.conf$ diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/import/core/bitmanip.di --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/import/core/bitmanip.di Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,262 @@ +/** + * This module contains a collection of bit-level operations. + * + * Copyright: Copyright (c) 2005-2008, The D Runtime Project + * License: BSD Style, see LICENSE + * Authors: Walter Bright, Don Clugston, Sean Kelly + */ +module core.bitmanip; + + +version( DDoc ) +{ + /** + * Scans the bits in v starting with bit 0, looking + * for the first set bit. + * Returns: + * The bit number of the first bit set. + * The return value is undefined if v is zero. + */ + int bsf( uint v ); + + + /** + * Scans the bits in v from the most significant bit + * to the least significant bit, looking + * for the first set bit. + * Returns: + * The bit number of the first bit set. + * The return value is undefined if v is zero. + * Example: + * --- + * import bitmanip; + * + * int main() + * { + * uint v; + * int x; + * + * v = 0x21; + * x = bsf(v); + * printf("bsf(x%x) = %d\n", v, x); + * x = bsr(v); + * printf("bsr(x%x) = %d\n", v, x); + * return 0; + * } + * --- + * Output: + * bsf(x21) = 0
+ * bsr(x21) = 5 + */ + int bsr( uint v ); + + + /** + * Tests the bit. + */ + int bt( uint* p, uint bitnum ); + + + /** + * Tests and complements the bit. + */ + int btc( uint* p, uint bitnum ); + + + /** + * Tests and resets (sets to 0) the bit. + */ + int btr( uint* p, uint bitnum ); + + + /** + * Tests and sets the bit. + * Params: + * p = a non-NULL pointer to an array of uints. + * index = a bit number, starting with bit 0 of p[0], + * and progressing. It addresses bits like the expression: + --- + p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1))) + --- + * Returns: + * A non-zero value if the bit was set, and a zero + * if it was clear. + * + * Example: + * --- + import bitmanip; + + int main() + { + uint array[2]; + + array[0] = 2; + array[1] = 0x100; + + printf("btc(array, 35) = %d\n", btc(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("btc(array, 35) = %d\n", btc(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("bts(array, 35) = %d\n", bts(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("btr(array, 35) = %d\n", btr(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("bt(array, 1) = %d\n", bt(array, 1)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + return 0; + } + * --- + * Output: +

+    btc(array, 35) = 0
+    array = [0]:x2, [1]:x108
+    btc(array, 35) = -1
+    array = [0]:x2, [1]:x100
+    bts(array, 35) = 0
+    array = [0]:x2, [1]:x108
+    btr(array, 35) = -1
+    array = [0]:x2, [1]:x100
+    bt(array, 1) = -1
+    array = [0]:x2, [1]:x100
+

+btc(array, 35) = 0
+array = [0]:x2, [1]:x108
+btc(array, 35) = -1
+array = [0]:x2, [1]:x100
+bts(array, 35) = 0
+array = [0]:x2, [1]:x108
+btr(array, 35) = -1
+array = [0]:x2, [1]:x100
+bt(array, 1) = -1
+array = [0]:x2, [1]:x100
+

+ */ +nothrow int bts( uint* p, uint bitnum ); + + +/** + * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes + * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3 + * becomes byte 0. + */ +pure nothrow uint bswap( uint v ); + + +/** + * Reads I/O port at port_address. + */ +nothrow ubyte inp( uint port_address ); + + +/** + * ditto + */ +nothrow ushort inpw( uint port_address ); + + +/** + * ditto + */ +nothrow uint inpl( uint port_address ); + + +/** + * Writes and returns value to I/O port at port_address. + */ +nothrow ubyte outp( uint port_address, ubyte value ); + + +/** + * ditto + */ +nothrow ushort outpw( uint port_address, ushort value ); + + +/** + * ditto + */ +nothrow uint outpl( uint port_address, uint value ); diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/import/std/stdarg.di --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/import/std/stdarg.di Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,32 @@ +/** + * These functions are built-in intrinsics to the compiler. 
+ * + * Copyright: Public Domain + * License: Public Domain + * Authors: David Friedman + */ +module std.stdarg; + +version( GNU ) +{ + private import gcc.builtins; + alias __builtin_va_list va_list; + alias __builtin_va_end va_end; + alias __builtin_va_copy va_copy; +} + +template va_start(T) +{ + void va_start( out va_list ap, inout T parmn ) + { + + } +} + +template va_arg(T) +{ + T va_arg( inout va_list ap ) + { + return T.init; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/build-dmd.bat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/build-dmd.bat Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,5 @@ +@echo off +set OLDHOME=%HOME% +set HOME=%CD% +make clean all -fdmd-win32.mak +set HOME=%OLDHOME% \ No newline at end of file diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/build-dmd.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/build-dmd.sh Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +OLDHOME=$HOME +export HOME=`pwd` + +goerror(){ + export HOME=$OLDHOME + echo "=" + echo "= *** Error ***" + echo "=" + exit 1 +} + +make clean -fdmd-posix.mak || goerror +make -fdmd-posix.mak || goerror +chmod 644 ../import/core/*.di || goerror +chmod 644 ../import/core/sync/*.di || goerror + +export HOME=$OLDHOME diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/build-ldc.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/build-ldc.sh Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +OLDHOME=$HOME +export HOME=`pwd` + +goerror(){ + export HOME=$OLDHOME + echo "=" + echo "= *** Error ***" + echo "=" + exit 1 +} + +make clean -fldc-posix.mak || goerror +make -fldc-posix.mak || goerror +chmod 644 ../import/core/*.di || goerror +chmod 644 ../import/core/sync/*.di || goerror + +export HOME=$OLDHOME diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/bitmanip.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/bitmanip.d Tue Jun 02 17:43:06 2009 
+0100 @@ -0,0 +1,286 @@ +/** + * This module contains a collection of bit-level operations. + * + * Copyright: Copyright (c) 2005-2008, The D Runtime Project + * License: BSD Style, see LICENSE + * Authors: Walter Bright, Don Clugston, Sean Kelly + */ +module core.bitmanip; + + +version( DDoc ) +{ + /** + * Scans the bits in v starting with bit 0, looking + * for the first set bit. + * Returns: + * The bit number of the first bit set. + * The return value is undefined if v is zero. + */ + int bsf( uint v ); + + + /** + * Scans the bits in v from the most significant bit + * to the least significant bit, looking + * for the first set bit. + * Returns: + * The bit number of the first bit set. + * The return value is undefined if v is zero. + * Example: + * --- + * import core.bitmanip; + * + * int main() + * { + * uint v; + * int x; + * + * v = 0x21; + * x = bsf(v); + * printf("bsf(x%x) = %d\n", v, x); + * x = bsr(v); + * printf("bsr(x%x) = %d\n", v, x); + * return 0; + * } + * --- + * Output: + * bsf(x21) = 0
+ * bsr(x21) = 5 + */ + int bsr( uint v ); + + + /** + * Tests the bit. + */ + int bt( uint* p, uint bitnum ); + + + /** + * Tests and complements the bit. + */ + int btc( uint* p, uint bitnum ); + + + /** + * Tests and resets (sets to 0) the bit. + */ + int btr( uint* p, uint bitnum ); + + + /** + * Tests and sets the bit. + * Params: + * p = a non-NULL pointer to an array of uints. + * index = a bit number, starting with bit 0 of p[0], + * and progressing. It addresses bits like the expression: + --- + p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1))) + --- + * Returns: + * A non-zero value if the bit was set, and a zero + * if it was clear. + * + * Example: + * --- + import core.bitmanip; + + int main() + { + uint array[2]; + + array[0] = 2; + array[1] = 0x100; + + printf("btc(array, 35) = %d\n", btc(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("btc(array, 35) = %d\n", btc(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("bts(array, 35) = %d\n", bts(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("btr(array, 35) = %d\n", btr(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("bt(array, 1) = %d\n", bt(array, 1)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + return 0; + } + * --- + * Output: +

+    btc(array, 35) = 0
+    array = [0]:x2, [1]:x108
+    btc(array, 35) = -1
+    array = [0]:x2, [1]:x100
+    bts(array, 35) = 0
+    array = [0]:x2, [1]:x108
+    btr(array, 35) = -1
+    array = [0]:x2, [1]:x100
+    bt(array, 1) = -1
+    array = [0]:x2, [1]:x100
+

+    btc(array, 35) = 0
+    array = [0]:x2, [1]:x108
+    btc(array, 35) = -1
+    array = [0]:x2, [1]:x100
+    bts(array, 35) = 0
+    array = [0]:x2, [1]:x108
+    btr(array, 35) = -1
+    array = [0]:x2, [1]:x100
+    bt(array, 1) = -1
+    array = [0]:x2, [1]:x100
+

+ */ + int bts( uint* p, uint bitnum ); + + + /** + * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes + * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3 + * becomes byte 0. + */ + uint bswap( uint v ); + + + /** + * Reads I/O port at port_address. + */ + ubyte inp( uint port_address ); + + + /** + * ditto + */ + ushort inpw( uint port_address ); + + + /** + * ditto + */ + uint inpl( uint port_address ); + + + /** + * Writes and returns value to I/O port at port_address. + */ + ubyte outp( uint port_address, ubyte value ); + + + /** + * ditto + */ + ushort outpw( uint port_address, ushort value ); + + + /** + * ditto + */ + uint outpl( uint port_address, uint value ); +} +else +{ + public import std.intrinsic; +} + + +/** + * Calculates the number of set bits in a 32-bit integer. + */ +int popcnt( uint x ) +{ + // Avoid branches, and the potential for cache misses which + // could be incurred with a table lookup. + + // We need to mask alternate bits to prevent the + // sum from overflowing. + // add neighbouring bits. Each bit is 0 or 1. + x = x - ((x>>1) & 0x5555_5555); + // now each two bits of x is a number 00,01 or 10. + // now add neighbouring pairs + x = ((x&0xCCCC_CCCC)>>2) + (x&0x3333_3333); + // now each nibble holds 0000-0100. Adding them won't + // overflow any more, so we don't need to mask any more + + // Now add the nibbles, then the bytes, then the words + // We still need to mask to prevent double-counting. + // Note that if we used a rotate instead of a shift, we + // wouldn't need the masks, and could just divide the sum + // by 8 to account for the double-counting. + // On some CPUs, it may be faster to perform a multiply. 
+ + x += (x>>4); + x &= 0x0F0F_0F0F; + x += (x>>8); + x &= 0x00FF_00FF; + x += (x>>16); + x &= 0xFFFF; + return x; +} + + +debug( UnitTest ) +{ + unittest + { + assert( popcnt( 0 ) == 0 ); + assert( popcnt( 7 ) == 3 ); + assert( popcnt( 0xAA )== 4 ); + assert( popcnt( 0x8421_1248 ) == 8 ); + assert( popcnt( 0xFFFF_FFFF ) == 32 ); + assert( popcnt( 0xCCCC_CCCC ) == 16 ); + assert( popcnt( 0x7777_7777 ) == 24 ); + } +} + + +/** + * Reverses the order of bits in a 32-bit integer. + */ +uint bitswap( uint x ) +{ + + version( D_InlineAsm_X86 ) + { + asm + { + // Author: Tiago Gasiba. + mov EDX, EAX; + shr EAX, 1; + and EDX, 0x5555_5555; + and EAX, 0x5555_5555; + shl EDX, 1; + or EAX, EDX; + mov EDX, EAX; + shr EAX, 2; + and EDX, 0x3333_3333; + and EAX, 0x3333_3333; + shl EDX, 2; + or EAX, EDX; + mov EDX, EAX; + shr EAX, 4; + and EDX, 0x0f0f_0f0f; + and EAX, 0x0f0f_0f0f; + shl EDX, 4; + or EAX, EDX; + bswap EAX; + } + } + else + { + // swap odd and even bits + x = ((x >> 1) & 0x5555_5555) | ((x & 0x5555_5555) << 1); + // swap consecutive pairs + x = ((x >> 2) & 0x3333_3333) | ((x & 0x3333_3333) << 2); + // swap nibbles + x = ((x >> 4) & 0x0F0F_0F0F) | ((x & 0x0F0F_0F0F) << 4); + // swap bytes + x = ((x >> 8) & 0x00FF_00FF) | ((x & 0x00FF_00FF) << 8); + // swap 2-byte long pairs + x = ( x >> 16 ) | ( x << 16); + return x; + + } +} + + +debug( UnitTest ) +{ + unittest + { + assert( bitswap( 0x8000_0100 ) == 0x0080_0001 ); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/exception.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/exception.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,279 @@ +/** + * The exception module defines all system-level exceptions and provides a + * mechanism to alter system-level error handling. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. + * License: . + * Authors: Sean Kelly + * + * Copyright Sean Kelly 2005 - 2009. + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module core.runtime; + + +private +{ + extern (C) bool rt_isHalting(); + + alias bool function() ModuleUnitTester; + alias bool function(Object) CollectHandler; + alias Exception.TraceInfo function( void* ptr = null ) TraceHandler; + + extern (C) void rt_setCollectHandler( CollectHandler h ); + extern (C) void rt_setTraceHandler( TraceHandler h ); + + alias void delegate( Throwable ) ExceptionHandler; + extern (C) bool rt_init( ExceptionHandler dg = null ); + extern (C) bool rt_term( ExceptionHandler dg = null ); + + extern (C) void* rt_loadLibrary( in char[] name ); + extern (C) bool rt_unloadLibrary( void* ptr ); +} + + +/////////////////////////////////////////////////////////////////////////////// +// Runtime +/////////////////////////////////////////////////////////////////////////////// + + +/** + * This struct encapsulates all functionality related to the underlying runtime + * module for the calling context. + */ +struct Runtime +{ + /** + * Initializes the runtime. This call is to be used in instances where the + * standard program initialization process is not executed. This is most + * often in shared libraries or in libraries linked to a C program. + * + * Params: + * dg = A delegate which will receive any exception thrown during the + * initialization process or null if such exceptions should be + * discarded. + * + * Returns: + * true if initialization succeeds and false if initialization fails. + */ + static bool initialize( ExceptionHandler dg = null ) + { + return rt_init( dg ); + } + + + /** + * Terminates the runtime. This call is to be used in instances where the + * standard program termination process will not be executed. This is + * most often in shared libraries or in libraries linked to a C program. 
+ * + * Params: + * dg = A delegate which will receive any exception thrown during the + * termination process or null if such exceptions should be + * discarded. + * + * Returns: + * true if termination succeeds and false if termination fails. + */ + static bool terminate( ExceptionHandler dg = null ) + { + return rt_term( dg ); + } + + + /** + * Returns true if the runtime is halting. Under normal circumstances, + * this will be set between the time that normal application code has + * exited and before module dtors are called. + * + * Returns: + * true if the runtime is halting. + */ + static bool isHalting() + { + return rt_isHalting(); + } + + + /** + * Locates a dynamic library with the supplied library name and dynamically + * loads it into the caller's address space. If the library contains a D + * runtime it will be integrated with the current runtime. + * + * Params: + * name = The name of the dynamic library to load. + * + * Returns: + * A reference to the library or null on error. + */ + static void* loadLibrary( in char[] name ) + { + return rt_loadLibrary( name ); + } + + + /** + * Unloads the dynamic library referenced by p. If this library contains a + * D runtime then any necessary finalization or cleanup of that runtime + * will be performed. + * + * Params: + * p = A reference to the library to unload. + */ + static bool unloadLibrary( void* p ) + { + return rt_unloadLibrary( p ); + } + + + /** + * Overrides the default trace mechanism with a user-supplied version. A + * trace represents the context from which an exception was thrown, and the + * trace handler will be called when this occurs. The pointer supplied to + * this routine indicates the base address from which tracing should occur. + * If the supplied pointer is null then the trace routine should determine + * an appropriate calling context from which to begin the trace. + * + * Params: + * h = The new trace handler. Set to null to use the default handler. 
+ */ + static void traceHandler( TraceHandler h ) + { + rt_setTraceHandler( h ); + } + + + /** + * Overrides the default collect handler with a user-supplied version. This + * routine will be called for each resource object that is finalized in a + * non-deterministic manner--typically during a garbage collection cycle. + * If the supplied routine returns true then the object's dtor will be called + * as normal, but if the routine returns false then the dtor will not be + * called. The default behavior is for all object dtors to be called. + * + * Params: + * h = The new collect handler. Set to null to use the default handler. + */ + static void collectHandler( CollectHandler h ) + { + rt_setCollectHandler( h ); + } + + + /** + * Overrides the default module unit tester with a user-supplied version. + * This routine will be called once on program initialization. The return + * value of this routine indicates to the runtime whether the body of the + * program will be executed. + * + * Params: + * h = The new unit tester. Set to null to use the default unit tester. + */ + static void moduleUnitTester( ModuleUnitTester h ) + { + sm_moduleUnitTester = h; + } + + +private: + // Unit tests should only be run in single-threaded + __gshared ModuleUnitTester sm_moduleUnitTester = null; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Overridable Callbacks +/////////////////////////////////////////////////////////////////////////////// + + +/** + * This routine is called by the runtime to run module unit tests on startup. + * The user-supplied unit tester will be called if one has been supplied, + * otherwise all unit tests will be run in sequence. + * + * Returns: + * true if execution should continue after testing is complete and false if + * not. Default behavior is to return true. 
+ */ +extern (C) bool runModuleUnitTests() +{ + if( Runtime.sm_moduleUnitTester is null ) + { + foreach( m; ModuleInfo ) + { + if( m.unitTest ) + m.unitTest(); + } + return true; + } + return Runtime.sm_moduleUnitTester(); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/stdc/errno.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/stdc/errno.c Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,26 @@ +/** + * This file contains wrapper functions for macro-defined C routines. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. + * License: Boost License 1.0. + * Authors: Sean Kelly + * + * Copyright Sean Kelly 2005 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module core.sync.barrier; + + +public import core.sync.exception; +private import core.sync.condition; +private import core.sync.mutex; + +version( Win32 ) +{ + private import core.sys.windows.windows; +} +else version( Posix ) +{ + private import core.stdc.errno; + private import core.sys.posix.pthread; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Barrier +// +// void wait(); +//////////////////////////////////////////////////////////////////////////////// + + +/** + * This class represents a barrier across which threads may only travel in + * groups of a specific size. + */ +class Barrier +{ + //////////////////////////////////////////////////////////////////////////// + // Initialization + //////////////////////////////////////////////////////////////////////////// + + + /** + * Initializes a barrier object which releases threads in groups of limit + * in size. + * + * Params: + * limit = The number of waiting threads to release in unison. + * + * Throws: + * SyncException on error. 
+ */ + this( uint limit ) + in + { + assert( limit > 0 ); + } + body + { + m_lock = new Mutex; + m_cond = new Condition( m_lock ); + m_group = 0; + m_limit = limit; + m_count = limit; + } + + + //////////////////////////////////////////////////////////////////////////// + // General Actions + //////////////////////////////////////////////////////////////////////////// + + + /** + * Wait for the pre-determined number of threads and then proceed. + * + * Throws: + * SyncException on error. + */ + void wait() + { + synchronized( m_lock ) + { + uint group = m_group; + + if( --m_count == 0 ) + { + m_group++; + m_count = m_limit; + m_cond.notifyAll(); + } + while( group == m_group ) + m_cond.wait(); + } + } + + +private: + Mutex m_lock; + Condition m_cond; + uint m_group; + uint m_limit; + uint m_count; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Unit Tests +//////////////////////////////////////////////////////////////////////////////// + + +version( unittest ) +{ + private import core.thread; + + + unittest + { + int numThreads = 10; + auto barrier = new Barrier( numThreads ); + auto synInfo = new Object; + int numReady = 0; + int numPassed = 0; + + void threadFn() + { + synchronized( synInfo ) + { + ++numReady; + } + barrier.wait(); + synchronized( synInfo ) + { + ++numPassed; + } + } + + auto group = new ThreadGroup; + + for( int i = 0; i < numThreads; ++i ) + { + group.create( &threadFn ); + } + group.joinAll(); + assert( numReady == numThreads && numPassed == numThreads ); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/sync/condition.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/sync/condition.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,573 @@ +/** + * The condition module provides a primitive for synchronized condition + * checking. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. 
+ * License: = 0 ); + } + body + { + version( Win32 ) + { + enum : uint + { + TICKS_PER_MILLI = 10_000, + MAX_WAIT_MILLIS = uint.max - 1 + } + + period /= TICKS_PER_MILLI; + if( period > MAX_WAIT_MILLIS ) + period = MAX_WAIT_MILLIS; + return timedWait( cast(uint) period ); + } + else version( Posix ) + { + timespec t = void; + mktspec( t, period ); + + int rc = pthread_cond_timedwait( &m_hndl, m_mutexAddr, &t ); + if( !rc ) + return true; + if( rc == ETIMEDOUT ) + return false; + throw new SyncException( "Unable to wait for condition" ); + } + } + + /** + * Notifies one waiter. + * + * Throws: + * SyncException on error. + */ + void notify() + { + version( Win32 ) + { + notify( false ); + } + else version( Posix ) + { + int rc = pthread_cond_signal( &m_hndl ); + if( rc ) + throw new SyncException( "Unable to notify condition" ); + } + } + + + /** + * Notifies all waiters. + * + * Throws: + * SyncException on error. + */ + void notifyAll() + { + version( Win32 ) + { + notify( true ); + } + else version( Posix ) + { + int rc = pthread_cond_broadcast( &m_hndl ); + if( rc ) + throw new SyncException( "Unable to notify condition" ); + } + } + + +private: + version( Win32 ) + { + bool timedWait( DWORD timeout ) + { + int numSignalsLeft; + int numWaitersGone; + DWORD rc; + + rc = WaitForSingleObject( m_blockLock, INFINITE ); + assert( rc == WAIT_OBJECT_0 ); + + m_numWaitersBlocked++; + + rc = ReleaseSemaphore( m_blockLock, 1, null ); + assert( rc ); + + m_assocMutex.unlock(); + scope(failure) m_assocMutex.lock(); + + rc = WaitForSingleObject( m_blockQueue, timeout ); + assert( rc == WAIT_OBJECT_0 || rc == WAIT_TIMEOUT ); + bool timedOut = (rc == WAIT_TIMEOUT); + + EnterCriticalSection( &m_unblockLock ); + scope(failure) LeaveCriticalSection( &m_unblockLock ); + + if( (numSignalsLeft = m_numWaitersToUnblock) != 0 ) + { + if ( timedOut ) + { + // timeout (or canceled) + if( m_numWaitersBlocked != 0 ) + { + m_numWaitersBlocked--; + // do not unblock next waiter below 
(already unblocked) + numSignalsLeft = 0; + } + else + { + // spurious wakeup pending!! + m_numWaitersGone = 1; + } + } + if( --m_numWaitersToUnblock == 0 ) + { + if( m_numWaitersBlocked != 0 ) + { + // open the gate + rc = ReleaseSemaphore( m_blockLock, 1, null ); + assert( rc ); + // do not open the gate below again + numSignalsLeft = 0; + } + else if( (numWaitersGone = m_numWaitersGone) != 0 ) + { + m_numWaitersGone = 0; + } + } + } + else if( ++m_numWaitersGone == int.max / 2 ) + { + // timeout/canceled or spurious event :-) + rc = WaitForSingleObject( m_blockLock, INFINITE ); + assert( rc == WAIT_OBJECT_0 ); + // something is going on here - test of timeouts? + m_numWaitersBlocked -= m_numWaitersGone; + rc = ReleaseSemaphore( m_blockLock, 1, null ); + assert( rc ); // ReleaseSemaphore returns nonzero on success; comparing with WAIT_OBJECT_0 (0) asserted failure + m_numWaitersGone = 0; + } + + LeaveCriticalSection( &m_unblockLock ); + + if( numSignalsLeft == 1 ) + { + // better now than spurious later (same as ResetEvent) + for( ; numWaitersGone > 0; --numWaitersGone ) + { + rc = WaitForSingleObject( m_blockQueue, INFINITE ); + assert( rc == WAIT_OBJECT_0 ); + } + // open the gate + rc = ReleaseSemaphore( m_blockLock, 1, null ); + assert( rc ); + } + else if( numSignalsLeft != 0 ) + { + // unblock next waiter + rc = ReleaseSemaphore( m_blockQueue, 1, null ); + assert( rc ); + } + m_assocMutex.lock(); + return !timedOut; + } + + + void notify( bool all ) + { + DWORD rc; + + EnterCriticalSection( &m_unblockLock ); + scope(failure) LeaveCriticalSection( &m_unblockLock ); + + if( m_numWaitersToUnblock != 0 ) + { + if( m_numWaitersBlocked == 0 ) + { + LeaveCriticalSection( &m_unblockLock ); + return; + } + if( all ) + { + m_numWaitersToUnblock += m_numWaitersBlocked; + m_numWaitersBlocked = 0; + } + else + { + m_numWaitersToUnblock++; + m_numWaitersBlocked--; + } + LeaveCriticalSection( &m_unblockLock ); + } + else if( m_numWaitersBlocked > m_numWaitersGone ) + { + rc = WaitForSingleObject( m_blockLock, INFINITE ); + assert( rc == WAIT_OBJECT_0 
); + if( 0 != m_numWaitersGone ) + { + m_numWaitersBlocked -= m_numWaitersGone; + m_numWaitersGone = 0; + } + if( all ) + { + m_numWaitersToUnblock = m_numWaitersBlocked; + m_numWaitersBlocked = 0; + } + else + { + m_numWaitersToUnblock = 1; + m_numWaitersBlocked--; + } + LeaveCriticalSection( &m_unblockLock ); + rc = ReleaseSemaphore( m_blockQueue, 1, null ); + assert( rc ); + } + else + { + LeaveCriticalSection( &m_unblockLock ); + } + } + + + // NOTE: This implementation uses Algorithm 8c as described here: + // http://groups.google.com/group/comp.programming.threads/ + // browse_frm/thread/1692bdec8040ba40/e7a5f9d40e86503a + HANDLE m_blockLock; // auto-reset event (now semaphore) + HANDLE m_blockQueue; // auto-reset event (now semaphore) + Mutex m_assocMutex; // external mutex/CS + CRITICAL_SECTION m_unblockLock; // internal mutex/CS + int m_numWaitersGone = 0; + int m_numWaitersBlocked = 0; + int m_numWaitersToUnblock = 0; + } + else version( Posix ) + { + pthread_cond_t m_hndl; + pthread_mutex_t* m_mutexAddr; + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Unit Tests +//////////////////////////////////////////////////////////////////////////////// + + +version( unittest ) +{ + private import core.thread; + private import core.sync.mutex; + private import core.sync.semaphore; + + + void testNotify() + { + auto mutex = new Mutex; + auto condReady = new Condition( mutex ); + auto semDone = new Semaphore; + auto synLoop = new Object; + int numWaiters = 10; + int numTries = 10; + int numReady = 0; + int numTotal = 0; + int numDone = 0; + int numPost = 0; + + void waiter() + { + for( int i = 0; i < numTries; ++i ) + { + synchronized( mutex ) + { + while( numReady < 1 ) + { + condReady.wait(); + } + --numReady; + ++numTotal; + } + + synchronized( synLoop ) + { + ++numDone; + } + semDone.wait(); + } + } + + auto group = new ThreadGroup; + + for( int i = 0; i < numWaiters; ++i ) + group.create( &waiter ); + + for( int i 
= 0; i < numTries; ++i ) + { + for( int j = 0; j < numWaiters; ++j ) + { + synchronized( mutex ) + { + ++numReady; + condReady.notify(); + } + } + while( true ) + { + synchronized( synLoop ) + { + if( numDone >= numWaiters ) + break; + } + Thread.yield(); + } + for( int j = 0; j < numWaiters; ++j ) + { + semDone.notify(); + } + } + + group.joinAll(); + assert( numTotal == numWaiters * numTries ); + } + + + void testNotifyAll() + { + auto mutex = new Mutex; + auto condReady = new Condition( mutex ); + int numWaiters = 10; + int numReady = 0; + int numDone = 0; + bool alert = false; + + void waiter() + { + synchronized( mutex ) + { + ++numReady; + while( !alert ) + condReady.wait(); + ++numDone; + } + } + + auto group = new ThreadGroup; + + for( int i = 0; i < numWaiters; ++i ) + group.create( &waiter ); + + while( true ) + { + synchronized( mutex ) + { + if( numReady >= numWaiters ) + { + alert = true; + condReady.notifyAll(); + break; + } + } + Thread.yield(); + } + group.joinAll(); + assert( numReady == numWaiters && numDone == numWaiters ); + } + + + void testWaitTimeout() + { + auto mutex = new Mutex; + auto condReady = new Condition( mutex ); + bool waiting = false; + bool alertedOne = true; + bool alertedTwo = true; + + void waiter() + { + synchronized( mutex ) + { + waiting = true; + alertedOne = condReady.wait( 10_000_000 ); // 1s + alertedTwo = condReady.wait( 10_000_000 ); // 1s + } + } + + auto thread = new Thread( &waiter ); + thread.start(); + + while( true ) + { + synchronized( mutex ) + { + if( waiting ) + { + condReady.notify(); + break; + } + } + Thread.yield(); + } + thread.join(); + assert( waiting && alertedOne && !alertedTwo ); + } + + + unittest + { + testNotify(); + testNotifyAll(); + testWaitTimeout(); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/sync/config.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/sync/config.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,72 @@ +/** + * The config 
module contains utility routines and configuration information + * specific to this package. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. + * License: Boost License 1.0. + * Authors: Sean Kelly + * + * Copyright Sean Kelly 2005 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module core.sync.exception; + + +/** + * Base class for synchronization exceptions. + */ +class SyncException : Exception +{ + this( string msg ) + { + super( msg ); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/sync/mutex.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/sync/mutex.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,268 @@ +/** + * The mutex module provides a primitive for maintaining mutually exclusive + * access. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. + * License: 0 ) + m_writerQueue.notify(); + } + } + } + + + /** + * Attempts to acquire a read lock on the enclosing mutex. If one can + * be obtained without blocking, the lock is acquired and true is + * returned. If not, the lock is not acquired and false is returned. + * + * Returns: + * true if the lock was acquired and false if not. 
+ */ + bool tryLock() + { + synchronized( m_commonMutex ) + { + if( shouldQueueReader() ) + return false; + ++m_numActiveReaders; + return true; + } + } + + + private: + bool shouldQueueReader() + { + if( m_numActiveWriters > 0 ) + return true; + + switch( m_policy ) + { + case Policy.PREFER_WRITERS: + return m_numQueuedWriters > 0; + + case Policy.PREFER_READERS: + default: + break; + } + + return false; + } + + struct MonitorProxy + { + Object.Monitor link; + } + + MonitorProxy m_proxy; + } + + + //////////////////////////////////////////////////////////////////////////// + // Writer + //////////////////////////////////////////////////////////////////////////// + + + /** + * This class can be considered a mutex in its own right, and is used to + * negotiate a write lock for the enclosing mutex. + */ + class Writer : + Object.Monitor + { + /** + * Initializes a read/write mutex writer proxy object. + */ + this() + { + m_proxy.link = this; + (cast(void**) this)[1] = &m_proxy; + } + + + /** + * Acquires a write lock on the enclosing mutex. + */ + void lock() + { + synchronized( m_commonMutex ) + { + ++m_numQueuedWriters; + scope(exit) --m_numQueuedWriters; + + while( shouldQueueWriter() ) + m_writerQueue.wait(); + ++m_numActiveWriters; + } + } + + + /** + * Releases a write lock on the enclosing mutex. + */ + void unlock() + { + synchronized( m_commonMutex ) + { + if( --m_numActiveWriters < 1 ) + { + switch( m_policy ) + { + default: + case Policy.PREFER_READERS: + if( m_numQueuedReaders > 0 ) + m_readerQueue.notifyAll(); + else if( m_numQueuedWriters > 0 ) + m_writerQueue.notify(); + break; + case Policy.PREFER_WRITERS: + if( m_numQueuedWriters > 0 ) + m_writerQueue.notify(); + else if( m_numQueuedReaders > 0 ) + m_readerQueue.notifyAll(); + } + } + } + } + + + /** + * Attempts to acquire a write lock on the enclosing mutex. If one can + * be obtained without blocking, the lock is acquired and true is + * returned. 
If not, the lock is not acquired and false is returned. + * + * Returns: + * true if the lock was acquired and false if not. + */ + bool tryLock() + { + synchronized( m_commonMutex ) + { + if( shouldQueueWriter() ) + return false; + ++m_numActiveWriters; + return true; + } + } + + + private: + bool shouldQueueWriter() + { + if( m_numActiveWriters > 0 || + m_numActiveReaders > 0 ) + return true; + switch( m_policy ) + { + case Policy.PREFER_READERS: + return m_numQueuedReaders > 0; + + case Policy.PREFER_WRITERS: + default: + break; + } + + return false; + } + + struct MonitorProxy + { + Object.Monitor link; + } + + MonitorProxy m_proxy; + } + + +private: + Policy m_policy; + Reader m_reader; + Writer m_writer; + + Mutex m_commonMutex; + Condition m_readerQueue; + Condition m_writerQueue; + + int m_numQueuedReaders; + int m_numActiveReaders; + int m_numQueuedWriters; + int m_numActiveWriters; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Unit Tests +//////////////////////////////////////////////////////////////////////////////// + + +version( unittest ) +{ + static if( !is( typeof( Thread ) ) ) + private import core.thread; + + + void testRead( ReadWriteMutex.Policy policy ) + { + auto mutex = new ReadWriteMutex( policy ); + auto synInfo = new Object; + int numThreads = 10; + int numReaders = 0; + int maxReaders = 0; + + void readerFn() + { + synchronized( mutex.reader() ) + { + synchronized( synInfo ) + { + if( ++numReaders > maxReaders ) + maxReaders = numReaders; + } + Thread.sleep( 100_000 ); // 10ms (assuming 100ns ticks) + synchronized( synInfo ) + { + --numReaders; + } + } + } + + auto group = new ThreadGroup; + + for( int i = 0; i < numThreads; ++i ) + { + group.create( &readerFn ); + } + group.joinAll(); + assert( numReaders < 1 && maxReaders > 1 ); + } + + + void testReadWrite( ReadWriteMutex.Policy policy ) + { + auto mutex = new ReadWriteMutex( policy ); + auto synInfo = new Object; + int numThreads = 10; + int numReaders = 0; + 
int numWriters = 0; + int maxReaders = 0; + int maxWriters = 0; + int numTries = 20; + + void readerFn() + { + for( int i = 0; i < numTries; ++i ) + { + synchronized( mutex.reader() ) + { + synchronized( synInfo ) + { + if( ++numReaders > maxReaders ) + maxReaders = numReaders; + } + Thread.sleep( 100_000 ); // 1ms + synchronized( synInfo ) + { + --numReaders; + } + } + } + } + + void writerFn() + { + for( int i = 0; i < numTries; ++i ) + { + synchronized( mutex.writer() ) + { + synchronized( synInfo ) + { + if( ++numWriters > maxWriters ) + maxWriters = numWriters; + } + Thread.sleep( 100_000 ); // 1ms + synchronized( synInfo ) + { + --numWriters; + } + } + } + } + + auto group = new ThreadGroup; + + for( int i = 0; i < numThreads; ++i ) + { + group.create( &readerFn ); + group.create( &writerFn ); + } + group.joinAll(); + assert( numReaders < 1 && maxReaders > 1 && + numWriters < 1 && maxWriters < 2 ); + } + + + unittest + { + testRead( ReadWriteMutex.Policy.PREFER_READERS ); + testRead( ReadWriteMutex.Policy.PREFER_WRITERS ); + testReadWrite( ReadWriteMutex.Policy.PREFER_READERS ); + testReadWrite( ReadWriteMutex.Policy.PREFER_WRITERS ); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/sync/semaphore.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/sync/semaphore.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,506 @@ +/** + * The semaphore module provides a general use semaphore for synchronization. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. 
+ * License: = 0 ); + } + body + { + version( Win32 ) + { + enum : uint + { + TICKS_PER_MILLI = 10_000, + MAX_WAIT_MILLIS = uint.max - 1 + } + + period /= TICKS_PER_MILLI; + if( period > MAX_WAIT_MILLIS ) + period = MAX_WAIT_MILLIS; + switch( WaitForSingleObject( m_hndl, cast(uint) period ) ) + { + case WAIT_OBJECT_0: + return true; + case WAIT_TIMEOUT: + return false; + default: + throw new SyncException( "Unable to wait for semaphore" ); + } + } + else version( OSX ) + { + mach_timespec_t t = void; + (cast(byte*) &t)[0 .. t.sizeof] = 0; + + if( period != 0 ) + { + enum : uint + { + NANOS_PER_TICK = 100, + TICKS_PER_SECOND = 10_000_000, + NANOS_PER_SECOND = NANOS_PER_TICK * TICKS_PER_SECOND, + } + + if( t.tv_sec.max - t.tv_sec < period / TICKS_PER_SECOND ) + { + t.tv_sec = t.tv_sec.max; + t.tv_nsec = 0; + } + else + { + t.tv_sec += cast(typeof(t.tv_sec)) (period / TICKS_PER_SECOND); + long ns = (period % TICKS_PER_SECOND) * NANOS_PER_TICK; + if( NANOS_PER_SECOND - t.tv_nsec > ns ) + t.tv_nsec = cast(typeof(t.tv_nsec)) ns; + else + { + t.tv_sec += 1; + t.tv_nsec += ns - NANOS_PER_SECOND; + } + } + } + while( true ) + { + auto rc = semaphore_timedwait( m_hndl, t ); + if( !rc ) + return true; + if( rc == KERN_OPERATION_TIMED_OUT ) + return false; + if( rc != KERN_ABORTED || errno != EINTR ) + throw new SyncException( "Unable to wait for semaphore" ); + } + // -w trip + return false; + } + else version( Posix ) + { + timespec t = void; + mktspec( t, period ); + + while( true ) + { + if( !sem_timedwait( &m_hndl, &t ) ) + return true; + if( errno == ETIMEDOUT ) + return false; + if( errno != EINTR ) + throw new SyncException( "Unable to wait for semaphore" ); + } + // -w trip + return false; + } + } + + + /** + * Atomically increment the current count by one. This will notify one + * waiter, if there are any in the queue. + * + * Throws: + * SyncException on error. 
+ */ + void notify() + { + version( Win32 ) + { + if( !ReleaseSemaphore( m_hndl, 1, null ) ) + throw new SyncException( "Unable to notify semaphore" ); + } + else version( OSX ) + { + auto rc = semaphore_signal( m_hndl ); + if( rc ) + throw new SyncException( "Unable to notify semaphore" ); + } + else version( Posix ) + { + int rc = sem_post( &m_hndl ); + if( rc ) + throw new SyncException( "Unable to notify semaphore" ); + } + } + + + /** + * If the current count is equal to zero, return. Otherwise, atomically + * decrement the count by one and return true. + * + * Throws: + * SyncException on error. + * + * Returns: + * true if the count was above zero and false if not. + */ + bool tryWait() + { + version( Win32 ) + { + switch( WaitForSingleObject( m_hndl, 0 ) ) + { + case WAIT_OBJECT_0: + return true; + case WAIT_TIMEOUT: + return false; + default: + throw new SyncException( "Unable to wait for semaphore" ); + } + } + else version( OSX ) + { + return wait( 0 ); + } + else version( Posix ) + { + while( true ) + { + if( !sem_trywait( &m_hndl ) ) + return true; + if( errno == EAGAIN ) + return false; + if( errno != EINTR ) + throw new SyncException( "Unable to wait for semaphore" ); + } + // -w trip + return false; + } + } + + +private: + version( Win32 ) + { + HANDLE m_hndl; + } + else version( OSX ) + { + semaphore_t m_hndl; + } + else version( Posix ) + { + sem_t m_hndl; + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Unit Tests +//////////////////////////////////////////////////////////////////////////////// + + +version( unittest ) +{ + private import core.thread; + + + void testWait() + { + auto semaphore = new Semaphore; + int numToProduce = 10; + bool allProduced = false; + auto synProduced = new Object; + int numConsumed = 0; + auto synConsumed = new Object; + int numConsumers = 10; + int numComplete = 0; + auto synComplete = new Object; + + void consumer() + { + while( true ) + { + semaphore.wait(); + + 
synchronized( synProduced ) + { + if( allProduced ) + break; + } + + synchronized( synConsumed ) + { + ++numConsumed; + } + } + + synchronized( synComplete ) + { + ++numComplete; + } + } + + void producer() + { + assert( !semaphore.tryWait() ); + + for( int i = 0; i < numToProduce; ++i ) + { + semaphore.notify(); + Thread.yield(); + } + Thread.sleep( 10_000_000 ); // 1s + synchronized( synProduced ) + { + allProduced = true; + } + + for( int i = 0; i < numConsumers; ++i ) + { + semaphore.notify(); + Thread.yield(); + } + + for( int i = numConsumers * 10000; i > 0; --i ) + { + synchronized( synComplete ) + { + if( numComplete == numConsumers ) + break; + } + Thread.yield(); + } + + synchronized( synComplete ) + { + assert( numComplete == numConsumers ); + } + + synchronized( synConsumed ) + { + assert( numConsumed == numToProduce ); + } + + assert( !semaphore.tryWait() ); + semaphore.notify(); + assert( semaphore.tryWait() ); + assert( !semaphore.tryWait() ); + } + + auto group = new ThreadGroup; + + for( int i = 0; i < numConsumers; ++i ) + group.create( &consumer ); + group.create( &producer ); + group.joinAll(); + } + + + void testWaitTimeout() + { + auto synReady = new Object; + auto semReady = new Semaphore; + bool waiting = false; + bool alertedOne = true; + bool alertedTwo = true; + + void waiter() + { + synchronized( synReady ) + { + waiting = true; + } + alertedOne = semReady.wait( 10_000_000 ); // 1s (period is in 100ns ticks) + alertedTwo = semReady.wait( 10_000_000 ); // 1s (period is in 100ns ticks) + } + + auto thread = new Thread( &waiter ); + thread.start(); + + while( true ) + { + synchronized( synReady ) + { + if( waiting ) + { + semReady.notify(); + break; + } + } + Thread.yield(); + } + thread.join(); + assert( waiting && alertedOne && !alertedTwo ); + } + + + unittest + { + testWait(); + testWaitTimeout(); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/thread.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/thread.d Tue Jun 02 17:43:06 2009 
+0100 @@ -0,0 +1,3516 @@ +/** + * The thread module provides support for thread creation and management. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. + * License: = 0 ); + } + body + { + version( Windows ) + { + enum : uint + { + TICKS_PER_MILLI = 10_000, + MAX_SLEEP_MILLIS = uint.max - 1 + } + + // NOTE: In instances where all other threads in the process have a + // lower priority than the current thread, the current thread + // will not yield with a sleep time of zero. However, unlike + // yield(), the user is not asking for a yield to occur but + // only for execution to suspend for the requested interval. + // Therefore, expected performance may not be met if a yield + // is forced upon the user. + period /= TICKS_PER_MILLI; + while( period > MAX_SLEEP_MILLIS ) + { + Sleep( MAX_SLEEP_MILLIS ); + period -= MAX_SLEEP_MILLIS; + } + Sleep( cast(uint) period ); + } + else version( Posix ) + { + timespec tin = void; + timespec tout = void; + + enum : uint + { + NANOS_PER_TICK = 100, + TICKS_PER_SECOND = 10_000_000, + } + enum : typeof(period) + { + MAX_SLEEP_TICKS = cast(typeof(period)) tin.tv_sec.max * TICKS_PER_SECOND + } + + do + { + if( period > MAX_SLEEP_TICKS ) + { + tin.tv_sec = tin.tv_sec.max; + tin.tv_nsec = 0; + } + else + { + tin.tv_sec = cast(typeof(tin.tv_sec)) (period / TICKS_PER_SECOND); + tin.tv_nsec = cast(typeof(tin.tv_nsec)) (period % TICKS_PER_SECOND) * NANOS_PER_TICK; + } + while( true ) + { + if( !nanosleep( &tin, &tout ) ) + return; + if( getErrno() != EINTR ) + throw new ThreadException( "Unable to sleep for the specified duration" ); + tin = tout; + } + period -= (cast(typeof(period)) tin.tv_sec) * TICKS_PER_SECOND; + period -= (cast(typeof(period)) tin.tv_nsec) / NANOS_PER_TICK; + } while( period > 0 ); + } + } + + + /** + * Forces a context switch to occur away from the calling thread. 
+ */ + static void yield() + { + version( Windows ) + { + // NOTE: Sleep(1) is necessary because Sleep(0) does not give + // lower priority threads any timeslice, so looping on + // Sleep(0) could be resource-intensive in some cases. + Sleep( 1 ); + } + else version( Posix ) + { + sched_yield(); + } + } + + + /////////////////////////////////////////////////////////////////////////// + // Thread Accessors + /////////////////////////////////////////////////////////////////////////// + + + /** + * Provides a reference to the calling thread. + * + * Returns: + * The thread object representing the calling thread. The result of + * deleting this object is undefined. + */ + static Thread getThis() + { + // NOTE: This function may not be called until thread_init has + // completed. See thread_suspendAll for more information + // on why this might occur. + version( Windows ) + { + return cast(Thread) TlsGetValue( sm_this ); + } + else version( Posix ) + { + return cast(Thread) pthread_getspecific( sm_this ); + } + } + + + /** + * Provides a list of all threads currently being tracked by the system. + * + * Returns: + * An array containing references to all threads currently being + * tracked by the system. The result of deleting any contained + * objects is undefined. + */ + static Thread[] getAll() + { + synchronized( slock ) + { + size_t pos = 0; + Thread[] buf = new Thread[sm_tlen]; + + foreach( Thread t; Thread ) + { + buf[pos++] = t; + } + return buf; + } + } + + + /** + * Operates on all threads currently being tracked by the system. The + * result of deleting any Thread object is undefined. + * + * Params: + * dg = The supplied code as a delegate. + * + * Returns: + * Zero if all elements are visited, nonzero if not. 
+ */ + static int opApply( int delegate( inout Thread ) dg ) + { + synchronized( slock ) + { + int ret = 0; + + for( Thread t = sm_tbeg; t; t = t.next ) + { + ret = dg( t ); + if( ret ) + break; + } + return ret; + } + } + + + /////////////////////////////////////////////////////////////////////////// + // Local Storage Actions + /////////////////////////////////////////////////////////////////////////// + + + /** + * Indicates the number of local storage pointers available at program + * startup. It is recommended that this number be at least 64. + */ + static const uint LOCAL_MAX = 64; + + + /** + * Reserves a local storage pointer for use and initializes this location + * to null for all running threads. + * + * Returns: + * A key representing the array offset of this memory location. + */ + static uint createLocal() + { + synchronized( slock ) + { + foreach( uint key, inout bool set; sm_local ) + { + if( !set ) + { + //foreach( Thread t; sm_tbeg ) Bug in GDC 0.24 SVN (r139) + for( Thread t = sm_tbeg; t; t = t.next ) + { + t.m_local[key] = null; + } + set = true; + return key; + } + } + throw new ThreadException( "No more local storage slots available" ); + } + } + + + /** + * Marks the supplied key as available and sets the associated location + * to null for all running threads. It is assumed that any key passed + * to this function is valid. The result of calling this function for + * a key which is still in use is undefined. + * + * Params: + * key = The key to delete. + */ + static void deleteLocal( uint key ) + { + synchronized( slock ) + { + sm_local[key] = false; + // foreach( Thread t; sm_tbeg ) Bug in GDC 0.24 SVN (r139) + for( Thread t = sm_tbeg; t; t = t.next ) + { + t.m_local[key] = null; + } + } + } + + + /** + * Loads the value stored at key within a thread-local static array. It is + * assumed that any key passed to this function is valid. + * + * Params: + * key = The location which holds the desired data. 
+ * + * Returns: + * The data associated with the supplied key. + */ + static void* getLocal( uint key ) + { + return getThis().m_local[key]; + } + + + /** + * Stores the supplied value at key within a thread-local static array. It + * is assumed that any key passed to this function is valid. + * + * Params: + * key = The location to store the supplied data. + * val = The data to store. + * + * Returns: + * A copy of the data which has just been stored. + */ + static void* setLocal( uint key, void* val ) + { + return getThis().m_local[key] = val; + } + + + /////////////////////////////////////////////////////////////////////////// + // Static Initializer + /////////////////////////////////////////////////////////////////////////// + + + /** + * This initializer is used to set thread constants. All functional + * initialization occurs within thread_init(). + */ + static this() + { + version( Windows ) + { + PRIORITY_MIN = -15; + PRIORITY_MAX = 15; + } + else version( Posix ) + { + int policy; + sched_param param; + pthread_t self = pthread_self(); + + int status = pthread_getschedparam( self, &policy, &param ); + assert( status == 0 ); + + PRIORITY_MIN = sched_get_priority_min( policy ); + assert( PRIORITY_MIN != -1 ); + + PRIORITY_MAX = sched_get_priority_max( policy ); + assert( PRIORITY_MAX != -1 ); + } + } + + +private: + // + // Initializes a thread object which has no associated executable function. + // This is used for the main thread initialized in thread_init(). + // + this() + { + m_call = Call.NO; + m_curr = &m_main; + + void* pstart = cast(void*) &_tlsstart; + void* pend = cast(void*) &_tlsend; + m_tls = pstart[0 .. pend - pstart]; + } + + + // + // Thread entry point. Invokes the function or delegate passed on + // construction (if any). + // + final void run() + { + switch( m_call ) + { + case Call.FN: + m_fn(); + break; + case Call.DG: + m_dg(); + break; + default: + break; + } + } + + +private: + // + // The type of routine passed on thread construction. 
+ // + enum Call + { + NO, + FN, + DG + } + + + // + // Standard types + // + version( Windows ) + { + alias uint TLSKey; + alias uint ThreadAddr; + } + else version( Posix ) + { + alias pthread_key_t TLSKey; + alias pthread_t ThreadAddr; + } + + + // + // Local storage + // + __gshared bool[LOCAL_MAX] sm_local; + __gshared TLSKey sm_this; + + void*[LOCAL_MAX] m_local; + + + // + // Standard thread data + // + version( Windows ) + { + HANDLE m_hndl; + } + else version( OSX ) + { + mach_port_t m_tmach; + } + ThreadAddr m_addr; + Call m_call; + char[] m_name; + union + { + void function() m_fn; + void delegate() m_dg; + } + size_t m_sz; + version( Posix ) + { + bool m_isRunning; + } + bool m_isDaemon; + Object m_unhandled; + + +private: + /////////////////////////////////////////////////////////////////////////// + // Storage of Active Thread + /////////////////////////////////////////////////////////////////////////// + + + // + // Sets a thread-local reference to the current thread object. 
+ // + static void setThis( Thread t ) + { + version( Windows ) + { + TlsSetValue( sm_this, cast(void*) t ); + } + else version( Posix ) + { + pthread_setspecific( sm_this, cast(void*) t ); + } + } + + +private: + /////////////////////////////////////////////////////////////////////////// + // Thread Context and GC Scanning Support + /////////////////////////////////////////////////////////////////////////// + + + final void pushContext( Context* c ) + in + { + assert( !c.within ); + } + body + { + c.within = m_curr; + m_curr = c; + } + + + final void popContext() + in + { + assert( m_curr && m_curr.within ); + } + body + { + Context* c = m_curr; + m_curr = c.within; + c.within = null; + } + + + final Context* topContext() + in + { + assert( m_curr ); + } + body + { + return m_curr; + } + + + static struct Context + { + void* bstack, + tstack; + Context* within; + Context* next, + prev; + } + + + Context m_main; + Context* m_curr; + bool m_lock; + void[] m_tls; // spans implicit thread local storage + + version( Windows ) + { + version( X86 ) + { + uint[8] m_reg; // edi,esi,ebp,esp,ebx,edx,ecx,eax + } + else version( X86_64 ) + { + ulong[16] m_reg; // rdi,rsi,rbp,rsp,rbx,rdx,rcx,rax + // r8,r9,r10,r11,r12,r13,r14,r15 + } + else + { + static assert( "Architecture not supported." ); + } + } + else version( OSX ) + { + version( X86 ) + { + uint[8] m_reg; // edi,esi,ebp,esp,ebx,edx,ecx,eax + } + else version( X86_64 ) + { + ulong[16] m_reg; // rdi,rsi,rbp,rsp,rbx,rdx,rcx,rax + // r8,r9,r10,r11,r12,r13,r14,r15 + } + else + { + static assert( "Architecture not supported." ); + } + } + + +private: + /////////////////////////////////////////////////////////////////////////// + // GC Scanning Support + /////////////////////////////////////////////////////////////////////////// + + + // NOTE: The GC scanning process works like so: + // + // 1. Suspend all threads. + // 2. Scan the stacks of all suspended threads for roots. + // 3. Resume all threads. 
+ // + // Steps 1 and 3 require a list of all threads in the system, while + // step 2 requires a list of all thread stacks (each represented by + // a Context struct). Traditionally, there was one stack per thread + // and the Context structs were not necessary. However, Fibers have + // changed things so that each thread has its own 'main' stack plus + // an arbitrary number of nested stacks (normally referenced via + // m_curr). Also, there may be 'free-floating' stacks in the system, + // which are Fibers that are not currently executing on any specific + // thread but are still being processed and still contain valid + // roots. + // + // To support all of this, the Context struct has been created to + // represent a stack range, and a global list of Context structs has + // been added to enable scanning of these stack ranges. The lifetime + // (and presence in the Context list) of a thread's 'main' stack will + // be equivalent to the thread's lifetime. So the Context will be + // added to the list on thread entry, and removed from the list on + // thread exit (which is essentially the same as the presence of a + // Thread object in its own global list). The lifetime of a Fiber's + // context, however, will be tied to the lifetime of the Fiber object + // itself, and Fibers are expected to add/remove their Context struct + // on construction/deletion. + + + // + // All use of the global lists should synchronize on this lock. + // + static Object slock() + { + return Thread.classinfo; + } + + + __gshared + { + Context* sm_cbeg; + size_t sm_clen; + + Thread sm_tbeg; + size_t sm_tlen; + } + + // + // Used for ordering threads in the global thread list. + // + Thread prev; + Thread next; + + + /////////////////////////////////////////////////////////////////////////// + // Global Context List Operations + /////////////////////////////////////////////////////////////////////////// + + + // + // Add a context to the global context list. 
+ // + static void add( Context* c ) + in + { + assert( c ); + assert( !c.next && !c.prev ); + } + body + { + synchronized( slock ) + { + if( sm_cbeg ) + { + c.next = sm_cbeg; + sm_cbeg.prev = c; + } + sm_cbeg = c; + ++sm_clen; + } + } + + + // + // Remove a context from the global context list. + // + static void remove( Context* c ) + in + { + assert( c ); + assert( c.next || c.prev ); + } + body + { + synchronized( slock ) + { + if( c.prev ) + c.prev.next = c.next; + if( c.next ) + c.next.prev = c.prev; + if( sm_cbeg == c ) + sm_cbeg = c.next; + --sm_clen; + } + // NOTE: Don't null out c.next or c.prev because opApply currently + // follows c.next after removing a node. This could be easily + // addressed by simply returning the next node from this + // function, however, a context should never be re-added to the + // list anyway and having next and prev be non-null is a good way + // to ensure that. + } + + + /////////////////////////////////////////////////////////////////////////// + // Global Thread List Operations + /////////////////////////////////////////////////////////////////////////// + + + // + // Add a thread to the global thread list. + // + static void add( Thread t ) + in + { + assert( t ); + assert( !t.next && !t.prev ); + assert( t.isRunning ); + } + body + { + synchronized( slock ) + { + if( sm_tbeg ) + { + t.next = sm_tbeg; + sm_tbeg.prev = t; + } + sm_tbeg = t; + ++sm_tlen; + } + } + + + // + // Remove a thread from the global thread list. + // + static void remove( Thread t ) + in + { + assert( t ); + assert( t.next || t.prev ); + version( Windows ) + { + // NOTE: This doesn't work for Posix as m_isRunning must be set to + // false after the thread is removed during normal execution. + assert( !t.isRunning ); + } + } + body + { + synchronized( slock ) + { + // NOTE: When a thread is removed from the global thread list its + // main context is invalid and should be removed as well. 
+ // It is possible that t.m_curr could reference more + // than just the main context if the thread exited abnormally + // (if it was terminated), but we must assume that the user + // retains a reference to them and that they may be re-used + // elsewhere. Therefore, it is the responsibility of any + // object that creates contexts to clean them up properly + // when it is done with them. + remove( &t.m_main ); + + if( t.prev ) + t.prev.next = t.next; + if( t.next ) + t.next.prev = t.prev; + if( sm_tbeg == t ) + sm_tbeg = t.next; + --sm_tlen; + } + // NOTE: Don't null out t.next or t.prev because opApply currently + // follows t.next after removing a node. This could be easily + // addressed by simply returning the next node from this + // function, however, a thread should never be re-added to the + // list anyway and having next and prev be non-null is a good way + // to ensure that. + } +} + + +/////////////////////////////////////////////////////////////////////////////// +// GC Support Routines +/////////////////////////////////////////////////////////////////////////////// + + +/** + * Initializes the thread module. This function must be called by the + * garbage collector on startup and before any other thread routines + * are called. + */ +extern (C) void thread_init() +{ + // NOTE: If thread_init itself performs any allocations then the thread + // routines reserved for garbage collector use may be called while + // thread_init is being processed. However, since no memory should + // exist to be scanned at this point, it is sufficient for these + // functions to detect the condition and return immediately. 
+ + version( Windows ) + { + Thread.sm_this = TlsAlloc(); + assert( Thread.sm_this != TLS_OUT_OF_INDEXES ); + } + else version( Posix ) + { + int status; + sigaction_t sigusr1 = void; + sigaction_t sigusr2 = void; + + // This is a quick way to zero-initialize the structs without using + // memset or creating a link dependency on their static initializer. + (cast(byte*) &sigusr1)[0 .. sigaction_t.sizeof] = 0; + (cast(byte*) &sigusr2)[0 .. sigaction_t.sizeof] = 0; + + // NOTE: SA_RESTART indicates that system calls should restart if they + // are interrupted by a signal, but this is not available on all + // Posix systems, even those that support multithreading. + static if( is( typeof( SA_RESTART ) ) ) + sigusr1.sa_flags = SA_RESTART; + else + sigusr1.sa_flags = 0; + sigusr1.sa_handler = &thread_suspendHandler; + // NOTE: We want to ignore all signals while in this handler, so fill + // sa_mask to indicate this. + status = sigfillset( &sigusr1.sa_mask ); + assert( status == 0 ); + + // NOTE: Since SIGUSR2 should only be issued for threads within the + // suspend handler, we don't want this signal to trigger a + // restart. + sigusr2.sa_flags = 0; + sigusr2.sa_handler = &thread_resumeHandler; + // NOTE: We want to ignore all signals while in this handler, so fill + // sa_mask to indicate this. + status = sigfillset( &sigusr2.sa_mask ); + assert( status == 0 ); + + status = sigaction( SIGUSR1, &sigusr1, null ); + assert( status == 0 ); + + status = sigaction( SIGUSR2, &sigusr2, null ); + assert( status == 0 ); + + status = sem_init( &suspendCount, 0, 0 ); + assert( status == 0 ); + + status = pthread_key_create( &Thread.sm_this, null ); + assert( status == 0 ); + } + + thread_attachThis(); +} + + +/** + * Registers the calling thread for use with the D Runtime. If this routine + * is called for a thread which is already registered, the result is undefined. 
+ */ +extern (C) void thread_attachThis() +{ + version( Windows ) + { + Thread thisThread = new Thread(); + Thread.Context* thisContext = &thisThread.m_main; + assert( thisContext == thisThread.m_curr ); + + thisThread.m_addr = GetCurrentThreadId(); + thisThread.m_hndl = GetCurrentThreadHandle(); + thisContext.bstack = getStackBottom(); + thisContext.tstack = thisContext.bstack; + + thisThread.m_isDaemon = true; + + Thread.setThis( thisThread ); + } + else version( Posix ) + { + Thread thisThread = new Thread(); + Thread.Context* thisContext = thisThread.m_curr; + assert( thisContext == &thisThread.m_main ); + + thisThread.m_addr = pthread_self(); + thisContext.bstack = getStackBottom(); + thisContext.tstack = thisContext.bstack; + + thisThread.m_isRunning = true; + thisThread.m_isDaemon = true; + + Thread.setThis( thisThread ); + } + version( OSX ) + { + thisThread.m_tmach = pthread_mach_thread_np( thisThread.m_addr ); + assert( thisThread.m_tmach != thisThread.m_tmach.init ); + } + + Thread.add( thisThread ); + Thread.add( thisContext ); +} + + +/** + * Deregisters the calling thread from use with the runtime. If this routine + * is called for a thread which is not already registered, the result is undefined. + */ +extern (C) void thread_detachThis() +{ + Thread.remove( Thread.getThis() ); +} + + +/** + * Joins all non-daemon threads that are currently running. This is done by + * performing successive scans through the thread list until a scan consists + * of only daemon threads. + */ +extern (C) void thread_joinAll() +{ + + while( true ) + { + Thread nonDaemon = null; + + foreach( t; Thread ) + { + if( !t.isDaemon ) + { + nonDaemon = t; + break; + } + } + if( nonDaemon is null ) + return; + nonDaemon.join(); + } +} + + +/** + * Performs intermediate shutdown of the thread module. + */ +static ~this() +{ + // NOTE: The functionality related to garbage collection must be minimally + // operable after this dtor completes. 
Therefore, only minimal + // cleanup may occur. + + for( Thread t = Thread.sm_tbeg; t; t = t.next ) + { + if( !t.isRunning ) + Thread.remove( t ); + } +} + + +// Used for needLock below +private __gshared bool multiThreadedFlag = false; + + +/** + * This function is used to determine whether the process is + * multi-threaded. Optimizations may only be performed on this + * value if the programmer can guarantee that no path from the + * enclosed code will start a thread. + * + * Returns: + * True if Thread.start() has been called in this process. + */ +extern (C) bool thread_needLock() +{ + return multiThreadedFlag; +} + + +// Used for suspendAll/resumeAll below +private __gshared uint suspendDepth = 0; + + +/** + * Suspend all threads but the calling thread for "stop the world" garbage + * collection runs. This function may be called multiple times, and must + * be followed by a matching number of calls to thread_resumeAll before + * processing is resumed. + * + * Throws: + * ThreadException if the suspend operation fails for a running thread. + */ +extern (C) void thread_suspendAll() +{ + /** + * Suspend the specified thread and load stack and register information for + * use by thread_scanAll. If the supplied thread is the calling thread, + * stack and register information will be loaded but the thread will not + * be suspended. If the suspend operation fails and the thread is not + * running then it will be removed from the global thread list, otherwise + * an exception will be thrown. + * + * Params: + * t = The thread to suspend. + * + * Throws: + * ThreadException if the suspend operation fails for a running thread. 
+ */ + void suspend( Thread t ) + { + version( Windows ) + { + if( t.m_addr != GetCurrentThreadId() && SuspendThread( t.m_hndl ) == 0xFFFFFFFF ) + { + if( !t.isRunning ) + { + Thread.remove( t ); + return; + } + throw new ThreadException( "Unable to suspend thread" ); + } + + CONTEXT context = void; + context.ContextFlags = CONTEXT_INTEGER | CONTEXT_CONTROL; + + if( !GetThreadContext( t.m_hndl, &context ) ) + throw new ThreadException( "Unable to load thread context" ); + + version( X86 ) + { + if( !t.m_lock ) + t.m_curr.tstack = cast(void*) context.Esp; + // eax,ebx,ecx,edx,edi,esi,ebp,esp + t.m_reg[0] = context.Eax; + t.m_reg[1] = context.Ebx; + t.m_reg[2] = context.Ecx; + t.m_reg[3] = context.Edx; + t.m_reg[4] = context.Edi; + t.m_reg[5] = context.Esi; + t.m_reg[6] = context.Ebp; + t.m_reg[7] = context.Esp; + } + else + { + static assert( "Architecture not supported." ); + } + } + else version( OSX ) + { + if( t.m_addr != pthread_self() && thread_suspend( t.m_tmach ) != KERN_SUCCESS ) + { + if( !t.isRunning ) + { + Thread.remove( t ); + return; + } + throw new ThreadException( "Unable to suspend thread" ); + } + + version( X86 ) + { + x86_thread_state32_t state = void; + mach_msg_type_number_t count = x86_THREAD_STATE32_COUNT; + + if( thread_get_state( t.m_tmach, x86_THREAD_STATE32, &state, &count ) != KERN_SUCCESS ) + throw new ThreadException( "Unable to load thread state" ); + if( !t.m_lock ) + t.m_curr.tstack = cast(void*) state.esp; + // eax,ebx,ecx,edx,edi,esi,ebp,esp + t.m_reg[0] = state.eax; + t.m_reg[1] = state.ebx; + t.m_reg[2] = state.ecx; + t.m_reg[3] = state.edx; + t.m_reg[4] = state.edi; + t.m_reg[5] = state.esi; + t.m_reg[6] = state.ebp; + t.m_reg[7] = state.esp; + } + else version( X86_64 ) + { + x86_thread_state64_t state = void; + mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; + + if( thread_get_state( t.m_tmach, x86_THREAD_STATE64, &state, &count ) != KERN_SUCCESS ) + throw new ThreadException( "Unable to load thread state" ); + if( 
!t.m_lock ) + t.m_curr.tstack = cast(void*) state.rsp; + // rax,rbx,rcx,rdx,rdi,rsi,rbp,rsp + t.m_reg[0] = state.rax; + t.m_reg[1] = state.rbx; + t.m_reg[2] = state.rcx; + t.m_reg[3] = state.rdx; + t.m_reg[4] = state.rdi; + t.m_reg[5] = state.rsi; + t.m_reg[6] = state.rbp; + t.m_reg[7] = state.rsp; + // r8,r9,r10,r11,r12,r13,r14,r15 + t.m_reg[8] = state.r8; + t.m_reg[9] = state.r9; + t.m_reg[10] = state.r10; + t.m_reg[11] = state.r11; + t.m_reg[12] = state.r12; + t.m_reg[13] = state.r13; + t.m_reg[14] = state.r14; + t.m_reg[15] = state.r15; + } + else + { + static assert( "Architecture not supported." ); + } + } + else version( Posix ) + { + if( t.m_addr != pthread_self() ) + { + if( pthread_kill( t.m_addr, SIGUSR1 ) != 0 ) + { + if( !t.isRunning ) + { + Thread.remove( t ); + return; + } + throw new ThreadException( "Unable to suspend thread" ); + } + // NOTE: It's really not ideal to wait for each thread to + // signal individually -- rather, it would be better to + // suspend them all and wait once at the end. However, + // semaphores don't really work this way, and the obvious + // alternative (looping on an atomic suspend count) + // requires either the atomic module (which only works on + // x86) or other specialized functionality. It would + // also be possible to simply loop on sem_wait at the + // end, but I'm not convinced that this would be much + // faster than the current approach. + sem_wait( &suspendCount ); + } + else if( !t.m_lock ) + { + t.m_curr.tstack = getStackTop(); + } + } + } + + + // NOTE: We've got an odd chicken & egg problem here, because while the GC + // is required to call thread_init before calling any other thread + // routines, thread_init may allocate memory which could in turn + // trigger a collection. 
Thus, thread_suspendAll, thread_scanAll, + // and thread_resumeAll must be callable before thread_init + // completes, with the assumption that no other GC memory has yet + // been allocated by the system, and thus there is no risk of losing + // data if the global thread list is empty. The check of + // Thread.sm_tbeg below is done to ensure thread_init has completed, + // and therefore that calling Thread.getThis will not result in an + // error. For the short time when Thread.sm_tbeg is null, there is + // no reason not to simply call the multithreaded code below, with + // the expectation that the foreach loop will never be entered. + if( !multiThreadedFlag && Thread.sm_tbeg ) + { + if( ++suspendDepth == 1 ) + suspend( Thread.getThis() ); + return; + } + synchronized( Thread.slock ) + { + if( ++suspendDepth > 1 ) + return; + + // NOTE: I'd really prefer not to check isRunning within this loop but + // not doing so could be problematic if threads are terminated + // abnormally and a new thread is created with the same thread + // address before the next GC run. This situation might cause + // the same thread to be suspended twice, which would likely + // cause the second suspend to fail, the garbage collection to + // abort, and Bad Things to occur. + for( Thread t = Thread.sm_tbeg; t; t = t.next ) + { + if( t.isRunning ) + suspend( t ); + else + Thread.remove( t ); + } + + version( Posix ) + { + // wait on semaphore -- see note in suspend for + // why this is currently not implemented + } + } +} + + +/** + * Resume all threads but the calling thread for "stop the world" garbage + * collection runs. This function must be called once for each preceding + * call to thread_suspendAll before the threads are actually resumed. + * + * In: + * This routine must be preceded by a call to thread_suspendAll. + * + * Throws: + * ThreadException if the resume operation fails for a running thread. 
+ */ +extern (C) void thread_resumeAll() +in +{ + assert( suspendDepth > 0 ); +} +body +{ + /** + * Resume the specified thread and unload stack and register information. + * If the supplied thread is the calling thread, stack and register + * information will be unloaded but the thread will not be resumed. If + * the resume operation fails and the thread is not running then it will + * be removed from the global thread list, otherwise an exception will be + * thrown. + * + * Params: + * t = The thread to resume. + * + * Throws: + * ThreadException if the resume fails for a running thread. + */ + void resume( Thread t ) + { + version( Windows ) + { + if( t.m_addr != GetCurrentThreadId() && ResumeThread( t.m_hndl ) == 0xFFFFFFFF ) + { + if( !t.isRunning ) + { + Thread.remove( t ); + return; + } + throw new ThreadException( "Unable to resume thread" ); + } + + if( !t.m_lock ) + t.m_curr.tstack = t.m_curr.bstack; + t.m_reg[0 .. $] = 0; + } + else version( OSX ) + { + if( t.m_addr != pthread_self() && thread_resume( t.m_tmach ) != KERN_SUCCESS ) + { + if( !t.isRunning ) + { + Thread.remove( t ); + return; + } + throw new ThreadException( "Unable to resume thread" ); + } + + if( !t.m_lock ) + t.m_curr.tstack = t.m_curr.bstack; + t.m_reg[0 .. $] = 0; + } + else version( Posix ) + { + if( t.m_addr != pthread_self() ) + { + if( pthread_kill( t.m_addr, SIGUSR2 ) != 0 ) + { + if( !t.isRunning ) + { + Thread.remove( t ); + return; + } + throw new ThreadException( "Unable to resume thread" ); + } + } + else if( !t.m_lock ) + { + t.m_curr.tstack = t.m_curr.bstack; + } + } + } + + + // NOTE: See thread_suspendAll for the logic behind this. 
+ if( !multiThreadedFlag && Thread.sm_tbeg ) + { + if( --suspendDepth == 0 ) + resume( Thread.getThis() ); + return; + } + synchronized( Thread.slock ) + { + if( --suspendDepth > 0 ) + return; + + for( Thread t = Thread.sm_tbeg; t; t = t.next ) + { + resume( t ); + } + } +} + + +private alias void delegate( void*, void* ) scanAllThreadsFn; + + +/** + * The main entry point for garbage collection. The supplied delegate + * will be passed ranges representing both stack and register values. + * + * Params: + * scan = The scanner function. It should scan from p1 through p2 - 1. + * curStackTop = An optional pointer to the top of the calling thread's stack. + * + * In: + * This routine must be preceded by a call to thread_suspendAll. + */ +extern (C) void thread_scanAll( scanAllThreadsFn scan, void* curStackTop = null ) +in +{ + assert( suspendDepth > 0 ); +} +body +{ + Thread thisThread = null; + void* oldStackTop = null; + + if( curStackTop && Thread.sm_tbeg ) + { + thisThread = Thread.getThis(); + if( !thisThread.m_lock ) + { + oldStackTop = thisThread.m_curr.tstack; + thisThread.m_curr.tstack = curStackTop; + } + } + + scope( exit ) + { + if( curStackTop && Thread.sm_tbeg ) + { + if( !thisThread.m_lock ) + { + thisThread.m_curr.tstack = oldStackTop; + } + } + } + + // NOTE: Synchronizing on Thread.slock is not needed because this + // function may only be called after all other threads have + // been suspended from within the same lock. + for( Thread.Context* c = Thread.sm_cbeg; c; c = c.next ) + { + version( StackGrowsDown ) + { + // NOTE: We can't index past the bottom of the stack + // so don't do the "+1" for StackGrowsDown. 
+ if( c.tstack && c.tstack < c.bstack ) + scan( c.tstack, c.bstack ); + } + else + { + if( c.bstack && c.bstack < c.tstack ) + scan( c.bstack, c.tstack + 1 ); + } + } + + for( Thread t = Thread.sm_tbeg; t; t = t.next ) + { + scan( &t.m_tls[0], &t.m_tls[0] + t.m_tls.length ); + + version( Windows ) + { + scan( &t.m_reg[0], &t.m_reg[0] + t.m_reg.length ); + } + } +} + + +/////////////////////////////////////////////////////////////////////////////// +// Thread Local +/////////////////////////////////////////////////////////////////////////////// + + +/** + * This class encapsulates the operations required to initialize, access, and + * destroy thread local data. + */ +class ThreadLocal( T ) +{ + /////////////////////////////////////////////////////////////////////////// + // Initialization + /////////////////////////////////////////////////////////////////////////// + + + /** + * Initializes thread local storage for the indicated value which will be + * initialized to def for all threads. + * + * Params: + * def = The default value to return if no value has been explicitly set. + */ + this( T def = T.init ) + { + m_def = def; + m_key = Thread.createLocal(); + } + + + ~this() + { + Thread.deleteLocal( m_key ); + } + + + /////////////////////////////////////////////////////////////////////////// + // Accessors + /////////////////////////////////////////////////////////////////////////// + + + /** + * Gets the value last set by the calling thread, or def if no such value + * has been set. + * + * Returns: + * The stored value or def if no value is stored. + */ + T val() + { + Wrap* wrap = cast(Wrap*) Thread.getLocal( m_key ); + + return wrap ? wrap.val : m_def; + } + + + /** + * Copies newval to a location specific to the calling thread, and returns + * newval. + * + * Params: + * newval = The value to set. + * + * Returns: + * The value passed to this function. 
+ */ + T val( T newval ) + { + Wrap* wrap = cast(Wrap*) Thread.getLocal( m_key ); + + if( wrap is null ) + { + wrap = new Wrap; + Thread.setLocal( m_key, wrap ); + } + wrap.val = newval; + return newval; + } + + +private: + // + // A wrapper for the stored data. This is needed for determining whether + // set has ever been called for this thread (and therefore whether the + // default value should be returned) and also to flatten the differences + // between data that is smaller and larger than (void*).sizeof. The + // obvious tradeoff here is an extra per-thread allocation for each + // ThreadLocal value as compared to calling the Thread routines directly. + // + struct Wrap + { + T val; + } + + + T m_def; + uint m_key; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Thread Group +/////////////////////////////////////////////////////////////////////////////// + + +/** + * This class is intended to simplify certain common programming techniques. + */ +class ThreadGroup +{ + /** + * Creates and starts a new Thread object that executes fn and adds it to + * the list of tracked threads. + * + * Params: + * fn = The thread function. + * + * Returns: + * A reference to the newly created thread. + */ + final Thread create( void function() fn ) + { + Thread t = new Thread( fn ); + + t.start(); + synchronized( this ) + { + m_all[t] = t; + } + return t; + } + + + /** + * Creates and starts a new Thread object that executes dg and adds it to + * the list of tracked threads. + * + * Params: + * dg = The thread function. + * + * Returns: + * A reference to the newly created thread. + */ + final Thread create( void delegate() dg ) + { + Thread t = new Thread( dg ); + + t.start(); + synchronized( this ) + { + m_all[t] = t; + } + return t; + } + + + /** + * Add t to the list of tracked threads if it is not already being tracked. + * + * Params: + * t = The thread to add. + * + * In: + * t must not be null. 
+ */ + final void add( Thread t ) + in + { + assert( t ); + } + body + { + synchronized( this ) + { + m_all[t] = t; + } + } + + + /** + * Removes t from the list of tracked threads. No operation will be + * performed if t is not currently being tracked by this object. + * + * Params: + * t = The thread to remove. + * + * In: + * t must not be null. + */ + final void remove( Thread t ) + in + { + assert( t ); + } + body + { + synchronized( this ) + { + m_all.remove( t ); + } + } + + + /** + * Operates on all threads currently tracked by this object. + */ + final int opApply( int delegate( inout Thread ) dg ) + { + synchronized( this ) + { + int ret = 0; + + // NOTE: This loop relies on the knowledge that m_all uses the + // Thread object for both the key and the mapped value. + foreach( Thread t; m_all.keys ) + { + ret = dg( t ); + if( ret ) + break; + } + return ret; + } + } + + + /** + * Iteratively joins all tracked threads. This function will block add, + * remove, and opApply until it completes. + * + * Params: + * rethrow = Rethrow any unhandled exception which may have caused the + * current thread to terminate. + * + * Throws: + * Any exception not handled by the joined threads. + */ + final void joinAll( bool rethrow = true ) + { + synchronized( this ) + { + // NOTE: This loop relies on the knowledge that m_all uses the + // Thread object for both the key and the mapped value. 
+ foreach( Thread t; m_all.keys ) + { + t.join( rethrow ); + } + } + } + + +private: + Thread[Thread] m_all; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Fiber Platform Detection and Memory Allocation +/////////////////////////////////////////////////////////////////////////////// + + +private +{ + version( D_InlineAsm_X86 ) + { + version( X86_64 ) + { + + } + else + { + version( Windows ) + version = AsmX86_Win32; + else version( Posix ) + version = AsmX86_Posix; + } + } + else version( PPC ) + { + version( Posix ) + version = AsmPPC_Posix; + } + + + version( Posix ) + { + import core.sys.posix.unistd; // for sysconf + import core.sys.posix.sys.mman; // for mmap + import core.sys.posix.stdlib; // for malloc, valloc, free + + version( AsmX86_Win32 ) {} else + version( AsmX86_Posix ) {} else + version( AsmPPC_Posix ) {} else + { + // NOTE: The ucontext implementation requires architecture specific + // data definitions to operate so testing for it must be done + // by checking for the existence of ucontext_t rather than by + // a version identifier. Please note that this is considered + // an obsolescent feature according to the POSIX spec, so a + // custom solution is still preferred. 
+ import core.sys.posix.ucontext; + } + } + + const size_t PAGESIZE; +} + + +static this() +{ + static if( is( typeof( GetSystemInfo ) ) ) + { + SYSTEM_INFO info; + GetSystemInfo( &info ); + + PAGESIZE = info.dwPageSize; + assert( PAGESIZE < int.max ); + } + else static if( is( typeof( sysconf ) ) && + is( typeof( _SC_PAGESIZE ) ) ) + { + PAGESIZE = cast(size_t) sysconf( _SC_PAGESIZE ); + assert( PAGESIZE < int.max ); + } + else + { + version( PPC ) + PAGESIZE = 8192; + else + PAGESIZE = 4096; + } +} + + +/////////////////////////////////////////////////////////////////////////////// +// Fiber Entry Point and Context Switch +/////////////////////////////////////////////////////////////////////////////// + + +private +{ + extern (C) void fiber_entryPoint() + { + Fiber obj = Fiber.getThis(); + assert( obj ); + + assert( Thread.getThis().m_curr is obj.m_ctxt ); + volatile Thread.getThis().m_lock = false; + obj.m_ctxt.tstack = obj.m_ctxt.bstack; + obj.m_state = Fiber.State.EXEC; + + try + { + obj.run(); + } + catch( Object o ) + { + obj.m_unhandled = o; + } + + static if( is( ucontext_t ) ) + obj.m_ucur = &obj.m_utxt; + + obj.m_state = Fiber.State.TERM; + obj.switchOut(); + } + + + // NOTE: If AsmPPC_Posix is defined then the context switch routine will + // be defined externally until GDC supports inline PPC ASM. + version( AsmPPC_Posix ) + extern (C) void fiber_switchContext( void** oldp, void* newp ); + else + extern (C) void fiber_switchContext( void** oldp, void* newp ) + { + // NOTE: The data pushed and popped in this routine must match the + // default stack created by Fiber.initStack or the initial + // switch into a new context will fail. 
+ + version( AsmX86_Win32 ) + { + asm + { + naked; + + // save current stack state + push EBP; + mov EBP, ESP; + push EAX; + push dword ptr FS:[0]; + push dword ptr FS:[4]; + push dword ptr FS:[8]; + push EBX; + push ESI; + push EDI; + + // store oldp again with more accurate address + mov EAX, dword ptr 8[EBP]; + mov [EAX], ESP; + // load newp to begin context switch + mov ESP, dword ptr 12[EBP]; + + // load saved state from new stack + pop EDI; + pop ESI; + pop EBX; + pop dword ptr FS:[8]; + pop dword ptr FS:[4]; + pop dword ptr FS:[0]; + pop EAX; + pop EBP; + + // 'return' to complete switch + ret; + } + } + else version( AsmX86_Posix ) + { + asm + { + naked; + + // save current stack state + push EBP; + mov EBP, ESP; + push EAX; + push EBX; + push ESI; + push EDI; + + // store oldp again with more accurate address + mov EAX, dword ptr 8[EBP]; + mov [EAX], ESP; + // load newp to begin context switch + mov ESP, dword ptr 12[EBP]; + + // load saved state from new stack + pop EDI; + pop ESI; + pop EBX; + pop EAX; + pop EBP; + + // 'return' to complete switch + ret; + } + } + else static if( is( ucontext_t ) ) + { + Fiber cfib = Fiber.getThis(); + void* ucur = cfib.m_ucur; + + *oldp = &ucur; + swapcontext( **(cast(ucontext_t***) oldp), + *(cast(ucontext_t**) newp) ); + } + } +} + + +/////////////////////////////////////////////////////////////////////////////// +// Fiber +/////////////////////////////////////////////////////////////////////////////// + + +/** + * This class provides a cooperative concurrency mechanism integrated with the + * threading and garbage collection functionality. Calling a fiber may be + * considered a blocking operation that returns when the fiber yields (via + * Fiber.yield()). Execution occurs within the context of the calling thread + * so synchronization is not necessary to guarantee memory visibility so long + * as the same thread calls the fiber each time. 
Please note that there is no + * requirement that a fiber be bound to one specific thread. Rather, fibers + * may be freely passed between threads so long as they are not currently + * executing. Like threads, a new fiber thread may be created using either + * derivation or composition, as in the following example. + * + * Example: + * ---------------------------------------------------------------------- + * + * class DerivedFiber : Fiber + * { + * this() + * { + * super( &run ); + * } + * + * private : + * void run() + * { + * printf( "Derived fiber running.\n" ); + * } + * } + * + * void fiberFunc() + * { + * printf( "Composed fiber running.\n" ); + * Fiber.yield(); + * printf( "Composed fiber running.\n" ); + * } + * + * // create instances of each type + * Fiber derived = new DerivedFiber(); + * Fiber composed = new Fiber( &fiberFunc ); + * + * // call both fibers once + * derived.call(); + * composed.call(); + * printf( "Execution returned to calling context.\n" ); + * composed.call(); + * + * // since each fiber has run to completion, each should have state TERM + * assert( derived.state == Fiber.State.TERM ); + * assert( composed.state == Fiber.State.TERM ); + * + * ---------------------------------------------------------------------- + * + * Authors: Based on a design by Mikola Lysenko. + */ +class Fiber +{ + /////////////////////////////////////////////////////////////////////////// + // Initialization + /////////////////////////////////////////////////////////////////////////// + + + /** + * Initializes a fiber object which is associated with a static + * D function. + * + * Params: + * fn = The thread function. + * sz = The stack size for this fiber. + * + * In: + * fn must not be null. 
+ */ + this( void function() fn, size_t sz = PAGESIZE ) + in + { + assert( fn ); + } + body + { + m_fn = fn; + m_call = Call.FN; + m_state = State.HOLD; + allocStack( sz ); + initStack(); + } + + + /** + * Initializes a fiber object which is associated with a dynamic + * D function. + * + * Params: + * dg = The thread function. + * sz = The stack size for this fiber. + * + * In: + * dg must not be null. + */ + this( void delegate() dg, size_t sz = PAGESIZE ) + in + { + assert( dg ); + } + body + { + m_dg = dg; + m_call = Call.DG; + m_state = State.HOLD; + allocStack( sz ); + initStack(); + } + + + /** + * Cleans up any remaining resources used by this object. + */ + ~this() + { + // NOTE: A live reference to this object will exist on its associated + // stack from the first time its call() method has been called + // until its execution completes with State.TERM. Thus, the only + // times this dtor should be called are either if the fiber has + // terminated (and therefore has no active stack) or if the user + // explicitly deletes this object. The latter case is an error + // but is not easily tested for, since State.HOLD may imply that + // the fiber was just created but has never been run. There is + // not a compelling case to create a State.INIT just to offer a + // means of ensuring the user isn't violating this object's + // contract, so for now this requirement will be enforced by + // documentation only. + freeStack(); + } + + + /////////////////////////////////////////////////////////////////////////// + // General Actions + /////////////////////////////////////////////////////////////////////////// + + + /** + * Transfers execution to this fiber object. The calling context will be + * suspended until the fiber calls Fiber.yield() or until it terminates + * via an unhandled exception. + * + * Params: + * rethrow = Rethrow any unhandled exception which may have caused this + * fiber to terminate. + * + * In: + * This fiber must be in state HOLD. 
+ * + * Throws: + * Any exception not handled by the joined thread. + * + * Returns: + * Any exception not handled by this fiber if rethrow = false, null + * otherwise. + */ + final Object call( bool rethrow = true ) + in + { + assert( m_state == State.HOLD ); + } + body + { + Fiber cur = getThis(); + + static if( is( ucontext_t ) ) + m_ucur = cur ? &cur.m_utxt : &Fiber.sm_utxt; + + setThis( this ); + this.switchIn(); + setThis( cur ); + + static if( is( ucontext_t ) ) + m_ucur = null; + + // NOTE: If the fiber has terminated then the stack pointers must be + // reset. This ensures that the stack for this fiber is not + // scanned if the fiber has terminated. This is necessary to + // prevent any references lingering on the stack from delaying + // the collection of otherwise dead objects. The most notable + // being the current object, which is referenced at the top of + // fiber_entryPoint. + if( m_state == State.TERM ) + { + m_ctxt.tstack = m_ctxt.bstack; + } + if( m_unhandled ) + { + Object obj = m_unhandled; + m_unhandled = null; + if( rethrow ) + throw obj; + return obj; + } + return null; + } + + + /** + * Resets this fiber so that it may be re-used. This routine may only be + * called for fibers that have terminated, as doing otherwise could result + * in scope-dependent functionality that is not executed. Stack-based + * classes, for example, may not be cleaned up properly if a fiber is reset + * before it has terminated. + * + * In: + * This fiber must be in state TERM. + */ + final void reset() + in + { + assert( m_state == State.TERM ); + assert( m_ctxt.tstack == m_ctxt.bstack ); + } + body + { + m_state = State.HOLD; + initStack(); + m_unhandled = null; + } + + + /////////////////////////////////////////////////////////////////////////// + // General Properties + /////////////////////////////////////////////////////////////////////////// + + + /** + * A fiber may occupy one of three states: HOLD, EXEC, and TERM. 
The HOLD + * state applies to any fiber that is suspended and ready to be called. + * The EXEC state will be set for any fiber that is currently executing. + * And the TERM state is set when a fiber terminates. Once a fiber + * terminates, it must be reset before it may be called again. + */ + enum State + { + HOLD, /// + EXEC, /// + TERM /// + } + + + /** + * Gets the current state of this fiber. + * + * Returns: + * The state of this fiber as an enumerated value. + */ + final State state() + { + return m_state; + } + + + /////////////////////////////////////////////////////////////////////////// + // Actions on Calling Fiber + /////////////////////////////////////////////////////////////////////////// + + + /** + * Forces a context switch to occur away from the calling fiber. + */ + static void yield() + { + Fiber cur = getThis(); + assert( cur, "Fiber.yield() called with no active fiber" ); + assert( cur.m_state == State.EXEC ); + + static if( is( ucontext_t ) ) + cur.m_ucur = &cur.m_utxt; + + cur.m_state = State.HOLD; + cur.switchOut(); + cur.m_state = State.EXEC; + } + + + /** + * Forces a context switch to occur away from the calling fiber and then + * throws obj in the calling fiber. + * + * Params: + * obj = The object to throw. + * + * In: + * obj must not be null. + */ + static void yieldAndThrow( Object obj ) + in + { + assert( obj ); + } + body + { + Fiber cur = getThis(); + assert( cur, "Fiber.yield() called with no active fiber" ); + assert( cur.m_state == State.EXEC ); + + static if( is( ucontext_t ) ) + cur.m_ucur = &cur.m_utxt; + + cur.m_unhandled = obj; + cur.m_state = State.HOLD; + cur.switchOut(); + cur.m_state = State.EXEC; + } + + + /////////////////////////////////////////////////////////////////////////// + // Fiber Accessors + /////////////////////////////////////////////////////////////////////////// + + + /** + * Provides a reference to the calling fiber or null if no fiber is + * currently active. 
+ * + * Returns: + * The fiber object representing the calling fiber or null if no fiber + * is currently active. The result of deleting this object is undefined. + */ + static Fiber getThis() + { + version( Windows ) + { + return cast(Fiber) TlsGetValue( sm_this ); + } + else version( Posix ) + { + return cast(Fiber) pthread_getspecific( sm_this ); + } + } + + + /////////////////////////////////////////////////////////////////////////// + // Static Initialization + /////////////////////////////////////////////////////////////////////////// + + + static this() + { + version( Windows ) + { + sm_this = TlsAlloc(); + assert( sm_this != TLS_OUT_OF_INDEXES ); + } + else version( Posix ) + { + int status; + + status = pthread_key_create( &sm_this, null ); + assert( status == 0 ); + + static if( is( ucontext_t ) ) + { + status = getcontext( &sm_utxt ); + assert( status == 0 ); + } + } + } + + +private: + // + // Initializes a fiber object which has no associated executable function. + // + this() + { + m_call = Call.NO; + } + + + // + // Fiber entry point. Invokes the function or delegate passed on + // construction (if any). + // + final void run() + { + switch( m_call ) + { + case Call.FN: + m_fn(); + break; + case Call.DG: + m_dg(); + break; + default: + break; + } + } + + +private: + // + // The type of routine passed on fiber construction. + // + enum Call + { + NO, + FN, + DG + } + + + // + // Standard fiber data + // + Call m_call; + union + { + void function() m_fn; + void delegate() m_dg; + } + bool m_isRunning; + Object m_unhandled; + State m_state; + + +private: + /////////////////////////////////////////////////////////////////////////// + // Stack Management + /////////////////////////////////////////////////////////////////////////// + + + // + // Allocate a new stack for this fiber. 
+ // + final void allocStack( size_t sz ) + in + { + assert( !m_pmem && !m_ctxt ); + } + body + { + // adjust alloc size to a multiple of PAGESIZE + sz += PAGESIZE - 1; + sz -= sz % PAGESIZE; + + // NOTE: This instance of Thread.Context is dynamic so Fiber objects + // can be collected by the GC so long as no user level references + // to the object exist. If m_ctxt were not dynamic then its + // presence in the global context list would be enough to keep + // this object alive indefinitely. An alternative to allocating + // room for this struct explicitly would be to mash it into the + // base of the stack being allocated below. However, doing so + // requires too much special logic to be worthwhile. + m_ctxt = new Thread.Context; + + static if( is( typeof( VirtualAlloc ) ) ) + { + // reserve memory for stack + m_pmem = VirtualAlloc( null, + sz + PAGESIZE, + MEM_RESERVE, + PAGE_NOACCESS ); + if( !m_pmem ) + { + throw new FiberException( "Unable to reserve memory for stack" ); + } + + version( StackGrowsDown ) + { + void* stack = m_pmem + PAGESIZE; + void* guard = m_pmem; + void* pbase = stack + sz; + } + else + { + void* stack = m_pmem; + void* guard = m_pmem + sz; + void* pbase = stack; + } + + // allocate reserved stack segment + stack = VirtualAlloc( stack, + sz, + MEM_COMMIT, + PAGE_READWRITE ); + if( !stack ) + { + throw new FiberException( "Unable to allocate memory for stack" ); + } + + // allocate reserved guard page + guard = VirtualAlloc( guard, + PAGESIZE, + MEM_COMMIT, + PAGE_READWRITE | PAGE_GUARD ); + if( !guard ) + { + throw new FiberException( "Unable to create guard page for stack" ); + } + + m_ctxt.bstack = pbase; + m_ctxt.tstack = pbase; + m_size = sz; + } + else + { static if( is( typeof( mmap ) ) ) + { + m_pmem = mmap( null, + sz, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, + -1, + 0 ); + if( m_pmem == MAP_FAILED ) + m_pmem = null; + } + else static if( is( typeof( valloc ) ) ) + { + m_pmem = valloc( sz ); + } + else static if( is( 
typeof( malloc ) ) ) + { + m_pmem = malloc( sz ); + } + else + { + m_pmem = null; + } + + if( !m_pmem ) + { + throw new FiberException( "Unable to allocate memory for stack" ); + } + + version( StackGrowsDown ) + { + m_ctxt.bstack = m_pmem + sz; + m_ctxt.tstack = m_pmem + sz; + } + else + { + m_ctxt.bstack = m_pmem; + m_ctxt.tstack = m_pmem; + } + m_size = sz; + } + + Thread.add( m_ctxt ); + } + + + // + // Free this fiber's stack. + // + final void freeStack() + in + { + assert( m_pmem && m_ctxt ); + } + body + { + // NOTE: Since this routine is only ever expected to be called from + // the dtor, pointers to freed data are not set to null. + + // NOTE: m_ctxt is guaranteed to be alive because it is held in the + // global context list. + Thread.remove( m_ctxt ); + + static if( is( typeof( VirtualAlloc ) ) ) + { + VirtualFree( m_pmem, 0, MEM_RELEASE ); + } + else static if( is( typeof( mmap ) ) ) + { + munmap( m_pmem, m_size ); + } + else static if( is( typeof( valloc ) ) ) + { + free( m_pmem ); + } + else static if( is( typeof( malloc ) ) ) + { + free( m_pmem ); + } + delete m_ctxt; + } + + + // + // Initialize the allocated stack. + // + final void initStack() + in + { + assert( m_ctxt.tstack && m_ctxt.tstack == m_ctxt.bstack ); + assert( cast(size_t) m_ctxt.bstack % (void*).sizeof == 0 ); + } + body + { + void* pstack = m_ctxt.tstack; + scope( exit ) m_ctxt.tstack = pstack; + + void push( size_t val ) + { + version( StackGrowsDown ) + { + pstack -= size_t.sizeof; + *(cast(size_t*) pstack) = val; + } + else + { + pstack += size_t.sizeof; + *(cast(size_t*) pstack) = val; + } + } + + // NOTE: On OS X the stack must be 16-byte aligned according to the + // IA-32 call spec. 
+ version( OSX ) + { + version( StackGrowsDown ) + { + pstack = cast(void*)(cast(size_t)(pstack) - (cast(size_t)(pstack) & 0x0F)); + } + else + { + pstack = cast(void*)((cast(size_t)(pstack) + 0x0F) & ~cast(size_t) 0x0F); + } + } + + version( AsmX86_Win32 ) + { + push( cast(size_t) &fiber_entryPoint ); // EIP + push( 0xFFFFFFFF ); // EBP + push( 0x00000000 ); // EAX + push( 0xFFFFFFFF ); // FS:[0] + version( StackGrowsDown ) + { + push( cast(size_t) m_ctxt.bstack ); // FS:[4] + push( cast(size_t) m_ctxt.bstack - m_size ); // FS:[8] + } + else + { + push( cast(size_t) m_ctxt.bstack ); // FS:[4] + push( cast(size_t) m_ctxt.bstack + m_size ); // FS:[8] + } + push( 0x00000000 ); // EBX + push( 0x00000000 ); // ESI + push( 0x00000000 ); // EDI + } + else version( AsmX86_Posix ) + { + push( 0x00000000 ); // Pad stack for OSX + push( cast(size_t) &fiber_entryPoint ); // EIP + push( 0x00000000 ); // EBP + push( 0x00000000 ); // EAX + push( 0x00000000 ); // EBX + push( 0x00000000 ); // ESI + push( 0x00000000 ); // EDI + } + else version( AsmPPC_Posix ) + { + version( StackGrowsDown ) + { + pstack -= int.sizeof * 5; + } + else + { + pstack += int.sizeof * 5; + } + + push( cast(size_t) &fiber_entryPoint ); // link register + push( 0x00000000 ); // control register + push( 0x00000000 ); // old stack pointer + + // GPR values + version( StackGrowsDown ) + { + pstack -= int.sizeof * 20; + } + else + { + pstack += int.sizeof * 20; + } + + assert( (cast(size_t) pstack & 0x0f) == 0 ); + } + else static if( is( ucontext_t ) ) + { + getcontext( &m_utxt ); + m_utxt.uc_stack.ss_sp = m_pmem; + m_utxt.uc_stack.ss_size = m_size; + makecontext( &m_utxt, &fiber_entryPoint, 0 ); + // NOTE: If ucontext is being used then the top of the stack will + // be a pointer to the ucontext_t struct for that fiber. 
+ push( cast(size_t) &m_utxt ); + } + } + + + Thread.Context* m_ctxt; + size_t m_size; + void* m_pmem; + + static if( is( ucontext_t ) ) + { + // NOTE: The static ucontext instance is used to represent the context + // of the main application thread. + static ucontext_t sm_utxt = void; + ucontext_t m_utxt = void; + ucontext_t* m_ucur = null; + } + + +private: + /////////////////////////////////////////////////////////////////////////// + // Storage of Active Fiber + /////////////////////////////////////////////////////////////////////////// + + + // + // Sets a thread-local reference to the current fiber object. + // + static void setThis( Fiber f ) + { + version( Windows ) + { + TlsSetValue( sm_this, cast(void*) f ); + } + else version( Posix ) + { + pthread_setspecific( sm_this, cast(void*) f ); + } + } + + + __gshared Thread.TLSKey sm_this; + + +private: + /////////////////////////////////////////////////////////////////////////// + // Context Switching + /////////////////////////////////////////////////////////////////////////// + + + // + // Switches into the stack held by this fiber. + // + final void switchIn() + { + Thread tobj = Thread.getThis(); + void** oldp = &tobj.m_curr.tstack; + void* newp = m_ctxt.tstack; + + // NOTE: The order of operations here is very important. The current + // stack top must be stored before m_lock is set, and pushContext + // must not be called until after m_lock is set. This process + // is intended to prevent a race condition with the suspend + // mechanism used for garbage collection. If it is not followed, + // a badly timed collection could cause the GC to scan from the + // bottom of one stack to the top of another, or to miss scanning + // a stack that still contains valid data. The old stack pointer + // oldp will be set again before the context switch to guarantee + // that it points to exactly the correct stack location so the + // successive pop operations will succeed. 
+ *oldp = getStackTop(); + volatile tobj.m_lock = true; + tobj.pushContext( m_ctxt ); + + fiber_switchContext( oldp, newp ); + + // NOTE: As above, these operations must be performed in a strict order + // to prevent Bad Things from happening. + tobj.popContext(); + volatile tobj.m_lock = false; + tobj.m_curr.tstack = tobj.m_curr.bstack; + } + + + // + // Switches out of the current stack and into the enclosing stack. + // The current stack top is saved into m_ctxt.tstack; the target is tobj.m_curr.within.tstack. + final void switchOut() + { + Thread tobj = Thread.getThis(); + void** oldp = &m_ctxt.tstack; + void* newp = tobj.m_curr.within.tstack; + + // NOTE: The order of operations here is very important. The current + // stack top must be stored before m_lock is set, and pushContext + // must not be called until after m_lock is set. This process + // is intended to prevent a race condition with the suspend + // mechanism used for garbage collection. If it is not followed, + // a badly timed collection could cause the GC to scan from the + // bottom of one stack to the top of another, or to miss scanning + // a stack that still contains valid data. The old stack pointer + // oldp will be set again before the context switch to guarantee + // that it points to exactly the correct stack location so the + // successive pop operations will succeed. + *oldp = getStackTop(); + volatile tobj.m_lock = true; + // NOTE(review): no pushContext/popContext call is made on this path, although the note above mentions pushContext; the wording matches the note in switchIn. + fiber_switchContext( oldp, newp ); + + // NOTE: As above, these operations must be performed in a strict order + // to prevent Bad Things from happening. + volatile tobj.m_lock = false; + tobj.m_curr.tstack = tobj.m_curr.bstack; + } +} + +version (OSX) +{ + /* The Mach-O object file format does not allow for thread local storage + * declarations. So, instead we roll our own by putting tls into + * the sections __tlsdata and __tlscoal_nt. 
+ */ + /* Returns p unchanged. */ + extern (D) + void* ___tls_get_addr(void* p) + { + return p; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/common/core/threadasm.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/common/core/threadasm.S Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,143 @@ +/** + * Support code for multithreading. + * + * Copyright: Copyright Mikola Lysenko 2005 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright + * + * Copyright Digital Mars 2004 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.aApply; + +private import rt.util.utf; + +/********************************************** + */ + +// dg is D, but _aApplycd() is C +extern (D) typedef int delegate(void *) dg_t; +/* foreach (dchar; char[]): a byte without bit 7 set is passed through directly; otherwise decode(aa, i) supplies the code point and is relied on to advance i (the loop has no other increment on that path). Stops at the delegate's first non-zero result and returns it. */ +extern (C) int _aApplycd1(char[] aa, dg_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplycd1(), len = %d\n", len); + for (i = 0; i < len; ) + { dchar d; + + d = aa[i]; + if (d & 0x80) + d = decode(aa, i); + else + i++; + result = dg(cast(void *)&d); + if (result) + break; + } + return result; +} +/* foreach (dchar; wchar[]): same scheme, with any unit above 0x7F routed through decode(). */ +extern (C) int _aApplywd1(wchar[] aa, dg_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplywd1(), len = %d\n", len); + for (i = 0; i < len; ) + { dchar d; + + d = aa[i]; + if (d & ~0x7F) + d = decode(aa, i); + else + i++; + result = dg(cast(void *)&d); + if (result) + break; + } + return result; +} +/* foreach (wchar; char[]): a decoded code point above 0xFFFF is delivered as two delegate calls, high surrogate then low surrogate. */ +extern (C) int _aApplycw1(char[] aa, dg_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplycw1(), len = %d\n", len); + for (i = 0; i < len; ) + { dchar d; + wchar w; + + w = aa[i]; + if (w & 0x80) + { d = decode(aa, i); + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar)((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar)(((d - 0x10000) & 0x3FF) + 
0xDC00); + } + } + else + i++; + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} +/* foreach (char; wchar[]): a unit above 0x7F is decoded to a dchar and re-encoded with toUTF8 into a 4-byte buffer; the delegate is called once per resulting byte. */ +extern (C) int _aApplywc1(wchar[] aa, dg_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplywc1(), len = %d\n", len); + for (i = 0; i < len; ) + { dchar d; + wchar w; + char c; + + w = aa[i]; + if (w & ~0x7F) + { + char[4] buf; + + d = decode(aa, i); + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)w; + i++; + } + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} +/* foreach (char; dchar[]): values above 0x7F are expanded with toUTF8 and delivered byte by byte; others are narrowed to char. */ +extern (C) int _aApplydc1(dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplydc1(), len = %d\n", aa.length); + foreach (dchar d; aa) + { + char c; + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { + c = cast(char)d; + } + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} +/* foreach (wchar; dchar[]): values above 0xFFFF are split into a surrogate pair (0xD800 + high 10 bits, 0xDC00 + low 10 bits), one delegate call per half. */ +extern (C) int _aApplydw1(dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplydw1(), len = %d\n", aa.length); + foreach (dchar d; aa) + { + wchar w; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar)((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar)(((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} + + +/****************************************************************************/ + +// dg is D, but _aApplycd2() is C +extern (D) typedef int delegate(void *, void *) dg2_t; + +extern (C) int _aApplycd2(char[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplycd2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + + d = aa[i]; + if 
(d & 0x80) + { + n = i; + d = decode(aa, n); + n -= i; + } + else + n = 1; + result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} +/* Two-argument form: the delegate also receives &i, the index at which the current character starts. n is the distance decode() moved a copy of i (or 1 for a single unit) and is added to i by the loop header. */ +extern (C) int _aApplywd2(wchar[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplywd2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + + d = aa[i]; + if (d & ~0x7F) + { + n = i; + d = decode(aa, n); + n -= i; + } + else + n = 1; + result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} +/* char[] to wchar with index: a code point above 0xFFFF produces two delegate calls (high then low surrogate), both passing the same &i. */ +extern (C) int _aApplycw2(char[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplycw2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + wchar w; + + w = aa[i]; + if (w & 0x80) + { n = i; + d = decode(aa, n); + n -= i; + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&i, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + } + else + n = 1; + result = dg(&i, cast(void *)&w); + if (result) + break; + } + return result; +} +/* wchar[] to char with index: each UTF-8 byte of a non-ASCII character is delivered separately, all with the same &i. */ +extern (C) int _aApplywc2(wchar[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplywc2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + wchar w; + char c; + + w = aa[i]; + if (w & ~0x7F) + { + char[4] buf; + + n = i; + d = decode(aa, n); + n -= i; + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)w; + n = 1; + } + result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} + +extern (C) int _aApplydc2(dchar[] aa, dg2_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplydc2(), len = %d\n", len); + for (i = 0; i < len; i++) + { dchar d; + char c; + + d 
= aa[i]; + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)d; + } + result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} +/* dchar[] to wchar with index: the delegate is handed &j, a per-iteration copy of the foreach index, presumably so that writes through it cannot disturb the iteration -- TODO confirm. Both surrogate halves are reported with the same index. */ +extern (C) int _aApplydw2(dchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplydw2(), len = %d\n", aa.length); + foreach (size_t i, dchar d; aa) + { + wchar w; + auto j = i; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&j, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(&j, cast(void *)&w); + if (result) + break; + } + return result; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/aApplyR.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/aApplyR.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,957 @@ +/** + * This code handles decoding UTF strings for foreach_reverse loops. There are + * 6 combinations of conversions between char, wchar, and dchar, and 2 of each + * of those. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: >= 1; + c = aa[i]; + } + d |= (c & m) << j; + } + result = dg(cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcd1.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(dchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(dchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == '\U00100456'); break; + case 2: assert(d == '\u1234'); break; + case 3: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ +/* foreach_reverse (dchar; wchar[]): walks from the end; a unit in 0xDC00..0xDFFF is combined with the unit before it (which is not itself range-checked), and onUnicodeError is called when no preceding unit exists. */ +extern (C) int _aApplyRwd1(in wchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRwd1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = ((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + result = dg(cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwd1.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(dchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(dchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == '\U00100456'); break; + case 2: assert(d == '\u1234'); break; + case 3: assert(d == 'a'); break; 
+ default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ +/* foreach_reverse (wchar; char[]): steps back over UTF-8 continuation bytes, taking 6 bits from each, until a byte with both top bits set is found; m is halved per continuation byte and masks that lead byte's payload. A code point above 0xFFFF is delivered as high surrogate, then low surrogate. */ +extern (C) int _aApplyRcw1(in char[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRcw1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + wchar w; + + i--; + w = aa[i]; + if (w & 0x80) + { char c = cast(char)w; + uint j; + uint m = 0x3F; + d = 0; + while ((c & 0xC0) != 0xC0) + { if (i == 0) + onUnicodeError("Invalid UTF-8 sequence", 0); + i--; + d |= (c & 0x3F) << j; + j += 6; + m >>= 1; + c = aa[i]; + } + d |= (c & m) << j; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + } + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcw1.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(wchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(wchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xDBC1); break; + case 2: assert(d == 0xDC56); break; + case 3: assert(d == 0x1234); break; + case 4: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} + +/*****************************/ +/* foreach_reverse (char; wchar[]): characters are visited last to first, but the UTF-8 bytes of each non-ASCII character are delivered in forward order. */ +extern (C) int _aApplyRwc1(in wchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRwc1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + char c; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = 
((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + c = cast(char)d; + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwc1.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(char d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(char d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xF4); break; + case 2: assert(d == 0x80); break; + case 3: assert(d == 0x91); break; + case 4: assert(d == 0x96); break; + case 5: assert(d == 0xE1); break; + case 6: assert(d == 0x88); break; + case 7: assert(d == 0xB4); break; + case 8: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ +/* foreach_reverse (char; dchar[]): elements are taken from the end; a value above 0x7F is expanded with toUTF8 and its bytes are delivered in forward order. */ +extern (C) int _aApplyRdc1(in dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRdc1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0;) + { dchar d = aa[--i]; + char c; + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { + c = cast(char)d; + } + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdc1.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(char d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: 
assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(char d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xF4); break; + case 2: assert(d == 0x80); break; + case 3: assert(d == 0x91); break; + case 4: assert(d == 0x96); break; + case 5: assert(d == 0xE1); break; + case 6: assert(d == 0x88); break; + case 7: assert(d == 0xB4); break; + case 8: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ +/* foreach_reverse (wchar; dchar[]): a value above 0xFFFF is delivered as two calls, high surrogate (0xD800 base) first, then low surrogate (0xDC00 base). */ +extern (C) int _aApplyRdw1(in dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRdw1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d = aa[--i]; + wchar w; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdw1.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(wchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(wchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xDBC1); break; + case 2: assert(d == 0xDC56); break; + case 3: assert(d == 0x1234); break; + case 4: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} + + 
+/****************************************************************************/ +/* 2 argument versions */ + +// dg is D, but _aApplyRcd2() is C +extern (D) typedef int delegate(void *, void *) dg2_t; +/* Reverse char[] to dchar with index: the delegate's first argument is &i, which at call time holds the index of the lead byte of the sequence just decoded. */ +extern (C) int _aApplyRcd2(in char[] aa, dg2_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplyRcd2(), len = %d\n", len); + for (i = len; i != 0; ) + { dchar d; + + i--; + d = aa[i]; + if (d & 0x80) + { char c = cast(char)d; + uint j; + uint m = 0x3F; + d = 0; + while ((c & 0xC0) != 0xC0) + { if (i == 0) + onUnicodeError("Invalid UTF-8 sequence", 0); + i--; + d |= (c & 0x3F) << j; + j += 6; + m >>= 1; + c = aa[i]; + } + d |= (c & m) << j; + } + result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcd2.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(k, dchar d; s) + { + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, dchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(d == 'b'); assert(k == 8); break; + case 1: assert(d == '\U00100456'); assert(k == 4); break; + case 2: assert(d == '\u1234'); assert(k == 1); break; + case 3: assert(d == 'a'); assert(k == 0); break; + default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ +/* Reverse wchar[] to dchar with index: for a surrogate pair, &i refers to the earlier (first) unit of the pair when the delegate is called. */ +extern (C) int _aApplyRwd2(in wchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRwd2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = ((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + 
result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwd2.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(k, dchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, dchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 4); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == '\U00100456'); break; + case 2: assert(k == 1); assert(d == '\u1234'); break; + case 3: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ +/* Reverse char[] to wchar with index: both halves of a surrogate pair are reported with the same index, the position of the sequence's lead byte. */ +extern (C) int _aApplyRcw2(in char[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRcw2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + wchar w; + + i--; + w = aa[i]; + if (w & 0x80) + { char c = cast(char)w; + uint j; + uint m = 0x3F; + d = 0; + while ((c & 0xC0) != 0xC0) + { if (i == 0) + onUnicodeError("Invalid UTF-8 sequence", 0); + i--; + d |= (c & 0x3F) << j; + j += 6; + m >>= 1; + c = aa[i]; + } + d |= (c & m) << j; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&i, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + } + result = dg(&i, cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcw2.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + 
switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 8); assert(d == 'b'); break; + case 1: assert(k == 4); assert(d == 0xDBC1); break; + case 2: assert(k == 4); assert(d == 0xDC56); break; + case 3: assert(k == 1); assert(d == 0x1234); break; + case 4: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} + +/*****************************/ +/* Reverse wchar[] to char with index: every UTF-8 byte produced for one character is reported with the same index i, and the bytes are delivered in forward order. */ +extern (C) int _aApplyRwc2(in wchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRwc2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + char c; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = ((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + c = cast(char)d; + result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwc2.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + 
case 0: assert(k == 4); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == 0xF4); break; + case 2: assert(k == 2); assert(d == 0x80); break; + case 3: assert(k == 2); assert(d == 0x91); break; + case 4: assert(k == 2); assert(d == 0x96); break; + case 5: assert(k == 1); assert(d == 0xE1); break; + case 6: assert(k == 1); assert(d == 0x88); break; + case 7: assert(k == 1); assert(d == 0xB4); break; + case 8: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ +/* Reverse dchar[] to char with index: i is the element index; all UTF-8 bytes of one element are reported with that same index. */ +extern (C) int _aApplyRdc2(in dchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRdc2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d = aa[--i]; + char c; + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)d; + } + result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdc2.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 3); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == 0xF4); break; + case 2: assert(k == 2); assert(d == 0x80); break; + case 3: assert(k == 2); assert(d == 0x91); break; + case 4: assert(k == 2); assert(d == 0x96); break; + case 5: assert(k == 1); assert(d == 0xE1); break; + case 6: assert(k == 1); assert(d 
== 0x88); break; + case 7: assert(k == 1); assert(d == 0xB4); break; + case 8: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ +/* Reverse dchar[] to wchar with index: i is the element index; both surrogate halves of one element are reported with that same index, high half first. */ +extern (C) int _aApplyRdw2(in dchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRdw2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d = aa[--i]; + wchar w; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&i, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(&i, cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdw2.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 3); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == 0xDBC1); break; + case 2: assert(k == 2); assert(d == 0xDC56); break; + case 3: assert(k == 1); assert(d == 0x1234); break; + case 4: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/aaA.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/aaA.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,872 @@ +/** + * Implementation of associative arrays. + * + * Copyright: Copyright Digital Mars 2000 - 2009. 
+ * License: aa.a.b.length * 4) + { + //printf("rehash\n"); + _aaRehash(aa,keyti); + } + +Lret: + return cast(void *)(e + 1) + keysize; +} + + +/************************************************* + * Get pointer to value in associative array indexed by key. + * Returns null if it is not already there. The key is read from the variadic arguments immediately following valuesize; each bucket is searched as a binary tree ordered by hash, then by keyti.compare. + */ + +void* _aaGetRvalue(AA aa, TypeInfo keyti, size_t valuesize, ...) +{ + //printf("_aaGetRvalue(valuesize = %u)\n", valuesize); + if (!aa.a) + return null; + + auto pkey = cast(void *)(&valuesize + 1); + auto keysize = aligntsize(keyti.tsize()); + auto len = aa.a.b.length; + + if (len) + { + auto key_hash = keyti.getHash(pkey); + //printf("hash = %d\n", key_hash); + size_t i = key_hash % len; + auto e = aa.a.b[i]; + while (e !is null) + { + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + return cast(void *)(e + 1) + keysize; + e = (c < 0) ? e.left : e.right; + } + else + e = (key_hash < e.hash) ? e.left : e.right; + } + } + return null; // not found, caller will throw exception +} + + +/************************************************* + * Determine if key is in aa. The key is read from the variadic arguments immediately following keyti. + * Returns: + * null not in aa + * !=null in aa, return pointer to value + */ + +void* _aaIn(AA aa, TypeInfo keyti, ...) +in +{ +} +out (result) +{ + //assert(result == 0 || result == 1); +} +body +{ + if (aa.a) + { + auto pkey = cast(void *)(&keyti + 1); + + //printf("_aaIn(), .length = %d, .ptr = %x\n", aa.a.length, cast(uint)aa.a.ptr); + auto len = aa.a.b.length; + + if (len) + { + auto key_hash = keyti.getHash(pkey); + //printf("hash = %d\n", key_hash); + size_t i = key_hash % len; + auto e = aa.a.b[i]; + while (e !is null) + { + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + return cast(void *)(e + 1) + aligntsize(keyti.tsize()); + e = (c < 0) ? e.left : e.right; + } + else + e = (key_hash < e.hash) ? 
e.left : e.right; + } + } + } + + // Not found + return null; +} + +/************************************************* + * Delete key entry in aa[]. + * If key is not in aa[], do nothing. A node with two children is replaced by its left child, and its right subtree is re-attached at the end of that child's right-hand chain. + */ + +void _aaDel(AA aa, TypeInfo keyti, ...) +{ + auto pkey = cast(void *)(&keyti + 1); + aaA *e; + + if (aa.a && aa.a.b.length) + { + auto key_hash = keyti.getHash(pkey); + //printf("hash = %d\n", key_hash); + size_t i = key_hash % aa.a.b.length; + auto pe = &aa.a.b[i]; + while ((e = *pe) !is null) // null means not found + { + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + { + if (!e.left && !e.right) + { + *pe = null; + } + else if (e.left && !e.right) + { + *pe = e.left; + e.left = null; + } + else if (!e.left && e.right) + { + *pe = e.right; + e.right = null; + } + else + { + *pe = e.left; + e.left = null; + do + pe = &(*pe).right; + while (*pe); + *pe = e.right; + e.right = null; + } + // e is now unlinked from the tree: adjust the node count and release it + aa.a.nodes--; + gc_free(e); + break; + } + pe = (c < 0) ? &e.left : &e.right; + } + else + pe = (key_hash < e.hash) ? &e.left : &e.right; + } + } +} + + +/******************************************** + * Produce array of values from aa. Each value is copied with memcpy from the bytes following the node header and the key; the result block is allocated NO_SCAN when valuesize is smaller than a pointer. + */ + +ArrayRet_t _aaValues(AA aa, size_t keysize, size_t valuesize) +in +{ + assert(keysize == aligntsize(keysize)); +} +body +{ + size_t resi; + Array a; + + void _aaValues_x(aaA* e) + { + do + { + memcpy(a.ptr + resi * valuesize, + cast(byte*)e + aaA.sizeof + keysize, + valuesize); + resi++; + if (e.left) + { if (!e.right) + { e = e.left; + continue; + } + _aaValues_x(e.left); + } + e = e.right; + } while (e !is null); + } + + if (aa.a) + { + a.length = _aaLen(aa); + a.ptr = cast(byte*) gc_malloc(a.length * valuesize, + valuesize < (void*).sizeof ? BlkAttr.NO_SCAN : 0); + resi = 0; + foreach (e; aa.a.b) + { + if (e) + _aaValues_x(e); + } + assert(resi == a.length); + } + return *cast(ArrayRet_t*)(&a); +} + + +/******************************************** + * Rehash an array. 
+ */ + +void* _aaRehash(AA* paa, TypeInfo keyti) +in +{ + //_aaInvAh(paa); +} +out (result) +{ + //_aaInvAh(result); +} +body +{ + BB newb; + /* newb collects the rebuilt table. _aaRehash_x detaches olde from its children, inserts it into newb, then continues with the children, recursing only for the right child when both exist. */ + void _aaRehash_x(aaA* olde) + { + while (1) + { + auto left = olde.left; + auto right = olde.right; + olde.left = null; + olde.right = null; + + aaA *e; + + //printf("rehash %p\n", olde); + auto key_hash = olde.hash; + size_t i = key_hash % newb.b.length; + auto pe = &newb.b[i]; + while ((e = *pe) !is null) + { + //printf("\te = %p, e.left = %p, e.right = %p\n", e, e.left, e.right); + assert(e.left != e); + assert(e.right != e); + if (key_hash == e.hash) + { + auto c = keyti.compare(olde + 1, e + 1); + assert(c != 0); + pe = (c < 0) ? &e.left : &e.right; + } + else + pe = (key_hash < e.hash) ? &e.left : &e.right; + } + *pe = olde; + + if (right) + { + if (!left) + { olde = right; + continue; + } + _aaRehash_x(right); + } + if (!left) + break; + olde = left; + } + } + + //printf("Rehash\n"); + if (paa.a) + { + auto aa = paa.a; + auto len = _aaLen(*paa); + if (len) + { size_t i; + + for (i = 0; i < prime_list.length - 1; i++) + { + if (len <= prime_list[i]) + break; + } + len = prime_list[i]; + newb.b = new aaA*[len]; + + foreach (e; aa.b) + { + if (e) + _aaRehash_x(e); + } + delete aa.b; + + newb.nodes = aa.nodes; + newb.keyti = aa.keyti; + } + /* NOTE(review): this assignment also runs when len == 0, in which case newb is still default-initialised -- confirm that overwriting the table (including keyti) is intended. */ + *paa.a = newb; + _aaBalance(paa); + } + return (*paa).a; +} + +/******************************************** + * Balance an array. Each bucket's tree is flattened in order into a scratch array and then rebuilt by recursive midpoint splitting. + */ + +void _aaBalance(AA* paa) +{ + //printf("_aaBalance()\n"); + if (paa.a) + { + aaA*[16] tmp; + aaA*[] array = tmp; + auto aa = paa.a; + foreach (j, e; aa.b) + { + /* Temporarily store contents of bucket in array[] + */ + size_t k = 0; + void addToArray(aaA* e) + { + while (e) + { addToArray(e.left); + if (k == array.length) + array.length = array.length * 2; + array[k++] = e; + e = e.right; + } + } + addToArray(e); + /* The contents of the bucket are now sorted into array[]. + * Rebuild the tree. 
+ */ + void buildTree(aaA** p, size_t x1, size_t x2) + { + if (x1 >= x2) + *p = null; + else + { auto mid = (x1 + x2) >> 1; + *p = array[mid]; + buildTree(&(*p).left, x1, mid); + buildTree(&(*p).right, mid + 1, x2); + } + } + auto p = &aa.b[j]; + buildTree(p, 0, k); + } + } +} +/******************************************** + * Produce array of N byte keys from aa. + */ + +ArrayRet_t _aaKeys(AA aa, size_t keysize) +{ + byte[] res; + size_t resi; + + void _aaKeys_x(aaA* e) + { + do + { + memcpy(&res[resi * keysize], cast(byte*)(e + 1), keysize); + resi++; + if (e.left) + { if (!e.right) + { e = e.left; + continue; + } + _aaKeys_x(e.left); + } + e = e.right; + } while (e !is null); + } + + auto len = _aaLen(aa); + if (!len) + return 0; + res = (cast(byte*) gc_malloc(len * keysize, + !(aa.a.keyti.flags() & 1) ? BlkAttr.NO_SCAN : 0))[0 .. len * keysize]; + resi = 0; + foreach (e; aa.a.b) + { + if (e) + _aaKeys_x(e); + } + assert(resi == len); + + Array a; + a.length = len; + a.ptr = res.ptr; + return *cast(ArrayRet_t*)(&a); +} + + +/********************************************** + * 'apply' for associative arrays - to support foreach + */ + +// dg is D, but _aaApply() is C +extern (D) typedef int delegate(void *) dg_t; + +int _aaApply(AA aa, size_t keysize, dg_t dg) +in +{ + assert(aligntsize(keysize) == keysize); +} +body +{ int result; + + //printf("_aaApply(aa = x%llx, keysize = %d, dg = x%llx)\n", aa.a, keysize, dg); + + int treewalker(aaA* e) + { int result; + + do + { + //printf("treewalker(e = %p, dg = x%llx)\n", e, dg); + result = dg(cast(void *)(e + 1) + keysize); + if (result) + break; + if (e.right) + { if (!e.left) + { + e = e.right; + continue; + } + result = treewalker(e.right); + if (result) + break; + } + e = e.left; + } while (e); + + return result; + } + + if (aa.a) + { + foreach (e; aa.a.b) + { + if (e) + { + result = treewalker(e); + if (result) + break; + } + } + } + return result; +} + +// dg is D, but _aaApply2() is C +extern (D) typedef int 
delegate(void *, void *) dg2_t; + +int _aaApply2(AA aa, size_t keysize, dg2_t dg) +in +{ + assert(aligntsize(keysize) == keysize); +} +body +{ int result; + + //printf("_aaApply(aa = x%llx, keysize = %d, dg = x%llx)\n", aa.a, keysize, dg); + + int treewalker(aaA* e) + { int result; + + do + { + //printf("treewalker(e = %p, dg = x%llx)\n", e, dg); + result = dg(cast(void *)(e + 1), cast(void *)(e + 1) + keysize); + if (result) + break; + if (e.right) + { if (!e.left) + { + e = e.right; + continue; + } + result = treewalker(e.right); + if (result) + break; + } + e = e.left; + } while (e); + + return result; + } + + if (aa.a) + { + foreach (e; aa.a.b) + { + if (e) + { + result = treewalker(e); + if (result) + break; + } + } + } + return result; +} + + +/*********************************** + * Construct an associative array of type ti from + * length pairs of key/value pairs. + */ + +extern (C) +BB* _d_assocarrayliteralT(TypeInfo_AssociativeArray ti, size_t length, ...) +{ + auto valuesize = ti.next.tsize(); // value size + auto keyti = ti.key; + auto keysize = keyti.tsize(); // key size + BB* result; + + //printf("_d_assocarrayliteralT(keysize = %d, valuesize = %d, length = %d)\n", keysize, valuesize, length); + //printf("tivalue = %.*s\n", ti.next.classinfo.name); + if (length == 0 || valuesize == 0 || keysize == 0) + { + ; + } + else + { + va_list q; + va_start!(size_t)(q, length); + + result = new BB(); + result.keyti = keyti; + size_t i; + + for (i = 0; i < prime_list.length - 1; i++) + { + if (length <= prime_list[i]) + break; + } + auto len = prime_list[i]; + result.b = new aaA*[len]; + + size_t keystacksize = (keysize + int.sizeof - 1) & ~(int.sizeof - 1); + size_t valuestacksize = (valuesize + int.sizeof - 1) & ~(int.sizeof - 1); + + size_t keytsize = aligntsize(keysize); + + for (size_t j = 0; j < length; j++) + { void* pkey = q; + q += keystacksize; + void* pvalue = q; + q += valuestacksize; + aaA* e; + + auto key_hash = keyti.getHash(pkey); + //printf("hash 
= %d\n", key_hash); + i = key_hash % len; + auto pe = &result.b[i]; + while (1) + { + e = *pe; + if (!e) + { + // Not found, create new elem + //printf("create new one\n"); + e = cast(aaA *) cast(void*) new void[aaA.sizeof + keytsize + valuesize]; + memcpy(e + 1, pkey, keysize); + e.hash = key_hash; + *pe = e; + result.nodes++; + break; + } + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + break; + pe = (c < 0) ? &e.left : &e.right; + } + else + pe = (key_hash < e.hash) ? &e.left : &e.right; + } + memcpy(cast(void *)(e + 1) + keytsize, pvalue, valuesize); + } + + va_end(q); + } + return result; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/adi.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/adi.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,603 @@ +/** + * Implementation of dynamic array property support routines. + * + * Copyright: Copyright Digital Mars 2000 - 2009. + * License: = lo); + } + if (lo == hi) + break; + + debug(adi) printf("\tstridelo = %d, stridehi = %d\n", stridelo, stridehi); + if (stridelo == stridehi) + { + + memcpy(tmp.ptr, lo, stridelo); + memcpy(lo, hi, stridelo); + memcpy(hi, tmp.ptr, stridelo); + lo += stridelo; + hi--; + continue; + } + + /* Shift the whole array. 
This is woefully inefficient + */ + memcpy(tmp.ptr, hi, stridehi); + memcpy(tmplo.ptr, lo, stridelo); + memmove(lo + stridehi, lo + stridelo , (hi - lo) - stridelo); + memcpy(lo, tmp.ptr, stridehi); + memcpy(hi + stridehi - stridelo, tmplo.ptr, stridelo); + + lo += stridehi; + hi = hi - 1 + (stridehi - stridelo); + } + } + return *cast(long*)(&a); +} + +unittest +{ + auto a = "abcd"c[]; + + auto r = a.dup.reverse; + //writefln(r); + assert(r == "dcba"); + + a = "a\u1235\u1234c"; + //writefln(a); + r = a.dup.reverse; + //writefln(r); + assert(r == "c\u1234\u1235a"); + + a = "ab\u1234c"; + //writefln(a); + r = a.dup.reverse; + //writefln(r); + assert(r == "c\u1234ba"); + + a = "\u3026\u2021\u3061\n"; + r = a.dup.reverse; + assert(r == "\n\u3061\u2021\u3026"); +} + + +/********************************************** + * Reverse array of wchars. + * Handled separately because embedded multiword encodings should not be + * reversed. + */ + +extern (C) long _adReverseWchar(wchar[] a) +{ + if (a.length > 1) + { + wchar[2] tmp; + wchar* lo = a.ptr; + wchar* hi = &a[length - 1]; + + while (lo < hi) + { auto clo = *lo; + auto chi = *hi; + + if ((clo < 0xD800 || clo > 0xDFFF) && + (chi < 0xD800 || chi > 0xDFFF)) + { + *lo = chi; + *hi = clo; + lo++; + hi--; + continue; + } + + int stridelo = 1 + (clo >= 0xD800 && clo <= 0xDBFF); + + int stridehi = 1; + if (chi >= 0xDC00 && chi <= 0xDFFF) + { + chi = *--hi; + stridehi++; + assert(hi >= lo); + } + if (lo == hi) + break; + + if (stridelo == stridehi) + { int stmp; + + assert(stridelo == 2); + assert(stmp.sizeof == 2 * (*lo).sizeof); + stmp = *cast(int*)lo; + *cast(int*)lo = *cast(int*)hi; + *cast(int*)hi = stmp; + lo += stridelo; + hi--; + continue; + } + + /* Shift the whole array. 
This is woefully inefficient + */ + memcpy(tmp.ptr, hi, stridehi * wchar.sizeof); + memcpy(hi + stridehi - stridelo, lo, stridelo * wchar.sizeof); + memmove(lo + stridehi, lo + stridelo , (hi - (lo + stridelo)) * wchar.sizeof); + memcpy(lo, tmp.ptr, stridehi * wchar.sizeof); + + lo += stridehi; + hi = hi - 1 + (stridehi - stridelo); + } + } + return *cast(long*)(&a); +} + +unittest +{ + wstring a = "abcd"; + + auto r = a.dup.reverse; + assert(r == "dcba"); + + a = "a\U00012356\U00012346c"; + r = a.dup.reverse; + assert(r == "c\U00012346\U00012356a"); + + a = "ab\U00012345c"; + r = a.dup.reverse; + assert(r == "c\U00012345ba"); +} + + +/********************************************** + * Support for array.reverse property. + */ + +extern (C) long _adReverse(Array a, size_t szelem) +out (result) +{ + assert(result is *cast(long*)(&a)); +} +body +{ + if (a.length >= 2) + { + byte* tmp; + byte[16] buffer; + + void* lo = a.ptr; + void* hi = a.ptr + (a.length - 1) * szelem; + + tmp = buffer.ptr; + if (szelem > 16) + { + //version (Windows) + tmp = cast(byte*) alloca(szelem); + //else + //tmp = gc_malloc(szelem); + } + + for (; lo < hi; lo += szelem, hi -= szelem) + { + memcpy(tmp, lo, szelem); + memcpy(lo, hi, szelem); + memcpy(hi, tmp, szelem); + } + + version (Windows) + { + } + else + { + //if (szelem > 16) + // BUG: bad code is generate for delete pointer, tries + // to call delclass. 
+ //gc_free(tmp); + } + } + return *cast(long*)(&a); +} + +unittest +{ + debug(adi) printf("array.reverse.unittest\n"); + + int[] a = new int[5]; + int[] b; + size_t i; + + for (i = 0; i < 5; i++) + a[i] = i; + b = a.reverse; + assert(b is a); + for (i = 0; i < 5; i++) + assert(a[i] == 4 - i); + + struct X20 + { // More than 16 bytes in size + int a; + int b, c, d, e; + } + + X20[] c = new X20[5]; + X20[] d; + + for (i = 0; i < 5; i++) + { c[i].a = i; + c[i].e = 10; + } + d = c.reverse; + assert(d is c); + for (i = 0; i < 5; i++) + { + assert(c[i].a == 4 - i); + assert(c[i].e == 10); + } +} + +/********************************************** + * Sort array of chars. + */ + +extern (C) long _adSortChar(char[] a) +{ + if (a.length > 1) + { + dstring da = toUTF32(a); + da.sort; + size_t i = 0; + foreach (dchar d; da) + { char[4] buf; + auto t = toUTF8(buf, d); + a[i .. i + t.length] = t[]; + i += t.length; + } + delete da; + } + return *cast(long*)(&a); +} + +/********************************************** + * Sort array of wchars. + */ + +extern (C) long _adSortWchar(wchar[] a) +{ + if (a.length > 1) + { + dstring da = toUTF32(a); + da.sort; + size_t i = 0; + foreach (dchar d; da) + { wchar[2] buf; + auto t = toUTF16(buf, d); + a[i .. i + t.length] = t[]; + i += t.length; + } + delete da; + } + return *cast(long*)(&a); +} + +/*************************************** + * Support for array equality test. 
+ * Returns: + * 1 equal + * 0 not equal + */ + +extern (C) int _adEq(Array a1, Array a2, TypeInfo ti) +{ + debug(adi) printf("_adEq(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); + if (a1.length != a2.length) + return 0; // not equal + auto sz = ti.tsize(); + auto p1 = a1.ptr; + auto p2 = a2.ptr; + + if (sz == 1) + // We should really have a ti.isPOD() check for this + return (memcmp(p1, p2, a1.length) == 0); + + for (size_t i = 0; i < a1.length; i++) + { + if (!ti.equals(p1 + i * sz, p2 + i * sz)) + return 0; // not equal + } + return 1; // equal +} + +extern (C) int _adEq2(Array a1, Array a2, TypeInfo ti) +{ + debug(adi) printf("_adEq2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); + if (a1.length != a2.length) + return 0; // not equal + if (!ti.equals(&a1, &a2)) + return 0; + return 1; +} +unittest +{ + debug(adi) printf("array.Eq unittest\n"); + + auto a = "hello"c; + + assert(a != "hel"); + assert(a != "helloo"); + assert(a != "betty"); + assert(a == "hello"); + assert(a != "hxxxx"); +} + +/*************************************** + * Support for array compare test. + */ + +extern (C) int _adCmp(Array a1, Array a2, TypeInfo ti) +{ + debug(adi) printf("adCmp()\n"); + auto len = a1.length; + if (a2.length < len) + len = a2.length; + auto sz = ti.tsize(); + void *p1 = a1.ptr; + void *p2 = a2.ptr; + + if (sz == 1) + { // We should really have a ti.isPOD() check for this + auto c = memcmp(p1, p2, len); + if (c) + return c; + } + else + { + for (size_t i = 0; i < len; i++) + { + auto c = ti.compare(p1 + i * sz, p2 + i * sz); + if (c) + return c; + } + } + if (a1.length == a2.length) + return 0; + return (a1.length > a2.length) ? 
1 : -1; +} + +extern (C) int _adCmp2(Array a1, Array a2, TypeInfo ti) +{ + debug(adi) printf("_adCmp2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); + return ti.compare(&a1, &a2); +} +unittest +{ + debug(adi) printf("array.Cmp unittest\n"); + + auto a = "hello"c; + + assert(a > "hel"); + assert(a >= "hel"); + assert(a < "helloo"); + assert(a <= "helloo"); + assert(a > "betty"); + assert(a >= "betty"); + assert(a == "hello"); + assert(a <= "hello"); + assert(a >= "hello"); +} + +/*************************************** + * Support for array compare test. + */ + +extern (C) int _adCmpChar(Array a1, Array a2) +{ + version (X86) + { + asm + { naked ; + + push EDI ; + push ESI ; + + mov ESI,a1+4[4+ESP] ; + mov EDI,a2+4[4+ESP] ; + + mov ECX,a1[4+ESP] ; + mov EDX,a2[4+ESP] ; + + cmp ECX,EDX ; + jb GotLength ; + + mov ECX,EDX ; + +GotLength: + cmp ECX,4 ; + jb DoBytes ; + + // Do alignment if neither is dword aligned + test ESI,3 ; + jz Aligned ; + + test EDI,3 ; + jz Aligned ; +DoAlign: + mov AL,[ESI] ; //align ESI to dword bounds + mov DL,[EDI] ; + + cmp AL,DL ; + jnz Unequal ; + + inc ESI ; + inc EDI ; + + test ESI,3 ; + + lea ECX,[ECX-1] ; + jnz DoAlign ; +Aligned: + mov EAX,ECX ; + + // do multiple of 4 bytes at a time + + shr ECX,2 ; + jz TryOdd ; + + repe ; + cmpsd ; + + jnz UnequalQuad ; + +TryOdd: + mov ECX,EAX ; +DoBytes: + // if still equal and not end of string, do up to 3 bytes slightly + // slower. 
+ + and ECX,3 ; + jz Equal ; + + repe ; + cmpsb ; + + jnz Unequal ; +Equal: + mov EAX,a1[4+ESP] ; + mov EDX,a2[4+ESP] ; + + sub EAX,EDX ; + pop ESI ; + + pop EDI ; + ret ; + +UnequalQuad: + mov EDX,[EDI-4] ; + mov EAX,[ESI-4] ; + + cmp AL,DL ; + jnz Unequal ; + + cmp AH,DH ; + jnz Unequal ; + + shr EAX,16 ; + + shr EDX,16 ; + + cmp AL,DL ; + jnz Unequal ; + + cmp AH,DH ; +Unequal: + sbb EAX,EAX ; + pop ESI ; + + or EAX,1 ; + pop EDI ; + + ret ; + } + } + else + { + int len; + int c; + + debug(adi) printf("adCmpChar()\n"); + len = a1.length; + if (a2.length < len) + len = a2.length; + c = memcmp(cast(char *)a1.ptr, cast(char *)a2.ptr, len); + if (!c) + c = cast(int)a1.length - cast(int)a2.length; + return c; + } +} + +unittest +{ + debug(adi) printf("array.CmpChar unittest\n"); + + auto a = "hello"c; + + assert(a > "hel"); + assert(a >= "hel"); + assert(a < "helloo"); + assert(a <= "helloo"); + assert(a > "betty"); + assert(a >= "betty"); + assert(a == "hello"); + assert(a <= "hello"); + assert(a >= "hello"); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/alloca.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/alloca.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,135 @@ +/** + * Implementation of alloca() standard C routine. + * + * Copyright: Copyright Digital Mars 1990 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright, based on code originally written by Burton Radons + * + * Copyright Digital Mars 2008 - 2009. + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.arraybyte; + +import rt.util.cpuid; + +version (unittest) +{ + private import core.stdc.stdio : printf; + /* This is so unit tests will test every CPU variant + */ + int cpuid; + const int CPUID_MAX = 4; + bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); } + bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); } + bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); } + bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); } +} +else +{ + alias rt.util.cpuid.mmx mmx; + alias rt.util.cpuid.sse sse; + alias rt.util.cpuid.sse2 sse2; + alias rt.util.cpuid.amd3dnow amd3dnow; +} + +//version = log; + +bool disjoint(T)(T[] a, T[] b) +{ + return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); +} + +alias byte T; + +extern (C): + +/* ======================================================================== */ + + +/*********************** + * Computes: + * a[] = b[] + value + */ + +T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b) +{ + return _arraySliceExpAddSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b) +{ + return _arraySliceExpAddSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpAddSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1088% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu 
XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 1000% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startaddmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + paddb MM0, MM4; + paddb MM1, MM4; + paddb MM2, MM4; + paddb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + /* trying to be fair and treat normal 32-bit cpu the same way as we do + * the SIMD units, with unrolled asm. There's not enough registers, + * really. 
+ */ + else + if (a.length >= 4) + { + + auto n = aptr + (a.length & ~3); + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov CL, value; + + align 4; + startadd386: + add ESI, 4; + mov DX, [EAX]; + mov BX, [EAX+2]; + add EAX, 4; + add BL, CL; + add BH, CL; + add DL, CL; + add DH, CL; + mov [ESI -4], DX; + mov [ESI+2 -4], BX; + cmp ESI, EDI; + jb startadd386; + + mov aptr, ESI; + mov bptr, EAX; + } + + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + value); + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + c[] + */ + +T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceAddSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version 
(D_InlineAsm_X86) + { + // SSE2 aligned version is 5739% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + version (log) printf("\tsse2 unaligned\n"); + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startaddlsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddlsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + version (log) printf("\tsse2 aligned\n"); + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startaddlsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + movdqa XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddlsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 4428% faster + if (mmx() && a.length >= 32) + { + version (log) printf("\tmmx\n"); + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startaddlmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + movq MM4, 
[ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + paddb MM0, MM4; + paddb MM1, MM5; + paddb MM2, MM6; + paddb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddlmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + version (log) if (aptr < aend) printf("\tbase\n"); + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value + */ + +T[] _arrayExpSliceAddass_a(T[] a, T value) +{ + return _arrayExpSliceAddass_g(a, value); +} + +T[] _arrayExpSliceAddass_h(T[] a, T value) +{ + return _arrayExpSliceAddass_g(a, value); +} + +T[] _arrayExpSliceAddass_g(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1578% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if 
(((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddasssse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasssse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddasssse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasssse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 1721% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 8; + startaddassmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + paddb MM0, MM4; + paddb MM1, MM4; + paddb MM2, MM4; + paddb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddassmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte 
boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_a(T[] a, T[] b) +{ + return _arraySliceSliceAddass_g(a, b); +} + +T[] _arraySliceSliceAddass_h(T[] a, T[] b) +{ + return _arraySliceSliceAddass_g(a, b); +} + +T[] _arraySliceSliceAddass_g(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4727% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startaddasslsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasslsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startaddasslsse2a: + movdqa XMM0, [ESI]; + movdqa 
XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + movdqa XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasslsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 3059% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startaddasslmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + movq MM4, [ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + paddb MM0, MM4; + paddb MM1, MM5; + paddb MM2, MM6; + paddb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddasslmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1189% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, 
[EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 1079% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startsubmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + psubb MM0, MM4; + psubb MM1, MM4; + psubb MM2, MM4; + psubb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsubmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm. There's not enough registers, really. + else + if (a.length >= 4) + { + auto n = aptr + (a.length & ~3); + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov CL, value; + + align 4; + startsub386: + add ESI, 4; + mov DX, [EAX]; + mov BX, [EAX+2]; + add EAX, 4; + sub BL, CL; + sub BH, CL; + sub DL, CL; + sub DH, CL; + mov [ESI -4], DX; + mov [ESI+2 -4], BX; + cmp ESI, EDI; + jb startsub386; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - value); + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. 
dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] = b[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_g(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_g(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 8748% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubrsse2u: + add ESI, 64; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + psubb XMM5, XMM0; + psubb XMM6, XMM1; + movdqu [ESI -64], XMM5; + movdqu [ESI+16-64], XMM6; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + psubb XMM5, XMM2; + psubb XMM6, XMM3; + movdqu [ESI+32-64], XMM5; + movdqu [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsubrsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } 
+ else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubrsse2a: + add ESI, 64; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + psubb XMM5, XMM0; + psubb XMM6, XMM1; + movdqa [ESI -64], XMM5; + movdqa [ESI+16-64], XMM6; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + psubb XMM5, XMM2; + psubb XMM6, XMM3; + movdqa [ESI+32-64], XMM5; + movdqa [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsubrsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 7397% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startsubrmmx: + add ESI, 32; + movq MM5, MM4; + movq MM6, MM4; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + psubb MM5, MM0; + psubb MM6, MM1; + movq [ESI -32], MM5; + movq [ESI+8 -32], MM6; + movq MM5, MM4; + movq MM6, MM4; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + psubb MM5, MM2; + psubb MM6, MM3; + movq [ESI+16-32], MM5; + movq [ESI+24-32], MM6; + cmp ESI, EDI; + jb startsubrmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + + } + + while (aptr < aend) + *aptr++ = cast(T)(value - *bptr++); + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] = 6 - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - b[i])) + { + printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 5756% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startsublsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsublsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + 
startsublsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + movdqa XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsublsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 4428% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startsublmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + movq MM4, [ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + psubb MM0, MM4; + psubb MM1, MM5; + psubb MM2, MM6; + psubb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsublmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value + */ + +T[] _arrayExpSliceMinass_a(T[] a, T value) +{ + return _arrayExpSliceMinass_g(a, value); +} + +T[] _arrayExpSliceMinass_h(T[] a, T value) +{ + return _arrayExpSliceMinass_g(a, value); +} + +T[] _arrayExpSliceMinass_g(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1577% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubasssse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasssse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubasssse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa 
[ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasssse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 1577% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 8; + startsubassmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + psubb MM0, MM4; + psubb MM1, MM4; + psubb MM2, MM4; + psubb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsubassmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_a(T[] a, T[] b) +{ + return _arraySliceSliceMinass_g(a, b); +} + +T[] _arraySliceSliceMinass_h(T[] a, T[] b) +{ + return _arraySliceSliceMinass_g(a, b); +} + +T[] _arraySliceSliceMinass_g(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4800% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startsubasslsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasslsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startsubasslsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + 
movdqa XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasslsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 3107% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startsubasslmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + movq MM4, [ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + psubb MM0, MM4; + psubb MM1, MM5; + psubb MM2, MM6; + psubb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsubasslmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/arraycast.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/arraycast.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,94 @@ +/** + * Implementation of array cast support routines. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: = 8) + { + auto n = aptr + (b.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ESI, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM5; + subpd XMM2, XMM6; + subpd XMM3, XMM7; + add ECX, 64; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + while (aptr < aend) + *aptr++ = *bptr++ - *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + value + */ + +T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpAddSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 305% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM4; + addpd XMM2, XMM4; + addpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ + value; + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] + 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] + 6))
+                {
+                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] += value
+ *
+ * Params:
+ *      a     = slice that is updated in place
+ *      value = scalar added to every element of a
+ * Returns:
+ *      a
+ */
+
+T[] _arrayExpSliceAddass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 114% faster
+        if (sse2() && a.length >= 8)
+        {
+            // End of the region handled by the asm loop, counted in whole
+            // groups of 8 elements (one loop pass moves 64 bytes), the same
+            // way the sibling routines in this file compute it.  The
+            // previous bound rounded the end *address* down to 8 bytes,
+            // which let the loop run one pass too many and touch memory
+            // past the slice whenever a.length was not a multiple of 8
+            // (assuming T is an 8-byte double, as the packed-double
+            // instructions below imply).  Because a.length >= 8, n is
+            // always past aptr, so no extra guard is needed before the
+            // do-while style loop.
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;      // low lane = value
+                shufpd XMM4, XMM4, 0;   // copy the low lane into both lanes
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                addpd XMM0, XMM4;
+                addpd XMM1, XMM4;
+                addpd XMM2, XMM4;
+                addpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;          // resume the scalar loop where asm stopped
+            }
+        }
+    }
+
+    // Scalar path: the tail left by the asm loop, or the whole slice.
+    while (aptr < aend)
+        *aptr++ += value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceAddass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_d(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 183% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ECX, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM5; + addpd XMM2, XMM6; + addpd XMM3, XMM7; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 305% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM4; + subpd XMM2, XMM4; + subpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 66% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movapd XMM5, XMM4; + movapd XMM6, XMM4; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + subpd XMM5, XMM0; + subpd XMM6, XMM1; + movupd [ESI+ 0-64], XMM5; + movupd [ESI+16-64], XMM6; + movapd XMM5, XMM4; + movapd XMM6, XMM4; + subpd XMM5, XMM2; + subpd XMM6, XMM3; + movupd [ESI+32-64], XMM5; + movupd [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = value - *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = 6 - a[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(6 - a[i]))
+                {
+                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= value
+ *
+ * Params:
+ *      a     = slice that is updated in place
+ *      value = scalar subtracted from every element of a
+ * Returns:
+ *      a
+ */
+
+T[] _arrayExpSliceMinass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 115% faster
+        if (sse2() && a.length >= 8)
+        {
+            // End of the region handled by the asm loop, counted in whole
+            // groups of 8 elements (one loop pass moves 64 bytes), the same
+            // way the sibling routines in this file compute it.  The
+            // previous bound rounded the end *address* down to 8 bytes,
+            // which let the loop run one pass too many and touch memory
+            // past the slice whenever a.length was not a multiple of 8
+            // (assuming T is an 8-byte double, as the packed-double
+            // instructions below imply).  Because a.length >= 8, n is
+            // always past aptr, so no extra guard is needed before the
+            // do-while style loop.
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;      // low lane = value
+                shufpd XMM4, XMM4, 0;   // copy the low lane into both lanes
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM4;
+                subpd XMM2, XMM4;
+                subpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;          // resume the scalar loop where asm stopped
+            }
+        }
+    }
+
+    // Scalar path: the tail left by the asm loop, or the whole slice.
+    while (aptr < aend)
+        *aptr++ -= value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMinass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. 
dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] -= 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - 6))
+                {
+                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= b[]
+ *
+ * Params:
+ *      a = slice that is updated in place
+ *      b = slice subtracted element-wise from a; the in-contract requires
+ *          the same length as a and no overlap with a
+ * Returns:
+ *      a
+ */
+
+T[] _arraySliceSliceMinass_d(T[] a, T[] b)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceSliceMinass_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 183% faster
+        if (sse2() && a.length >= 8)
+        {
+            // asm loop covers whole groups of 8 elements (64 bytes per pass)
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ECX, bptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n; // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ECX, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM5;
+                subpd XMM2, XMM6;
+                subpd XMM3, XMM7;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, ECX;
+            }
+        }
+    }
+
+    // Scalar path: the tail left by the asm loop, or the whole slice.
+    while (aptr < aend)
+        *aptr++ -= *bptr++;
+
+    return a;
+}
+
+// Exercises a[] -= b[].  The previous version of this test was a copy of the
+// _arrayExpSliceMinass_d test (it printed that name and ran c[] -= 6), so the
+// slice-minus-slice routine above was never actually checked.
+unittest
+{
+    printf("_arraySliceSliceMinass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];          // keep the pre-subtraction values for the check
+            c[] -= b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - b[i]))
+                {
+                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] * value
+ *
+ * Params:
+ *      a     = destination slice
+ *      value = scalar multiplier
+ *      b     = source slice; the in-contract requires the same length as a
+ *              and no overlap with a
+ * Returns:
+ *      a
+ */
+
+T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
+in
+{
+    assert(a.length == b.length);
+    assert(disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceExpMulSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 304% faster
+        if (sse2() && a.length >= 8)
+        {
+            // asm loop covers whole groups of 8 elements (64 bytes per pass)
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    // Scalar path: the tail left by the asm loop, or the whole slice.
+    while (aptr < aend)
+        *aptr++ = *bptr++ * value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceExpMulSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 329% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add EAX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM5; + mulpd XMM2, XMM6; + mulpd XMM3, XMM7; + add ECX, 64; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // 
aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] *= value
+ *
+ * Params:
+ *      a     = slice that is multiplied in place
+ *      value = scalar factor applied to every element
+ * Returns:
+ *      a
+ */
+
+T[] _arrayExpSliceMulass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 109% faster
+        if (sse2() && a.length >= 8)
+        {
+            // The loop below consumes 64 bytes per pass, so its limit must be
+            // a whole number of 8-element passes from aptr (same expression
+            // the sibling routines use).  Rounding aend down to an 8 byte
+            // boundary instead let the final pass read and write up to
+            // 7 elements beyond the end of the slice.
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;      // broadcast value to both lanes
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+    }
+
+    // Scalar loop: whole slice without SSE2, otherwise the < 8 leftover elements
+    while (aptr < aend)
+        *aptr++ *= value;
+
+    return a;
+}
+
+unittest +{ + printf("_arrayExpSliceMulass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] *= b[]
+ *
+ * Params:
+ *      a = destination slice, multiplied in place
+ *      b = right operand; same length as a and not overlapping it
+ * Returns:
+ *      a
+ */
+
+T[] _arraySliceSliceMulass_d(T[] a, T[] b)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceSliceMulass_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 205% faster
+        if (sse2() && a.length >= 8)
+        {
+            // limit = start + largest multiple of 8 elements (64 bytes per pass)
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ECX, bptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n; // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ECX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM5;
+                mulpd XMM2, XMM6;
+                mulpd XMM3, XMM7;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, ECX;
+            }
+        }
+    }
+
+    // Scalar loop for whatever the vector loop did not cover
+    while (aptr < aend)
+        *aptr++ *= *bptr++;
+
+    return a;
+}
+
+// Exercises the slice-by-slice form (c[] *= b[]).  This test used to be a
+// copy of the scalar test above it (c[] *= 6), which left the routine it
+// belongs to without coverage.
+unittest
+{
+    printf("_arraySliceSliceMulass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];                  // keep the pre-multiplication values
+            c[] *= b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] * b[i]))
+                {
+                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] / value
+ *
+ * Params:
+ *      a     = destination slice
+ *      value = scalar divisor
+ *      b     = source slice; same length as a and not overlapping it
+ * Returns:
+ *      a
+ */
+
+T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
+in
+{
+    assert(a.length == b.length);
+    assert(disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceExpDivSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    /* Multiplying by the reciprocal is faster, but does
+     * not produce as accurate an answer.
+     */
+    T recip = cast(T)1 / value;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 299% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, recip;
+                //movsd XMM4, value
+                //rcpsd XMM4, XMM4
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                //divpd XMM0, XMM4;
+                //divpd XMM1, XMM4;
+                //divpd XMM2, XMM4;
+                //divpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    // NOTE(review): the vector loop multiplies by recip while this loop
+    // divides by value, so elements of one result can be rounded differently
+    // depending on which loop produced them.
+    while (aptr < aend)
+    {
+        *aptr++ = *bptr++ / value;
+        //*aptr++ = *bptr++ * recip;
+    }
+
+    return a;
+}
+
+unittest +{ + printf("_arraySliceExpDivSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 
67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] / 8; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]); + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] /= value + */ + +T[] _arrayExpSliceDivass_d(T[] a, T value) +{ + //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. + */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE2 version is 65% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movsd XMM4, recip; + //movsd XMM4, value + //rcpsd XMM4, XMM4 + shufpd XMM4, XMM4, 0; + + align 8; + startsseloopa: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM4; + mulpd XMM2, XMM4; + mulpd XMM3, XMM4; + //divpd XMM0, XMM4; + //divpd XMM1, XMM4; + //divpd XMM2, XMM4; + //divpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= recip; + + return a; +} + + +unittest +{ + printf("_arrayExpSliceDivass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" 
cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] /= 8; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] * value + */ + +T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAddass_d(a, -value, b); +} + +/*********************** + * Computes: + * a[] += b[] * value + */ + +T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + // Handle remainder + while (aptr < aend) + *aptr++ += *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAddass_d unittest\n"); + + cpuid = 1; + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 1; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[] * 6; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + if (c[i] != cast(T)(b[i] + a[i] * 6)) + { + printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/arrayfloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/arrayfloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,2309 @@ +/** + * Contains SSE2 and MMX versions of certain operations for float. + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = 16) + { + version (log) printf("\tsse unaligned\n"); + auto n = aptr + (b.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ESI, 64; + addps XMM0, XMM4; + addps XMM1, XMM5; + addps XMM2, XMM6; + addps XMM3, XMM7; + add ECX, 64; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + // 3DNow! 
version is only 13% faster + if (amd3dnow() && b.length >= 8) + { + version (log) printf("\tamd3dnow\n"); + auto n = aptr + (b.length & ~7); + + asm + { + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + + align 4; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfadd MM0, [ECX]; + pfadd MM1, [ECX+8]; + pfadd MM2, [ECX+16]; + pfadd MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ECX, 32; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + version (log) if (aptr < aend) printf("\tbase\n"); + while (aptr < aend) + *aptr++ = *bptr++ + *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 834% faster + if (sse() && b.length >= 16) + { + auto n = aptr + (b.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ESI, 64; + subps XMM0, XMM4; + subps XMM1, XMM5; + subps XMM2, XMM6; + subps XMM3, XMM7; + add ECX, 64; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + // 3DNow! 
version is only 13% faster + if (amd3dnow() && b.length >= 8) + { + auto n = aptr + (b.length & ~7); + + asm + { + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + + align 4; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfsub MM0, [ECX]; + pfsub MM1, [ECX+8]; + pfsub MM2, [ECX+16]; + pfsub MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ECX, 32; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + while (aptr < aend) + *aptr++ = *bptr++ - *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + value + */ + +T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpAddSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 665% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + addps XMM0, XMM4; + addps XMM1, XMM4; + addps XMM2, XMM4; + addps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 69% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM4, qword ptr [v]; + + align 8; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfadd MM0, MM4; + pfadd MM1, MM4; + pfadd MM2, MM4; + pfadd MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ + value; + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] += value
+ *
+ * Params:
+ *      a     = slice that is updated in place
+ *      value = scalar added to every element
+ * Returns:
+ *      a
+ */
+
+T[] _arrayExpSliceAddass_f(T[] a, T value)
+{
+    //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE version is 302% faster
+        if (sse() && a.length >= 16)
+        {
+            // align pointer
+            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
+            while (aptr < n)
+                *aptr++ += value;
+
+            // The aligned loop consumes 64 bytes per pass, so stop after the
+            // last complete 16-element group.  Rounding aend down to a
+            // 16 byte boundary instead let the final pass run up to 48 bytes
+            // beyond the end of the slice.
+            n = aptr + ((aend - aptr) & ~15);
+            if (aptr < n)
+
+            // Aligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movss XMM4, value;
+                shufps XMM4, XMM4, 0;      // broadcast value to all four lanes
+
+                align 8;
+            startsseloopa:
+                movaps XMM0, [ESI];
+                movaps XMM1, [ESI+16];
+                movaps XMM2, [ESI+32];
+                movaps XMM3, [ESI+48];
+                add ESI, 64;
+                addps XMM0, XMM4;
+                addps XMM1, XMM4;
+                addps XMM2, XMM4;
+                addps XMM3, XMM4;
+                movaps [ESI+ 0-64], XMM0;
+                movaps [ESI+16-64], XMM1;
+                movaps [ESI+32-64], XMM2;
+                movaps [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+        else
+        // 3DNow! version is 63% faster
+        if (amd3dnow() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // duplicate the 32 bit pattern of value into both halves of v
+            ulong w = *cast(uint *) &value;
+            ulong v = w | (w << 32L);
+
+            asm
+            {
+                mov ESI, dword ptr [aptr];
+                mov EDI, dword ptr [n];
+                movq MM4, qword ptr [v];
+
+                align 8;
+            start3dnow:
+                movq MM0, [ESI];
+                movq MM1, [ESI+8];
+                movq MM2, [ESI+16];
+                movq MM3, [ESI+24];
+                pfadd MM0, MM4;
+                pfadd MM1, MM4;
+                pfadd MM2, MM4;
+                pfadd MM3, MM4;
+                movq [ESI], MM0;
+                movq [ESI+8], MM1;
+                movq [ESI+16], MM2;
+                movq [ESI+24], MM3;
+                add ESI, 32;
+                cmp ESI, EDI;
+                jb start3dnow;
+
+                emms;
+                mov dword ptr [aptr], ESI;
+            }
+        }
+    }
+
+    // Scalar loop for whatever the vector loops did not cover
+    while (aptr < aend)
+        *aptr++ += value;
+
+    return a;
+}
+
+unittest +{ + printf("_arrayExpSliceAddass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_f(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 468% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [ESI]; + movups XMM1, [ESI+16]; + movups XMM2, [ESI+32]; + movups XMM3, [ESI+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ECX, 64; + addps XMM0, XMM4; + addps XMM1, XMM5; + addps XMM2, XMM6; + addps XMM3, XMM7; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + // 3DNow! 
version is 57% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov ECX, dword ptr [bptr]; // right operand + + align 4; + start3dnow: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfadd MM0, [ECX]; + pfadd MM1, [ECX+8]; + pfadd MM2, [ECX+16]; + pfadd MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add ECX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 622% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + subps XMM0, XMM4; + subps XMM1, XMM4; + subps XMM2, XMM4; + subps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 67% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + T[2] w; + + w[0] = w[1] = value; + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + mov EAX, dword ptr [bptr]; + movq MM4, qword ptr [w]; + + align 8; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfsub MM0, MM4; + pfsub MM1, MM4; + pfsub MM2, MM4; + pfsub MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 690% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movaps XMM5, XMM4; + movaps XMM6, XMM4; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + subps XMM5, XMM0; + subps XMM6, XMM1; + movups [ESI+ 0-64], XMM5; + movups [ESI+16-64], XMM6; + movaps XMM5, XMM4; + movaps XMM6, XMM4; + subps XMM5, XMM2; + subps XMM6, XMM3; + movups [ESI+32-64], XMM5; + movups [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 67% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM4, qword ptr [v]; + + align 8; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfsubr MM0, MM4; + pfsubr MM1, MM4; + pfsubr MM2, MM4; + pfsubr MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = value - *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = 6 - a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - a[i])) + { + printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]); + assert(0); + } + } + } + } +}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= value
+ *
+ * Params:
+ *      a     = slice that is updated in place
+ *      value = scalar subtracted from every element
+ * Returns:
+ *      a
+ */
+
+T[] _arrayExpSliceMinass_f(T[] a, T value)
+{
+    //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE version is 304% faster
+        if (sse() && a.length >= 16)
+        {
+            // align pointer
+            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
+            while (aptr < n)
+                *aptr++ -= value;
+
+            // The aligned loop consumes 64 bytes per pass, so stop after the
+            // last complete 16-element group.  Rounding aend down to a
+            // 16 byte boundary instead let the final pass run up to 48 bytes
+            // beyond the end of the slice.
+            n = aptr + ((aend - aptr) & ~15);
+            if (aptr < n)
+
+            // Aligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movss XMM4, value;
+                shufps XMM4, XMM4, 0;      // broadcast value to all four lanes
+
+                align 8;
+            startsseloopa:
+                movaps XMM0, [ESI];
+                movaps XMM1, [ESI+16];
+                movaps XMM2, [ESI+32];
+                movaps XMM3, [ESI+48];
+                add ESI, 64;
+                subps XMM0, XMM4;
+                subps XMM1, XMM4;
+                subps XMM2, XMM4;
+                subps XMM3, XMM4;
+                movaps [ESI+ 0-64], XMM0;
+                movaps [ESI+16-64], XMM1;
+                movaps [ESI+32-64], XMM2;
+                movaps [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+        else
+        // 3DNow! version is 63% faster
+        if (amd3dnow() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // duplicate the 32 bit pattern of value into both halves of v
+            ulong w = *cast(uint *) &value;
+            ulong v = w | (w << 32L);
+
+            asm
+            {
+                mov ESI, dword ptr [aptr];
+                mov EDI, dword ptr [n];
+                movq MM4, qword ptr [v];
+
+                align 8;
+            start:
+                movq MM0, [ESI];
+                movq MM1, [ESI+8];
+                movq MM2, [ESI+16];
+                movq MM3, [ESI+24];
+                pfsub MM0, MM4;
+                pfsub MM1, MM4;
+                pfsub MM2, MM4;
+                pfsub MM3, MM4;
+                movq [ESI], MM0;
+                movq [ESI+8], MM1;
+                movq [ESI+16], MM2;
+                movq [ESI+24], MM3;
+                add ESI, 32;
+                cmp ESI, EDI;
+                jb start;
+
+                emms;
+                mov dword ptr [aptr], ESI;
+            }
+        }
+    }
+
+    // Scalar loop for whatever the vector loops did not cover
+    while (aptr < aend)
+        *aptr++ -= value;
+
+    return a;
+}
+
+unittest +{ + printf("_arrayExpSliceMinass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_f(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 468% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [ESI]; + movups XMM1, [ESI+16]; + movups XMM2, [ESI+32]; + movups XMM3, [ESI+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ECX, 64; + subps XMM0, XMM4; + subps XMM1, XMM5; + subps XMM2, XMM6; + subps XMM3, XMM7; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + // 3DNow! 
version is 57% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov ECX, dword ptr [bptr]; // right operand + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfsub MM0, [ECX]; + pfsub MM1, [ECX+8]; + pfsub MM2, [ECX+16]; + pfsub MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add ECX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +}
+
+unittest
+{
+    // Checks the slice-minus-slice form, c[] -= b[], against a scalar
+    // recomputation.  The previous body was a copy of the scalar test
+    // (c[] -= 6, labelled _arrayExpSliceMinass_f), so the routine defined
+    // just above was never exercised.
+    // NOTE(review): assumes the compiler lowers `c[] -= b[]` on T slices to
+    // _arraySliceSliceMinass_f -- confirm against the compiler's array-op
+    // name mangling.
+    printf("_arraySliceSliceMinass_f unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            // keep the original contents of c in a so the result can be
+            // recomputed element by element below
+            a[] = c[];
+            c[] -= b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - b[i]))
+                {
+                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * value + */ + +T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMulSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 607% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM4; + mulps XMM2, XMM4; + mulps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 69% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + mov EAX, dword ptr [bptr]; + movq MM4, qword ptr [v]; + + align 8; + start: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfmul MM0, MM4; + pfmul MM1, MM4; + pfmul MM2, MM4; + pfmul MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 833% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add EAX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM5; + mulps XMM2, XMM6; + mulps XMM3, XMM7; + add ECX, 64; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + // 3DNow! 
version is only 13% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov EAX, dword ptr [bptr]; // left operand + mov ECX, dword ptr [cptr]; // right operand + + align 4; + start: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfmul MM0, [ECX]; + pfmul MM1, [ECX+8]; + pfmul MM2, [ECX+16]; + pfmul MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ECX, 32; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] *= value
+ *
+ * Multiplies every element of a by value in place.
+ *
+ * Params:
+ *      a     = slice that is scaled in place
+ *      value = multiplier applied to each element
+ * Returns:
+ *      a
+ *
+ * With D_InlineAsm_X86 the bulk of the slice is processed 16 elements per
+ * pass with SSE (when sse() is true and a.length >= 16), otherwise 8 elements
+ * per pass with 3DNow! (when amd3dnow() is true and a.length >= 8).  The
+ * scalar loop at the end handles whatever the vector loops leave over.
+ * Both vector loops step by a fixed byte count (64 / 32) and therefore rely
+ * on T being a 4-byte type.
+ */
+
+T[] _arrayExpSliceMulass_f(T[] a, T value)
+{
+    //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE version is 303% faster
+        if (sse() && a.length >= 16)
+        {
+            // align pointer: scalar steps until aptr sits on a 16 byte
+            // boundary, as required by movaps
+            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
+            while (aptr < n)
+                *aptr++ *= value;
+
+            // The loop below consumes 16 elements (64 bytes) per pass and
+            // only compares ESI with EDI afterwards, so the end pointer has
+            // to lie a whole number of 16-element groups past aptr.  Merely
+            // rounding aend down to a 16 byte boundary let the final pass
+            // read and write up to 48 bytes past the end of the slice.
+            n = aptr + ((aend - aptr) & ~15);
+            if (aptr < n)
+
+            // Aligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movss XMM4, value;
+                shufps XMM4, XMM4, 0;       // broadcast value to all 4 lanes
+
+                align 8;
+            startsseloopa:
+                movaps XMM0, [ESI];
+                movaps XMM1, [ESI+16];
+                movaps XMM2, [ESI+32];
+                movaps XMM3, [ESI+48];
+                add ESI, 64;
+                mulps XMM0, XMM4;
+                mulps XMM1, XMM4;
+                mulps XMM2, XMM4;
+                mulps XMM3, XMM4;
+                movaps [ESI+ 0-64], XMM0;
+                movaps [ESI+16-64], XMM1;
+                movaps [ESI+32-64], XMM2;
+                movaps [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+        else
+        // 3DNow! version is 63% faster
+        if (amd3dnow() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // duplicate the 32 bit pattern of value into both halves of a
+            // 64 bit word so that it can be loaded into MM4 in one go
+            ulong w = *cast(uint *) &value;
+            ulong v = w | (w << 32L);
+
+            asm
+            {
+                mov ESI, dword ptr [aptr];
+                mov EDI, dword ptr [n];
+                movq MM4, qword ptr [v];
+
+                align 8;
+            start:
+                movq MM0, [ESI];
+                movq MM1, [ESI+8];
+                movq MM2, [ESI+16];
+                movq MM3, [ESI+24];
+                pfmul MM0, MM4;
+                pfmul MM1, MM4;
+                pfmul MM2, MM4;
+                pfmul MM3, MM4;
+                movq [ESI], MM0;
+                movq [ESI+8], MM1;
+                movq [ESI+16], MM2;
+                movq [ESI+24], MM3;
+                add ESI, 32;
+                cmp ESI, EDI;
+                jb start;
+
+                emms;
+                mov dword ptr [aptr], ESI;
+            }
+        }
+    }
+
+    // scalar tail (and the whole slice when no vector path was taken)
+    while (aptr < aend)
+        *aptr++ *= value;
+
+    return a;
+}
+
+unittest +{ + printf("_arrayExpSliceMulass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_f(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 525% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [ESI]; + movups XMM1, [ESI+16]; + movups XMM2, [ESI+32]; + movups XMM3, [ESI+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ECX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM5; + mulps XMM2, XMM6; + mulps XMM3, XMM7; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + // 3DNow! 
version is 57% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov ECX, dword ptr [bptr]; // right operand + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfmul MM0, [ECX]; + pfmul MM1, [ECX+8]; + pfmul MM2, [ECX+16]; + pfmul MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add ECX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], ECX; + } + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +}
+
+unittest
+{
+    // Checks the slice-times-slice form, c[] *= b[], against a scalar
+    // recomputation.  The previous body was a copy of the scalar test
+    // (c[] *= 6, labelled _arrayExpSliceMulass_f), so the routine defined
+    // just above was never exercised.
+    // NOTE(review): assumes the compiler lowers `c[] *= b[]` on T slices to
+    // _arraySliceSliceMulass_f -- confirm against the compiler's array-op
+    // name mangling.
+    printf("_arraySliceSliceMulass_f unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            // keep the original contents of c in a so the result can be
+            // recomputed element by element below
+            a[] = c[];
+            c[] *= b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] * b[i]))
+                {
+                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] / value + */ + +T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpDivSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. 
+ */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE version is 587% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, recip; + //movss XMM4, value + //rcpss XMM4, XMM4 + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM4; + mulps XMM2, XMM4; + mulps XMM3, XMM4; + //divps XMM0, XMM4; + //divps XMM1, XMM4; + //divps XMM2, XMM4; + //divps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! version is 72% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + T[2] w = void; + + w[0] = recip; + w[1] = recip; + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + mov EAX, dword ptr [bptr]; + movq MM4, qword ptr [w]; + + align 8; + start: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfmul MM0, MM4; + pfmul MM1, MM4; + pfmul MM2, MM4; + pfmul MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * recip; + + return a; +} + +unittest +{ + printf("_arraySliceExpDivSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. 
dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] / 8; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] /= value
+ *
+ * Divides every element of a by value in place.  The division is carried
+ * out as a multiplication by 1 / value, which is faster but can differ from
+ * a true division in the last bits.
+ *
+ * Params:
+ *      a     = slice that is scaled in place
+ *      value = divisor
+ * Returns:
+ *      a
+ *
+ * With D_InlineAsm_X86 the bulk of the slice is processed 16 elements per
+ * pass with SSE (when sse() is true and a.length >= 16), otherwise 8 elements
+ * per pass with 3DNow! (when amd3dnow() is true and a.length >= 8).  The
+ * scalar loop at the end handles whatever the vector loops leave over.
+ * Both vector loops step by a fixed byte count (64 / 32) and therefore rely
+ * on T being a 4-byte type.
+ */
+
+T[] _arrayExpSliceDivass_f(T[] a, T value)
+{
+    //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    /* Multiplying by the reciprocal is faster, but does
+     * not produce as accurate an answer.
+     */
+    T recip = cast(T)1 / value;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE version is 245% faster
+        if (sse() && a.length >= 16)
+        {
+            // align pointer: scalar steps until aptr sits on a 16 byte
+            // boundary, as required by movaps
+            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
+            while (aptr < n)
+                *aptr++ *= recip;
+
+            // The loop below consumes 16 elements (64 bytes) per pass and
+            // only compares ESI with EDI afterwards, so the end pointer has
+            // to lie a whole number of 16-element groups past aptr.  Merely
+            // rounding aend down to a 16 byte boundary let the final pass
+            // read and write up to 48 bytes past the end of the slice.
+            n = aptr + ((aend - aptr) & ~15);
+            if (aptr < n)
+
+            // Aligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movss XMM4, recip;
+                //movss XMM4, value
+                //rcpss XMM4, XMM4
+                shufps XMM4, XMM4, 0;       // broadcast recip to all 4 lanes
+
+                align 8;
+            startsseloopa:
+                movaps XMM0, [ESI];
+                movaps XMM1, [ESI+16];
+                movaps XMM2, [ESI+32];
+                movaps XMM3, [ESI+48];
+                add ESI, 64;
+                mulps XMM0, XMM4;
+                mulps XMM1, XMM4;
+                mulps XMM2, XMM4;
+                mulps XMM3, XMM4;
+                //divps XMM0, XMM4;
+                //divps XMM1, XMM4;
+                //divps XMM2, XMM4;
+                //divps XMM3, XMM4;
+                movaps [ESI+ 0-64], XMM0;
+                movaps [ESI+16-64], XMM1;
+                movaps [ESI+32-64], XMM2;
+                movaps [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;
+            }
+        }
+        else
+        // 3DNow! version is 57% faster
+        if (amd3dnow() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // two copies of recip side by side, loaded into MM4 in one go
+            T[2] w = void;
+
+            w[0] = w[1] = recip;
+
+            asm
+            {
+                mov ESI, dword ptr [aptr];
+                mov EDI, dword ptr [n];
+                movq MM4, qword ptr [w];
+
+                align 8;
+            start:
+                movq MM0, [ESI];
+                movq MM1, [ESI+8];
+                movq MM2, [ESI+16];
+                movq MM3, [ESI+24];
+                pfmul MM0, MM4;
+                pfmul MM1, MM4;
+                pfmul MM2, MM4;
+                pfmul MM3, MM4;
+                movq [ESI], MM0;
+                movq [ESI+8], MM1;
+                movq [ESI+16], MM2;
+                movq [ESI+24], MM3;
+                add ESI, 32;
+                cmp ESI, EDI;
+                jb start;
+
+                emms;
+                mov dword ptr [aptr], ESI;
+            }
+        }
+    }
+
+    // scalar tail (and the whole slice when no vector path was taken)
+    while (aptr < aend)
+        *aptr++ *= recip;
+
+    return a;
+}
+
+unittest +{ + printf("_arrayExpSliceDivass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] /= 8; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] * value + */ + +T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAddass_f(a, -value, b); +} + +/*********************** + * Computes: + * a[] += b[] * value + */ + +T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + // Handle remainder + while (aptr < aend) + *aptr++ += *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAddass_f unittest\n"); + + cpuid = 1; + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 1; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[] * 6; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + if (c[i] != cast(T)(b[i] + a[i] * 6)) + { + printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/arrayint.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/arrayint.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,2430 @@ +/** + * Contains MMX versions of certain operations for dchar, int, and uint ('w', + * 'i' and 'k' suffixes). + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 298% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM2, l; + + align 4; + startmmx: + add ESI, 
16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + paddd MM0, MM2; + paddd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov EDX, value; + + align 4; + start386: + add ESI, 8; + mov EBX, [EAX]; + mov ECX, [EAX+4]; + add EAX, 8; + add EBX, EDX; + add ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ + value; + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + c[] + */ + +T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceAddSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1710% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + paddd XMM0, 
XMM2; + paddd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 995% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + paddd MM0, MM2; + paddd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + +normal: + while (aptr < aend) + *aptr++ = *bptr++ + *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value + */ + +T[] _arrayExpSliceAddass_w(T[] a, T value) +{ + return _arrayExpSliceAddass_i(a, value); +} + +T[] _arrayExpSliceAddass_k(T[] a, T value) +{ + return _arrayExpSliceAddass_i(a, value); +} + +T[] _arrayExpSliceAddass_i(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 83% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 81% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + movq MM2, l; + + 
align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + paddd MM0, MM2; + paddd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EDX, value; + + align 4; + start386: + mov EBX, [ESI]; + mov ECX, [ESI+4]; + add ESI, 8; + add EBX, EDX; + add ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] += 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_w(T[] a, T[] b) +{ + return _arraySliceSliceAddass_i(a, b); +} + +T[] _arraySliceSliceAddass_k(T[] a, T[] b) +{ + return _arraySliceSliceAddass_i(a, b); +} + +T[] _arraySliceSliceAddass_i(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 695% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 471% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); 
+ + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + add ESI, 16; + add ECX, 16; + paddd MM0, MM2; + paddd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + +normal: + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] + a[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 400% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if 
(((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 315% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM2, l; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + psubd MM0, MM2; + psubd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov EDX, value; + + align 4; + start386: + add ESI, 8; + mov EBX, [EAX]; + mov ECX, [EAX+4]; + add EAX, 8; + sub EBX, EDX; + sub ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", 
cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_i(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_i(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1812% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM2, [EAX]; + movdqu XMM3, [EAX+16]; + movdqa XMM0, XMM4; + movdqa XMM1, XMM4; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, 
XMM4, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM2, [EAX]; + movdqa XMM3, [EAX+16]; + movdqa XMM0, XMM4; + movdqa XMM1, XMM4; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 1077% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM4, l; + + align 4; + startmmx: + add ESI, 16; + movq MM2, [EAX]; + movq MM3, [EAX+8]; + movq MM0, MM4; + movq MM1, MM4; + add EAX, 16; + psubd MM0, MM2; + psubd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = value - *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = 6 - a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - a[i])) + { + printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1721% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqa [ESI -32], XMM0; + 
movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 1002% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + psubd MM0, MM2; + psubd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value + */ + +T[] _arrayExpSliceMinass_w(T[] a, T value) +{ + return _arrayExpSliceMinass_i(a, value); +} + +T[] _arrayExpSliceMinass_k(T[] a, T value) +{ + return _arrayExpSliceMinass_i(a, value); +} + +T[] _arrayExpSliceMinass_i(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 81% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 81% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + movq MM2, l; + + 
align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + psubd MM0, MM2; + psubd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EDX, value; + + align 4; + start386: + mov EBX, [ESI]; + mov ECX, [ESI+4]; + add ESI, 8; + sub EBX, EDX; + sub ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_w(T[] a, T[] b) +{ + return _arraySliceSliceMinass_i(a, b); +} + +T[] _arraySliceSliceMinass_k(T[] a, T[] b) +{ + return _arraySliceSliceMinass_i(a, b); +} + +T[] _arraySliceSliceMinass_i(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 731% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 441% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); 
+ + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + add ESI, 16; + add ECX, 16; + psubd MM0, MM2; + psubd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] -= a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] - a[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * value + */ + +T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMulSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (none) // multiplying a pair is not supported by MMX + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1380% faster + if (sse2() && a.length >= 8) + { + 
auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + { + // MMX version is 1380% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM2, l; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + pmuludq MM0, MM2; // only multiplies low 32 bits + pmuludq MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]); + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (none) + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1407% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + 
add EAX, 32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 1029% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + pmuludq MM0, MM2; + pmuludq MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= value + */ + +T[] _arrayExpSliceMulass_w(T[] a, T value) +{ + return _arrayExpSliceMulass_i(a, value); +} + +T[] _arrayExpSliceMulass_k(T[] a, T value) +{ + return _arrayExpSliceMulass_i(a, value); +} + +T[] _arrayExpSliceMulass_i(T[] a, T value) +{ + //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (none) + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 400% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 402% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + movq MM2, l; + + align 4; + 
startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + pmuludq MM0, MM2; + pmuludq MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + } + + while (aptr < aend) + *aptr++ *= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMulass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_w(T[] a, T[] b) +{ + return _arraySliceSliceMulass_i(a, b); +} + +T[] _arraySliceSliceMulass_k(T[] a, T[] b) +{ + return _arraySliceSliceMulass_i(a, b); +} + +T[] _arraySliceSliceMulass_i(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (none) + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 873% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 
32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } +/+ BUG: comment out this section until we figure out what is going + wrong with the invalid pshufd instructions. + + else + // MMX version is 573% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + pxor MM4, MM4; + pxor MM5, MM5; + punpckldq MM4, MM0; + punpckldq MM5, MM2; + add ESI, 16; + add ECX, 16; + pmuludq MM4, MM5; + pshufd MM4, MM4, 8; // ? + movq [ESI -16], MM4; + pxor MM4, MM4; + pxor MM5, MM5; + punpckldq MM4, MM1; + punpckldq MM5, MM3; + pmuludq MM4, MM5; + pshufd MM4, MM4, 8; // ? + movq [ESI+8-16], MM4; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } ++/ + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= c[]; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * c[i])) + { + printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/arrayreal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/arrayreal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,241 @@ +/** + * Contains SSE2 and MMX versions of certain operations for real. + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 3343% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + paddw MM0, MM2; + paddw MM1, 
MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + value); + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + c[] + */ + +T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceAddSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 3777% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov 
ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + movdqu XMM2, [ECX]; + movdqu XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 2068% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + paddw MM0, MM2; + paddw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value + */ + +T[] _arrayExpSliceAddass_u(T[] a, T value) +{ + return _arrayExpSliceAddass_s(a, value); +} + +T[] _arrayExpSliceAddass_t(T[] a, T value) +{ + return _arrayExpSliceAddass_s(a, value); +} + +T[] _arrayExpSliceAddass_s(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 832% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 826% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM2, l; + pshufw 
MM2, MM2, 0; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + paddw MM0, MM2; + paddw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] += 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_u(T[] a, T[] b) +{ + return _arraySliceSliceAddass_s(a, b); +} + +T[] _arraySliceSliceAddass_t(T[] a, T[] b) +{ + return _arraySliceSliceAddass_s(a, b); +} + +T[] _arraySliceSliceAddass_s(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2085% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + movdqu XMM2, [ECX]; + 
movdqu XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 1022% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + paddw MM0, MM2; + paddw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] + a[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 3695% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov 
aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 3049% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + psubw MM0, MM2; + psubw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - value); + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_s(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_s(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4995% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + + align 4; + startaddsse2u: + movd XMM2, l; + pshufd XMM2, XMM2, 0; + movd XMM3, l; + pshufd XMM3, XMM3, 0; + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + psubw XMM2, XMM0; + psubw XMM3, XMM1; + movdqu [ESI -32], XMM2; + movdqu [ESI+16-32], XMM3; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + + align 4; + startaddsse2a: + movd XMM2, l; + pshufd XMM2, XMM2, 0; + movd XMM3, l; + pshufd XMM3, XMM3, 0; + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + psubw XMM2, XMM0; + psubw XMM3, XMM1; + movdqa [ESI -32], XMM2; + movdqa 
[ESI+16-32], XMM3; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 4562% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM2, [EAX]; + movq MM3, [EAX+8]; + movq MM0, MM4; + movq MM1, MM4; + add EAX, 16; + psubw MM0, MM2; + psubw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(value - *bptr++); + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = 6 - a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - a[i])) + { + printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4129% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + movdqu XMM2, [ECX]; + movdqu XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqa [ESI -32], XMM0; 
+ movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 2018% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + psubw MM0, MM2; + psubw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value + */ + +T[] _arrayExpSliceMinass_u(T[] a, T value) +{ + return _arrayExpSliceMinass_s(a, value); +} + +T[] _arrayExpSliceMinass_t(T[] a, T value) +{ + return _arrayExpSliceMinass_s(a, value); +} + +T[] _arrayExpSliceMinass_s(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 835% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 835% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM2, l; + pshufw 
MM2, MM2, 0; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + psubw MM0, MM2; + psubw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_u(T[] a, T[] b) +{ + return _arraySliceSliceMinass_s(a, b); +} + +T[] _arraySliceSliceMinass_t(T[] a, T[] b) +{ + return _arraySliceSliceMinass_s(a, b); +} + +T[] _arraySliceSliceMinass_s(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2121% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + movdqu XMM2, [ECX]; + 
movdqu XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 1116% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + psubw MM0, MM2; + psubw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] -= a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] - a[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * value + */ + +T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMulSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 3733% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= l << 16; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else 
+ // MMX version is 3733% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + pmullw MM0, MM2; + pmullw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ * value); + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2515% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM3; + 
movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 2515% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + pmullw MM0, MM2; + pmullw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ * *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= value + */ + +T[] _arrayExpSliceMulass_u(T[] a, T value) +{ + return _arrayExpSliceMulass_s(a, value); +} + +T[] _arrayExpSliceMulass_t(T[] a, T value) +{ + return _arrayExpSliceMulass_s(a, value); +} + +T[] _arrayExpSliceMulass_s(T[] a, T value) +{ + //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2044% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= l << 16; + + if (((cast(uint) aptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 2056% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + 
movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + pmullw MM0, MM2; + pmullw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMulass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_u(T[] a, T[] b) +{ + return _arraySliceSliceMulass_s(a, b); +} + +T[] _arraySliceSliceMulass_t(T[] a, T[] b) +{ + return _arraySliceSliceMulass_s(a, b); +} + +T[] _arraySliceSliceMulass_s(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2519% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + pmullw XMM0, 
XMM2; + pmullw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 1712% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + add ESI, 16; + add ECX, 16; + pmullw MM0, MM2; + pmullw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= c[]; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * c[i])) + { + printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/cast_.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/cast_.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,169 @@ +/** + * Implementation of array assignment support routines. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright, Sean Kelly + * + * Copyright Digital Mars 2000 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.cover; + +private +{ + version( Windows ) + import core.sys.windows.windows; + else version( Posix ) + { + import core.sys.posix.fcntl; + import core.sys.posix.unistd; + } + import core.bitop; + import core.stdc.stdio; + import rt.util.utf; + + struct BitArray + { + size_t len; + uint* ptr; + + bool opIndex( size_t i ) + in + { + assert( i < len ); + } + body + { + return cast(bool) bt( ptr, i ); + } + } + + struct Cover + { + string filename; + BitArray valid; + uint[] data; + } + + __gshared + { + Cover[] gdata; + string srcpath; + string dstpath; + bool merge; + } +} + + +/** + * Set path to where source files are located. + * + * Params: + * pathname = The new path name. + */ +extern (C) void dmd_coverSourcePath( string pathname ) +{ + srcpath = pathname; +} + + +/** + * Set path to where listing files are to be written. + * + * Params: + * pathname = The new path name. + */ +extern (C) void dmd_coverDestPath( string pathname ) +{ + dstpath = pathname; +} + + +/** + * Set merge mode. 
+ * + * Params: + * flag = true means new data is summed with existing data in the listing + * file; false means a new listing file is always created. + */ +extern (C) void dmd_coverSetMerge( bool flag ) +{ + merge = flag; +} + + +/** + * The coverage callback. + * + * Params: + * filename = The name of the coverage file. + * valid = ??? + * data = ??? + */ +extern (C) void _d_cover_register( string filename, BitArray valid, uint[] data ) +{ + Cover c; + + c.filename = filename; + c.valid = valid; + c.data = data; + gdata ~= c; +} + + +static ~this() +{ + const NUMLINES = 16384 - 1; + const NUMCHARS = 16384 * 16 - 1; + + char[] srcbuf = new char[NUMCHARS]; + char[][] srclines = new char[][NUMLINES]; + char[] lstbuf = new char[NUMCHARS]; + char[][] lstlines = new char[][NUMLINES]; + + foreach( Cover c; gdata ) + { + if( !readFile( appendFN( srcpath, c.filename ), srcbuf ) ) + continue; + splitLines( srcbuf, srclines ); + + if( merge ) + { + if( !readFile( addExt( baseName( c.filename ), "lst" ), lstbuf ) ) + break; + splitLines( lstbuf, lstlines ); + + for( size_t i = 0; i < lstlines.length; ++i ) + { + if( i >= c.data.length ) + break; + + int count = 0; + + foreach( char c2; lstlines[i] ) + { + switch( c2 ) + { + case ' ': + continue; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + count = count * 10 + c2 - '0'; + continue; + default: + break; + } + } + c.data[i] += count; + } + } + + FILE* flst = fopen( (addExt( baseName( c.filename ), "lst\0" )).ptr, "wb" ); + + if( !flst ) + continue; //throw new Exception( "Error opening file for write: " ~ lstfn ); + + uint nno; + uint nyes; + + for( int i = 0; i < c.data.length; i++ ) + { + if( i < srclines.length ) + { + uint n = c.data[i]; + char[] line = srclines[i]; + + line = expandTabs( line ); + + if( n == 0 ) + { + if( c.valid[i] ) + { + nno++; + fprintf( flst, "0000000|%.*s\n", line ); + } + else + { + fprintf( flst, " |%.*s\n", line ); + } + } + else + { + 
nyes++; + fprintf( flst, "%7u|%.*s\n", n, line ); + } + } + } + if( nyes + nno ) // no divide by 0 bugs + { + fprintf( flst, "%.*s is %d%% covered\n", c.filename, ( nyes * 100 ) / ( nyes + nno ) ); + } + fclose( flst ); + } +} + + +string appendFN( string path, string name ) +{ + version( Windows ) + const char sep = '\\'; + else + const char sep = '/'; + + auto dest = path; + + if( dest && dest[$ - 1] != sep ) + dest ~= sep; + dest ~= name; + return dest; +} + + +string baseName( string name, string ext = null ) +{ + auto i = name.length; + for( ; i > 0; --i ) + { + version( Windows ) + { + if( name[i - 1] == ':' || name[i - 1] == '\\' ) + break; + } + else version( Posix ) + { + if( name[i - 1] == '/' ) + break; + } + } + return chomp( name[i .. $], ext ? ext : "" ); +} + + +string getExt( string name ) +{ + auto i = name.length; + + while( i > 0 ) + { + if( name[i - 1] == '.' ) + return name[i .. $]; + --i; + version( Windows ) + { + if( name[i] == ':' || name[i] == '\\' ) + break; + } + else version( Posix ) + { + if( name[i] == '/' ) + break; + } + } + return null; +} + + +string addExt( string name, string ext ) +{ + auto existing = getExt( name ); + + if( existing.length == 0 ) + { + if( name.length && name[$ - 1] == '.' ) + name ~= ext; + else + name = name ~ "." ~ ext; + } + else + { + name = name[0 .. $ - existing.length] ~ ext; + } + return name; +} + + +string chomp( string str, string delim = null ) +{ + if( delim is null ) + { + auto len = str.length; + + if( len ) + { + auto c = str[len - 1]; + + if( c == '\r' ) + --len; + else if( c == '\n' && str[--len - 1] == '\r' ) + --len; + } + return str[0 .. len]; + } + else if( str.length >= delim.length ) + { + if( str[$ - delim.length .. $] == delim ) + return str[0 .. 
$ - delim.length]; + } + return str; +} + + +bool readFile( string name, inout char[] buf ) +{ + version( Windows ) + { + auto wnamez = toUTF16z( name ); + HANDLE file = CreateFileW( wnamez, + GENERIC_READ, + FILE_SHARE_READ, + null, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN, + cast(HANDLE) null ); + + delete wnamez; + if( file == INVALID_HANDLE_VALUE ) + return false; + scope( exit ) CloseHandle( file ); + + DWORD num = 0; + DWORD pos = 0; + + buf.length = 4096; + while( true ) + { + if( !ReadFile( file, &buf[pos], cast(DWORD)( buf.length - pos ), &num, null ) ) + return false; + if( !num ) + break; + pos += num; + buf.length = pos * 2; + } + buf.length = pos; + return true; + } + else version( Posix ) + { + char[] namez = new char[name.length + 1]; + namez[0 .. name.length] = name; + namez[$ - 1] = 0; + int file = open( namez.ptr, O_RDONLY ); + + delete namez; + if( file == -1 ) + return false; + scope( exit ) close( file ); + + int num = 0; + uint pos = 0; + + buf.length = 4096; + while( true ) + { + num = read( file, &buf[pos], cast(uint)( buf.length - pos ) ); + if( num == -1 ) + return false; + if( !num ) + break; + pos += num; + buf.length = pos * 2; + } + buf.length = pos; + return true; + } +} + + +void splitLines( char[] buf, inout char[][] lines ) +{ + size_t beg = 0, + pos = 0; + + lines.length = 0; + for( ; pos < buf.length; ++pos ) + { + char c = buf[pos]; + + switch( buf[pos] ) + { + case '\r': + case '\n': + lines ~= buf[beg .. pos]; + beg = pos + 1; + if( buf[pos] == '\r' && pos < buf.length - 1 && buf[pos + 1] == '\n' ) + ++pos, ++beg; + default: + continue; + } + } + if( beg != pos ) + { + lines ~= buf[beg .. 
pos]; + } +} + + +char[] expandTabs( char[] str, int tabsize = 8 ) +{ + const dchar LS = '\u2028'; // UTF line separator + const dchar PS = '\u2029'; // UTF paragraph separator + + bool changes = false; + char[] result = str; + int column; + int nspaces; + + foreach( size_t i, dchar c; str ) + { + switch( c ) + { + case '\t': + nspaces = tabsize - (column % tabsize); + if( !changes ) + { + changes = true; + result = null; + result.length = str.length + nspaces - 1; + result.length = i + nspaces; + result[0 .. i] = str[0 .. i]; + result[i .. i + nspaces] = ' '; + } + else + { int j = result.length; + result.length = j + nspaces; + result[j .. j + nspaces] = ' '; + } + column += nspaces; + break; + + case '\r': + case '\n': + case PS: + case LS: + column = 0; + goto L1; + + default: + column++; + L1: + if (changes) + { + if (c <= 0x7F) + result ~= c; + else + encode(result, c); + } + break; + } + } + return result; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/critical.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/critical.c Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,173 @@ +/** + * Implementation of support routines for synchronized blocks. + * + * Copyright: Copyright Digital Mars 2000 - 2009. 
+ * License: next) + { + pthread_mutex_lock(&critical_section.cs); + if (!dcs->next) // if, in the meantime, another thread didn't set it + { + dcs->next = dcs_list; + dcs_list = dcs; + pthread_mutex_init(&dcs->cs, &_criticals_attr); + } + pthread_mutex_unlock(&critical_section.cs); + } + pthread_mutex_lock(&dcs->cs); +} + +void _d_criticalexit(D_CRITICAL_SECTION *dcs) +{ + //printf("_d_criticalexit(dcs = x%x)\n", dcs); + pthread_mutex_unlock(&dcs->cs); +} + +void _STI_critical_init() +{ + if (!dcs_list) + { //printf("_STI_critical_init()\n"); + pthread_mutexattr_init(&_criticals_attr); + pthread_mutexattr_settype(&_criticals_attr, PTHREAD_MUTEX_RECURSIVE); + + // The global critical section doesn't need to be recursive + pthread_mutex_init(&critical_section.cs, 0); + dcs_list = &critical_section; + } +} + +void _STD_critical_term() +{ + if (dcs_list) + { //printf("_STI_critical_term()\n"); + while (dcs_list) + { + //printf("\tlooping... %x\n", dcs_list); + pthread_mutex_destroy(&dcs_list->cs); + dcs_list = dcs_list->next; + } + } +} + +#endif + diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/deh.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/deh.c Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,724 @@ +/** + * Implementation of exception handling support routines for Windows. + * + * Copyright: Copyright Digital Mars 1999 - 2009. 
+ * License: ExceptionFlags & EXCEPTION_UNWIND) + { + // Call all the finally blocks in this frame + _d_local_unwind(handler_table, frame, -1); + } + else + { + // Jump to catch block if matching one is found + + int ndx,prev_ndx,i; + struct DHandlerInfo *phi; + struct DCatchInfo *pci; + struct DCatchBlock *pcb; + unsigned ncatches; // number of catches in the current handler + Object *pti; + ClassInfo *ci; + + ci = NULL; // only compute it if we need it + + // walk through handler table, checking each handler + // with an index smaller than the current table_index + for (ndx = frame->table_index; ndx != -1; ndx = prev_ndx) + { + phi = &handler_table->handler_info[ndx]; + prev_ndx = phi->prev_index; + if (phi->cioffset) + { + // this is a catch handler (no finally) + pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); + ncatches = pci->ncatches; + for (i = 0; i < ncatches; i++) + { + pcb = &pci->catch_block[i]; + + if (!ci) + { + // This code must match the translation code + if (exception_record->ExceptionCode == STATUS_DIGITAL_MARS_D_EXCEPTION) + { + //printf("ei[0] = %p\n", exception_record->ExceptionInformation[0]); + ci = **(ClassInfo ***)(exception_record->ExceptionInformation[0]); + } + else + ci = &_Class_9Throwable; + } + + if (_d_isbaseof(ci, pcb->type)) + { + // Matched the catch type, so we've found the handler. + int regebp; + + pti = _d_translate_se_to_d_exception(exception_record); + + // Initialize catch variable + regebp = (int)&frame->ebp; // EBP for this frame + *(void **)(regebp + (pcb->bpoffset)) = pti; + + // Have system call all finally blocks in intervening frames + _global_unwind(frame, exception_record); + + // Call all the finally blocks skipped in this frame + _d_local_unwind(handler_table, frame, ndx); + + frame->table_index = prev_ndx; // we are out of this handler + + // Jump to catch block. Does not return. 
+ { + unsigned catch_esp; + fp_t catch_addr; + + catch_addr = (fp_t)(pcb->code); + catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); + _asm + { + mov EAX,catch_esp + mov ECX,catch_addr + mov [EAX],ECX + mov EBP,regebp + mov ESP,EAX // reset stack + ret // jump to catch block + } + } + } + } + } + } + } + return ExceptionContinueSearch; +} + +/*********************************** + * Exception filter for use in __try..__except block + * surrounding call to Dmain() + */ + +int _d_exception_filter(struct _EXCEPTION_POINTERS *eptrs, + int retval, + Object **exception_object) +{ + *exception_object = _d_translate_se_to_d_exception(eptrs->ExceptionRecord); + return retval; +} + +/*********************************** + * Throw a D object. + */ + +void __stdcall _d_throw(Object *h) +{ + //printf("_d_throw(h = %p, &h = %p)\n", h, &h); + //printf("\tvptr = %p\n", *(void **)h); + RaiseException(STATUS_DIGITAL_MARS_D_EXCEPTION, + EXCEPTION_NONCONTINUABLE, + 1, (DWORD *)&h); +} + +/*********************************** + * Create an exception object + */ + +Object *_d_create_exception_object(ClassInfo *ci, char *msg) +{ + Throwable *exc; + + exc = (Throwable *)_d_newclass(ci); + // BUG: what if _d_newclass() throws an out of memory exception? + + if (msg) + { + exc->msglen = strlen(msg); + exc->msg = msg; + } + return (Object *)exc; +} + +/*********************************** + * Converts a Windows Structured Exception code to a D Exception Object. 
+ */ + +Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record) +{ + Object *pti; + + switch (exception_record->ExceptionCode) { + case STATUS_DIGITAL_MARS_D_EXCEPTION: + // Generated D exception + pti = (Object *)(exception_record->ExceptionInformation[0]); + break; + + case STATUS_INTEGER_DIVIDE_BY_ZERO: + pti = _d_create_exception_object(&_Class_5Error, "Integer Divide by Zero"); + break; + + case STATUS_FLOAT_DIVIDE_BY_ZERO: + pti = _d_create_exception_object(&_Class_5Error, "Float Divide by Zero"); + break; + + case STATUS_ACCESS_VIOLATION: + pti = _d_create_exception_object(&_Class_5Error, "Access Violation"); + break; + + case STATUS_STACK_OVERFLOW: + pti = _d_create_exception_object(&_Class_5Error, "Stack Overflow"); + break; + + case STATUS_DATATYPE_MISALIGNMENT: + pti = _d_create_exception_object(&_Class_5Error, "Datatype Misalignment"); + break; + + case STATUS_ARRAY_BOUNDS_EXCEEDED: + pti = _d_create_exception_object(&_Class_5Error, "Array Bounds Exceeded"); + break; + + case STATUS_FLOAT_INVALID_OPERATION: + pti = _d_create_exception_object(&_Class_5Error, "Invalid Floating Point Operation"); + break; + + case STATUS_FLOAT_DENORMAL_OPERAND: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Denormal Operand"); + break; + + case STATUS_FLOAT_INEXACT_RESULT: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Inexact Result"); + break; + + case STATUS_FLOAT_OVERFLOW: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Overflow"); + break; + + case STATUS_FLOAT_UNDERFLOW: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Underflow"); + break; + + case STATUS_FLOAT_STACK_CHECK: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Stack Check"); + break; + + case STATUS_PRIVILEGED_INSTRUCTION: + pti = _d_create_exception_object(&_Class_5Error, "Privileged Instruction"); + break; + + case STATUS_ILLEGAL_INSTRUCTION: + pti = 
_d_create_exception_object(&_Class_5Error, "Illegal Instruction"); + break; + + case STATUS_BREAKPOINT: + pti = _d_create_exception_object(&_Class_5Error, "Breakpoint"); + break; + + case STATUS_IN_PAGE_ERROR: + pti = _d_create_exception_object(&_Class_5Error, "Win32 In Page Exception"); + break; +/* + case STATUS_INTEGER_OVERFLOW: // not supported on any x86 processor + case STATUS_INVALID_DISPOSITION: + case STATUS_NONCONTINUABLE_EXCEPTION: + case STATUS_SINGLE_STEP: + case DBG_CONTROL_C: // only when a debugger is attached + // In DMC, but not in Microsoft docs + case STATUS_GUARD_PAGE_VIOLATION: + case STATUS_INVALID_HANDLE: +*/ + // convert all other exception codes into a Win32Exception + default: + pti = _d_create_exception_object(&_Class_5Error, "Win32 Exception"); + break; + } + + return pti; +} + +/************************************** + * Call finally blocks in the current stack frame until stop_index. + * This is roughly equivalent to _local_unwind() for C in \src\win32\ehsup.c + */ + +void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, + struct DEstablisherFrame *frame, int stop_index) +{ + struct DHandlerInfo *phi; + struct DCatchInfo *pci; + int i; + + // Set up a special exception handler to catch double-fault exceptions. + __asm + { + push dword ptr -1 + push dword ptr 0 + push offset _local_except_handler // defined in src\win32\ehsup.c + push dword ptr fs:_except_list + mov FS:_except_list,ESP + } + + for (i = frame->table_index; i != -1 && i != stop_index; i = phi->prev_index) + { + phi = &handler_table->handler_info[i]; + if (phi->finally_code) + { + // Note that it is unnecessary to adjust the ESP, as the finally block + // accesses all items on the stack as relative to EBP. 
+ + DWORD *catch_ebp = &frame->ebp; + void *blockaddr = phi->finally_code; + + _asm + { + push EBX + mov EBX,blockaddr + push EBP + mov EBP,catch_ebp + call EBX + pop EBP + pop EBX + } + } + } + + _asm + { + pop FS:_except_list + add ESP,12 + } +} + +/*********************************** + * external version of the unwinder + */ + +__declspec(naked) void __cdecl _d_local_unwind2() +{ + __asm + { + jmp _d_local_unwind + } +} + +/*********************************** + * The frame handler, this is called for each frame that has been registered + * in the OS except_list. + * Input: + * EAX the handler table for the frame + */ + +EXCEPTION_DISPOSITION _d_monitor_handler( + EXCEPTION_RECORD *exception_record, + struct DEstablisherFrame *frame, + CONTEXT context, + void *dispatcher_context) +{ + if (exception_record->ExceptionFlags & EXCEPTION_UNWIND) + { + _d_monitorexit((Object *)frame->table_index); + } + else + { + } + return ExceptionContinueSearch; +} + +/*********************************** + */ + +void _d_monitor_prolog(void *x, void *y, Object *h) +{ + __asm + { + push EAX + } + //printf("_d_monitor_prolog(x=%p, y=%p, h=%p)\n", x, y, h); + _d_monitorenter(h); + __asm + { + pop EAX + } +} + +/*********************************** + */ + +void _d_monitor_epilog(void *x, void *y, Object *h) +{ + //printf("_d_monitor_epilog(x=%p, y=%p, h=%p)\n", x, y, h); + __asm + { + push EAX + push EDX + } + _d_monitorexit(h); + __asm + { + pop EDX + pop EAX + } +} + +#endif + +/* ======================== linux =============================== */ + +#if linux + +#include "mars.h" + +extern ClassInfo D6object9Throwable7__ClassZ; +#define _Class_9Throwable D6object9Throwable7__ClassZ; + +extern ClassInfo D6object5Error7__ClassZ; +#define _Class_5Error D6object5Error7__ClassZ + +typedef int (*fp_t)(); // function pointer in ambient memory model + +struct DHandlerInfo +{ + unsigned offset; // offset from function address to start of guarded section + int prev_index; // previous table index 
+ unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) + void *finally_code; // pointer to finally code to execute + // (!=0 if try-finally) +}; + +// Address of DHandlerTable, searched for by eh_finddata() + +struct DHandlerTable +{ + void *fptr; // pointer to start of function + unsigned espoffset; // offset of ESP from EBP + unsigned retoffset; // offset from start of function to return code + unsigned nhandlers; // dimension of handler_info[] + struct DHandlerInfo handler_info[1]; +}; + +struct DCatchBlock +{ + ClassInfo *type; // catch type + unsigned bpoffset; // EBP offset of catch var + void *code; // catch handler code +}; + +// Create one of these for each try-catch +struct DCatchInfo +{ + unsigned ncatches; // number of catch blocks + struct DCatchBlock catch_block[1]; // data for each catch block +}; + +// One of these is generated for each function with try-catch or try-finally + +struct FuncTable +{ + void *fptr; // pointer to start of function + struct DHandlerTable *handlertable; // eh data for this function + unsigned size; // size of function in bytes +}; + +extern struct FuncTable *table_start; +extern struct FuncTable *table_end; + +void terminate() +{ +// _asm +// { +// hlt +// } +} + +/******************************************* + * Given address that is inside a function, + * figure out which function it is in. + * Return DHandlerTable if there is one, NULL if not. + */ + +struct DHandlerTable *__eh_finddata(void *address) +{ + struct FuncTable *ft; + + for (ft = (struct FuncTable *)table_start; + ft < (struct FuncTable *)table_end; + ft++) + { + if (ft->fptr <= address && + address < (void *)((char *)ft->fptr + ft->size)) + { + return ft->handlertable; + } + } + return NULL; +} + + +/****************************** + * Given EBP, find return address to caller, and caller's EBP. 
+ * Input: + * regbp Value of EBP for current function + * *pretaddr Return address + * Output: + * *pretaddr return address to caller + * Returns: + * caller's EBP + */ + +unsigned __eh_find_caller(unsigned regbp, unsigned *pretaddr) +{ + unsigned bp = *(unsigned *)regbp; + + if (bp) // if not end of call chain + { + // Perform sanity checks on new EBP. + // If it is screwed up, terminate() hopefully before we do more damage. + if (bp <= regbp) + // stack should grow to smaller values + terminate(); + + *pretaddr = *(unsigned *)(regbp + sizeof(int)); + } + return bp; +} + +/*********************************** + * Throw a D object. + */ + +void __stdcall _d_throw(Object *h) +{ + unsigned regebp; + + //printf("_d_throw(h = %p, &h = %p)\n", h, &h); + //printf("\tvptr = %p\n", *(void **)h); + + regebp = _EBP; + + while (1) // for each function on the stack + { + struct DHandlerTable *handler_table; + struct FuncTable *pfunc; + struct DHandlerInfo *phi; + unsigned retaddr; + unsigned funcoffset; + unsigned spoff; + unsigned retoffset; + int index; + int dim; + int ndx; + int prev_ndx; + + regebp = __eh_find_caller(regebp,&retaddr); + if (!regebp) + // if end of call chain + break; + + handler_table = __eh_finddata((void *)retaddr); // find static data associated with function + if (!handler_table) // if no static data + { + continue; + } + funcoffset = (unsigned)handler_table->fptr; + spoff = handler_table->espoffset; + retoffset = handler_table->retoffset; + +#ifdef DEBUG + printf("retaddr = x%x\n",(unsigned)retaddr); + printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n", + regebp,funcoffset,spoff,retoffset); +#endif + + // Find start index for retaddr in static data + dim = handler_table->nhandlers; + index = -1; + for (int i = 0; i < dim; i++) + { + phi = &handler_table->handler_info[i]; + + if ((unsigned)retaddr >= funcoffset + phi->offset) + index = i; + } + + // walk through handler table, checking each handler + // with an index smaller than the 
current table_index + for (ndx = index; ndx != -1; ndx = prev_ndx) + { + phi = &handler_table->handler_info[ndx]; + prev_ndx = phi->prev_index; + if (phi->cioffset) + { + // this is a catch handler (no finally) + struct DCatchInfo *pci; + int ncatches; + int i; + + pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); + ncatches = pci->ncatches; + for (i = 0; i < ncatches; i++) + { + struct DCatchBlock *pcb; + ClassInfo *ci = **(ClassInfo ***)h; + + pcb = &pci->catch_block[i]; + + if (_d_isbaseof(ci, pcb->type)) + { // Matched the catch type, so we've found the handler. + + // Initialize catch variable + *(void **)(regebp + (pcb->bpoffset)) = h; + + // Jump to catch block. Does not return. + { + unsigned catch_esp; + fp_t catch_addr; + + catch_addr = (fp_t)(pcb->code); + catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); + _asm + { + mov EAX,catch_esp + mov ECX,catch_addr + mov [EAX],ECX + mov EBP,regebp + mov ESP,EAX // reset stack + ret // jump to catch block + } + } + } + } + } + else if (phi->finally_code) + { // Call finally block + // Note that it is unnecessary to adjust the ESP, as the finally block + // accesses all items on the stack as relative to EBP. + + void *blockaddr = phi->finally_code; + + _asm + { + push EBX + mov EBX,blockaddr + push EBP + mov EBP,regebp + call EBX + pop EBP + pop EBX + } + } + } + } +} + + +#endif diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/deh2.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/deh2.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,316 @@ +/** + * Implementation of exception handling support routines for Posix. + * + * Copyright: Copyright Digital Mars 2000 - 2009. 
+ * License: funcoffset + phi.offset && + cast(uint)retaddr <= funcoffset + phi.endoffset) + index = i; + } + debug printf("index = %d\n", index); + + // walk through handler table, checking each handler + // with an index smaller than the current table_index + for (ndx = index; ndx != -1; ndx = prev_ndx) + { + phi = &handler_table.handler_info[ndx]; + prev_ndx = phi.prev_index; + if (phi.cioffset) + { + // this is a catch handler (no finally) + DCatchInfo *pci; + int ncatches; + int i; + + pci = cast(DCatchInfo *)(cast(char *)handler_table + phi.cioffset); + ncatches = pci.ncatches; + for (i = 0; i < ncatches; i++) + { + DCatchBlock *pcb; + ClassInfo ci = **cast(ClassInfo **)h; + + pcb = &pci.catch_block[i]; + + if (_d_isbaseof(ci, pcb.type)) + { // Matched the catch type, so we've found the handler. + + // Initialize catch variable + *cast(void **)(regebp + (pcb.bpoffset)) = h; + + // Jump to catch block. Does not return. + { + uint catch_esp; + fp_t catch_addr; + + catch_addr = cast(fp_t)(pcb.code); + catch_esp = regebp - handler_table.espoffset - fp_t.sizeof; + asm + { + mov EAX,catch_esp ; + mov ECX,catch_addr ; + mov [EAX],ECX ; + mov EBP,regebp ; + mov ESP,EAX ; // reset stack + ret ; // jump to catch block + } + } + } + } + } + else if (phi.finally_code) + { // Call finally block + // Note that it is unnecessary to adjust the ESP, as the finally block + // accesses all items on the stack as relative to EBP. 
+ + void *blockaddr = phi.finally_code; + + version (OSX) + { + asm + { + sub ESP,4 ; // align stack to 16 + push EBX ; + mov EBX,blockaddr ; + push EBP ; + mov EBP,regebp ; + call EBX ; + pop EBP ; + pop EBX ; + add ESP,4 ; + } + } + else + { + asm + { + push EBX ; + mov EBX,blockaddr ; + push EBP ; + mov EBP,regebp ; + call EBX ; + pop EBP ; + pop EBX ; + } + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/dmain2.BAK --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/dmain2.BAK Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,374 @@ +/* + * Placed into the Public Domain. + * written by Walter Bright + * www.digitalmars.com + */ + +/* + * Modified by Sean Kelly for use with the D Runtime Project + */ + +module rt.dmain2; + +private +{ + import memory; + import util.console; + import core.stdc.stddef; + import core.stdc.stdlib; + import core.stdc.string; +} + +version (Windows) +{ + extern (Windows) alias int function() FARPROC; + extern (Windows) FARPROC GetProcAddress(void*, in char*); + extern (Windows) void* LoadLibraryA(in char*); + extern (Windows) int FreeLibrary(void*); + extern (Windows) void* LocalFree(void*); + extern (Windows) wchar_t* GetCommandLineW(); + extern (Windows) wchar_t** CommandLineToArgvW(wchar_t*, int*); + extern (Windows) export int WideCharToMultiByte(uint, uint, wchar_t*, int, char*, int, char*, int); + pragma(lib, "shell32.lib"); // needed for CommandLineToArgvW +} + +extern (C) void _STI_monitor_staticctor(); +extern (C) void _STD_monitor_staticdtor(); +extern (C) void _STI_critical_init(); +extern (C) void _STD_critical_term(); +extern (C) void gc_init(); +extern (C) void gc_term(); +extern (C) void _minit(); +extern (C) void _moduleCtor(); +extern (C) void _moduleDtor(); +extern (C) void thread_joinAll(); + +/*********************************** + * These are a temporary means of providing a GC hook for DLL use. They may be + * replaced with some other similar functionality later. 
+ */ +extern (C) +{ + void* gc_getProxy(); + void gc_setProxy(void* p); + void gc_clrProxy(); + + alias void* function() gcGetFn; + alias void function(void*) gcSetFn; + alias void function() gcClrFn; +} + +extern (C) void* rt_loadLibrary(in char[] name) +{ + version (Windows) + { + char[260] temp = void; + temp[0 .. name.length] = name[]; + temp[name.length] = cast(char) 0; + void* ptr = LoadLibraryA(temp.ptr); + if (ptr is null) + return ptr; + gcSetFn gcSet = cast(gcSetFn) GetProcAddress(ptr, "gc_setProxy"); + if (gcSet !is null) + gcSet(gc_getProxy()); + return ptr; + + } + else version (linux) + { + throw new Exception("rt_loadLibrary not yet implemented on linux."); + } +} + +extern (C) bool rt_unloadLibrary(void* ptr) +{ + version (Windows) + { + gcClrFn gcClr = cast(gcClrFn) GetProcAddress(ptr, "gc_clrProxy"); + if (gcClr !is null) + gcClr(); + return FreeLibrary(ptr) != 0; + } + else version (linux) + { + throw new Exception("rt_unloadLibrary not yet implemented on linux."); + } +} + +/*********************************** + * These functions must be defined for any D program linked + * against this library. + */ +extern (C) void onAssertError(string file, size_t line); +extern (C) void onAssertErrorMsg(string file, size_t line, string msg); +extern (C) void onRangeError(string file, size_t line); +extern (C) void onHiddenFuncError(Object o); +extern (C) void onSwitchError(string file, size_t line); +extern (C) bool runModuleUnitTests(); + +// this function is called from the utf module +//extern (C) void onUnicodeError(string msg, size_t idx); + +/*********************************** + * These are internal callbacks for various language errors. 
+ */ +extern (C) void _d_assert(string file, uint line) +{ + onAssertError(file, line); +} + +extern (C) static void _d_assert_msg(string msg, string file, uint line) +{ + onAssertErrorMsg(file, line, msg); +} + +extern (C) void _d_array_bounds(string file, uint line) +{ + onRangeError(file, line); +} + +extern (C) void _d_switch_error(string file, uint line) +{ + onSwitchError(file, line); +} + +extern (C) void _d_hidden_func() +{ + Object o; + asm + { + mov o, EAX; + } + onHiddenFuncError(o); +} + +bool _d_isHalting = false; + +extern (C) bool rt_isHalting() +{ + return _d_isHalting; +} + +extern (C) bool rt_trapExceptions = true; + +void _d_criticalInit() +{ + version (linux) + { + _STI_monitor_staticctor(); + _STI_critical_init(); + } +} + +alias void delegate(Throwable) ExceptionHandler; + +extern (C) bool rt_init(ExceptionHandler dg = null) +{ + _d_criticalInit(); + + try + { + gc_init(); + initStaticDataGC(); + version (Windows) + _minit(); + _moduleCtor(); + return true; + } + catch (Throwable e) + { + if (dg) + dg(e); + } + catch + { + + } + _d_criticalTerm(); + return false; +} + +void _d_criticalTerm() +{ + version (linux) + { + _STD_critical_term(); + _STD_monitor_staticdtor(); + } +} + +extern (C) bool rt_term(ExceptionHandler dg = null) +{ + try + { + thread_joinAll(); + _d_isHalting = true; + _moduleDtor(); + gc_term(); + return true; + } + catch (Throwable e) + { + if (dg) + dg(e); + } + catch + { + + } + finally + { + _d_criticalTerm(); + } + return false; +} + +/*********************************** + * The D main() function supplied by the user's program + */ +int main(char[][] args); + +/*********************************** + * Substitutes for the C main() function. + * It's purpose is to wrap the call to the D main() + * function and catch any unhandled exceptions. 
+ */ + +extern (C) int main(int argc, char **argv) +{ + char[][] args; + int result; + + version (linux) + { + _STI_monitor_staticctor(); + _STI_critical_init(); + } + + version (Windows) + { + wchar_t* wcbuf = GetCommandLineW(); + size_t wclen = wcslen(wcbuf); + int wargc = 0; + wchar_t** wargs = CommandLineToArgvW(wcbuf, &wargc); + assert(wargc == argc); + + char* cargp = null; + size_t cargl = WideCharToMultiByte(65001, 0, wcbuf, wclen, null, 0, null, 0); + + cargp = cast(char*) alloca(cargl); + args = ((cast(char[]*) alloca(wargc * (char[]).sizeof)))[0 .. wargc]; + + for (size_t i = 0, p = 0; i < wargc; i++) + { + int wlen = wcslen(wargs[i]); + int clen = WideCharToMultiByte(65001, 0, &wargs[i][0], wlen, null, 0, null, 0); + args[i] = cargp[p .. p+clen]; + p += clen; assert(p <= cargl); + WideCharToMultiByte(65001, 0, &wargs[i][0], wlen, &args[i][0], clen, null, 0); + } + LocalFree(wargs); + wargs = null; + wargc = 0; + } + else version (linux) + { + char[]* am = cast(char[]*) malloc(argc * (char[]).sizeof); + scope(exit) free(am); + + for (size_t i = 0; i < argc; i++) + { + auto len = strlen(argv[i]); + am[i] = argv[i][0 .. len]; + } + args = am[0 .. 
argc]; + } + + bool trapExceptions = rt_trapExceptions; + + void tryExec(void delegate() dg) + { + + if (trapExceptions) + { + try + { + dg(); + } + catch (Throwable e) + { + while (e) + { + if (e.file) + { + // fprintf(stderr, "%.*s(%u): %.*s\n", e.file, e.line, e.msg); + console (e.classinfo.name)("@")(e.file)("(")(e.line)("): ")(e.msg)("\n"); + } + else + { + // fprintf(stderr, "%.*s\n", e.toString()); + console (e.toString)("\n"); + } + if (e.info) + { + console ("----------------\n"); + foreach (t; e.info) + console (t)("\n"); + } + if (e.next) + console ("\n"); + e = e.next; + } + result = EXIT_FAILURE; + } + catch (Object o) + { + // fprintf(stderr, "%.*s\n", o.toString()); + console (o.toString)("\n"); + result = EXIT_FAILURE; + } + } + else + { + dg(); + } + } + + // NOTE: The lifetime of a process is much like the lifetime of an object: + // it is initialized, then used, then destroyed. If initialization + // fails, the successive two steps are never reached. However, if + // initialization succeeds, then cleanup will occur even if the use + // step fails in some way. Here, the use phase consists of running + // the user's main function. If main terminates with an exception, + // the exception is handled and then cleanup begins. An exception + // thrown during cleanup, however, will abort the cleanup process. 
+ + void runMain() + { + result = main(args); + } + + void runAll() + { + gc_init(); + initStaticDataGC(); + version (Windows) + _minit(); + _moduleCtor(); + if (runModuleUnitTests()) + tryExec(&runMain); + thread_joinAll(); + _d_isHalting = true; + _moduleDtor(); + gc_term(); + } + + tryExec(&runAll); + + version (linux) + { + _STD_critical_term(); + _STD_monitor_staticdtor(); + } + return result; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/dmain2.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/dmain2.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,394 @@ +/** + * Contains main program entry point and support routines. + * + * Copyright: Copyright Digital Mars 2000 - 2009. + * License: p.length) + { + size_t size = p.length * sizeelem; + auto info = gc_query(p.data); + + if (info.size <= newsize || info.base != p.data) + { + if (info.size >= PAGESIZE && info.base == p.data) + { // Try to extend in-place + auto u = gc_extend(p.data, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + newdata = cast(byte *)gc_malloc(newsize + 1, info.attr); + newdata[0 .. size] = p.data[0 .. size]; + } + L1: + newdata[size .. newsize] = 0; + } + } + else + { + newdata = cast(byte *)gc_calloc(newsize + 1, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + } + } + else + { + newdata = p.data; + } + + p.data = newdata; + p.length = newlength; + return newdata[0 .. newlength]; + +Loverflow: + onOutOfMemoryError(); +} + + +/** + * Resize arrays for non-zero initializers. + * p pointer to array lvalue to be updated + * newlength new .length property of array + * sizeelem size of each element of array + * initsize size of initializer + * ... 
initializer + */ +extern (C) byte[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p) +in +{ + assert(!p.length || p.data); +} +body +{ + byte* newdata; + size_t sizeelem = ti.next.tsize(); + void[] initializer = ti.next.init(); + size_t initsize = initializer.length; + + assert(sizeelem); + assert(initsize); + assert(initsize <= sizeelem); + assert((sizeelem / initsize) * initsize == sizeelem); + + debug(PRINTF) + { + printf("_d_arraysetlengthiT(p = %p, sizeelem = %d, newlength = %d, initsize = %d)\n", p, sizeelem, newlength, initsize); + if (p) + printf("\tp.data = %p, p.length = %d\n", p.data, p.length); + } + + if (newlength) + { + version (D_InlineAsm_X86) + { + size_t newsize = void; + + asm + { + mov EAX,newlength ; + mul EAX,sizeelem ; + mov newsize,EAX ; + jc Loverflow ; + } + } + else + { + size_t newsize = sizeelem * newlength; + + if (newsize / newlength != sizeelem) + goto Loverflow; + } + debug(PRINTF) printf("newsize = %x, newlength = %x\n", newsize, newlength); + + size_t size = p.length * sizeelem; + + if (p.data) + { + newdata = p.data; + if (newlength > p.length) + { + auto info = gc_query(p.data); + + if (info.size <= newsize || info.base != p.data) + { + if (info.size >= PAGESIZE && info.base == p.data) + { // Try to extend in-place + auto u = gc_extend(p.data, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + newdata = cast(byte *)gc_malloc(newsize + 1, info.attr); + newdata[0 .. size] = p.data[0 .. size]; + L1: ; + } + } + } + else + { + newdata = cast(byte *)gc_malloc(newsize + 1, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + } + + auto q = initializer.ptr; // pointer to initializer + + if (newsize > size) + { + if (initsize == 1) + { + debug(PRINTF) printf("newdata = %p, size = %d, newsize = %d, *q = %d\n", newdata, size, newsize, *cast(byte*)q); + newdata[size .. 
newsize] = *(cast(byte*)q); + } + else + { + for (size_t u = size; u < newsize; u += initsize) + { + memcpy(newdata + u, q, initsize); + } + } + } + } + else + { + newdata = p.data; + } + + p.data = newdata; + p.length = newlength; + return newdata[0 .. newlength]; + +Loverflow: + onOutOfMemoryError(); +} + + +/** + * Append y[] to array x[]. + * size is size of each array element. + */ +extern (C) long _d_arrayappendT(TypeInfo ti, Array *px, byte[] y) +{ + auto sizeelem = ti.next.tsize(); // array element size + auto info = gc_query(px.data); + auto length = px.length; + auto newlength = length + y.length; + auto newsize = newlength * sizeelem; + + if (info.size < newsize || info.base != px.data) + { byte* newdata; + + if (info.size >= PAGESIZE && info.base == px.data) + { // Try to extend in-place + auto u = gc_extend(px.data, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + newdata = cast(byte *)gc_malloc(newCapacity(newlength, sizeelem) + 1, info.attr); + memcpy(newdata, px.data, length * sizeelem); + px.data = newdata; + } + L1: + px.length = newlength; + memcpy(px.data + length * sizeelem, y.ptr, y.length * sizeelem); + return *cast(long*)px; +} + + +/** + * Computes the number of bytes to allocate for an array of newlength elements of size bytes each; the result is never less than newlength * size. + */ +size_t newCapacity(size_t newlength, size_t size) +{ + version(none) + { + size_t newcap = newlength * size; + } + else + { + /* + * Better version by Dave Fladebo: + * This uses an inverse logarithmic algorithm to pre-allocate a bit more + * space for larger arrays. + * - Arrays smaller than PAGESIZE bytes are left as-is, so for the most + * common cases, memory allocation is 1 to 1. The small overhead added + * doesn't affect small array perf. (it's virtually the same as + * current). + * - Larger arrays have some space pre-allocated. + * - As the arrays grow, the relative pre-allocated space shrinks. 
+ * - The logarithmic algorithm allocates relatively more space for + * mid-size arrays, making it very fast for medium arrays (for + * mid-to-large arrays, this turns out to be quite a bit faster than the + * equivalent realloc() code in C, on Linux at least. Small arrays are + * just as fast as GCC). + * - Perhaps most importantly, overall memory usage and stress on the GC + * is decreased significantly for demanding environments. + */ + size_t newcap = newlength * size; + size_t newext = 0; + + if (newcap > PAGESIZE) + { + //double mult2 = 1.0 + (size / log10(pow(newcap * 2.0,2.0))); + + // redo above line using only integer math + + static int log2plus1(size_t c) + { int i; + + if (c == 0) + i = -1; + else + for (i = 1; c >>= 1; i++) + { + } + return i; + } + + /* The following setting for mult sets how much bigger + * the new size will be over what is actually needed. + * 100 means the same size, more means proportionally more. + * More means faster but more memory consumption. + */ + //long mult = 100 + (1000L * size) / (6 * log2plus1(newcap)); + long mult = 100 + (1000L * size) / log2plus1(newcap); + + // testing shows 1.02 for large arrays is about the point of diminishing return + if (mult < 102) + mult = 102; + newext = cast(size_t)((newcap * mult) / 100); + newext -= newext % size; + debug(PRINTF) printf("mult: %2.2f, alloc: %2.2f\n",mult/100.0,newext / cast(double)size); + } + newcap = newext > newcap ? newext : newcap; + debug(PRINTF) printf("newcap = %d, newlength = %d, size = %d\n", newcap, newlength, size); + } + return newcap; +} + + +/** + * + */ +extern (C) byte[] _d_arrayappendcT(TypeInfo ti, inout byte[] x, ...) 
+{ + auto sizeelem = ti.next.tsize(); // array element size + auto info = gc_query(x.ptr); + auto length = x.length; + auto newlength = length + 1; + auto newsize = newlength * sizeelem; + + assert(info.size == 0 || length * sizeelem <= info.size); + + debug(PRINTF) printf("_d_arrayappendcT(sizeelem = %d, ptr = %p, length = %d, cap = %d)\n", sizeelem, x.ptr, x.length, info.size); + + if (info.size <= newsize || info.base != x.ptr) + { byte* newdata; + + if (info.size >= PAGESIZE && info.base == x.ptr) + { // Try to extend in-place + auto u = gc_extend(x.ptr, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + debug(PRINTF) printf("_d_arrayappendcT(length = %d, newlength = %d, cap = %d)\n", length, newlength, info.size); + auto newcap = newCapacity(newlength, sizeelem); + assert(newcap >= newlength * sizeelem); + newdata = cast(byte *)gc_malloc(newcap + 1, info.attr); + memcpy(newdata, x.ptr, length * sizeelem); + (cast(void**)(&x))[1] = newdata; + } + L1: + byte *argp = cast(byte *)(&ti + 2); + + *cast(size_t *)&x = newlength; + x.ptr[length * sizeelem .. newsize] = argp[0 .. 
sizeelem]; + assert((cast(size_t)x.ptr & 15) == 0); + assert(gc_sizeOf(x.ptr) > x.length * sizeelem); + return x; +} + + +/** + * + */ +extern (C) byte[] _d_arraycatT(TypeInfo ti, byte[] x, byte[] y) +out (result) +{ + auto sizeelem = ti.next.tsize(); // array element size + debug(PRINTF) printf("_d_arraycatT(%d,%p ~ %d,%p sizeelem = %d => %d,%p)\n", x.length, x.ptr, y.length, y.ptr, sizeelem, result.length, result.ptr); + assert(result.length == x.length + y.length); + for (size_t i = 0; i < x.length * sizeelem; i++) + assert((cast(byte*)result)[i] == (cast(byte*)x)[i]); + for (size_t i = 0; i < y.length * sizeelem; i++) + assert((cast(byte*)result)[x.length * sizeelem + i] == (cast(byte*)y)[i]); + + size_t cap = gc_sizeOf(result.ptr); + assert(!cap || cap > result.length * sizeelem); +} +body +{ + version (none) + { + /* Cannot use this optimization because: + * char[] a, b; + * char c = 'a'; + * b = a ~ c; + * c = 'b'; + * will change the contents of b. + */ + if (!y.length) + return x; + if (!x.length) + return y; + } + + debug(PRINTF) printf("_d_arraycatT(%d,%p ~ %d,%p)\n", x.length, x.ptr, y.length, y.ptr); + auto sizeelem = ti.next.tsize(); // array element size + debug(PRINTF) printf("_d_arraycatT(%d,%p ~ %d,%p sizeelem = %d)\n", x.length, x.ptr, y.length, y.ptr, sizeelem); + size_t xlen = x.length * sizeelem; + size_t ylen = y.length * sizeelem; + size_t len = xlen + ylen; + + if (!len) + return null; + + byte* p = cast(byte*)gc_malloc(len + 1, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + memcpy(p, x.ptr, xlen); + memcpy(p + xlen, y.ptr, ylen); + p[len] = 0; + return p[0 .. x.length + y.length]; +} + + +/** + * Concatenates the n byte[] arguments that follow n into one newly GC-allocated array of ti.next elements; returns null when their total length is 0. + */ +extern (C) byte[] _d_arraycatnT(TypeInfo ti, uint n, ...) 
+{ void* a; + size_t length; + byte[]* p; + uint i; + byte[] b; + auto size = ti.next.tsize(); // array element size + + p = cast(byte[]*)(&n + 1); + + for (i = 0; i < n; i++) + { + b = *p++; + length += b.length; + } + if (!length) + return null; + + a = gc_malloc(length * size, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + p = cast(byte[]*)(&n + 1); + + size_t j = 0; /* byte offset into a; size_t so it cannot wrap before length * size does */ + for (i = 0; i < n; i++) + { + b = *p++; + if (b.length) + { + memcpy(a + j, b.ptr, b.length * size); + j += b.length * size; + } + } + + byte[] result; + *cast(size_t *)&result = length; // jam length + (cast(void **)&result)[1] = a; // jam ptr + return result; +} + + +/** + * + */ +extern (C) void* _d_arrayliteralT(TypeInfo ti, size_t length, ...) +{ + auto sizeelem = ti.next.tsize(); // array element size + void* result; + + debug(PRINTF) printf("_d_arrayliteralT(sizeelem = %d, length = %d)\n", sizeelem, length); + if (length == 0 || sizeelem == 0) + result = null; + else + { + result = gc_malloc(length * sizeelem, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + + va_list q; + va_start!(size_t)(q, length); + + size_t stacksize = (sizeelem + int.sizeof - 1) & ~(int.sizeof - 1); + + if (stacksize == sizeelem) + { + memcpy(result, q, length * sizeelem); + } + else + { + for (size_t i = 0; i < length; i++) + { + memcpy(result + i * sizeelem, q, sizeelem); + q += stacksize; + } + } + + va_end(q); + } + return result; +} + + +/** + * Support for array.dup property. + */ +struct Array2 +{ + size_t length; + void* ptr; +} + + +/** + * + */ +extern (C) long _adDupT(TypeInfo ti, Array2 a) +out (result) +{ + auto sizeelem = ti.next.tsize(); // array element size + assert(memcmp((*cast(Array2*)&result).ptr, a.ptr, a.length * sizeelem) == 0); +} +body +{ + Array2 r; + + if (a.length) + { + auto sizeelem = ti.next.tsize(); // array element size + auto size = a.length * sizeelem; + r.ptr = gc_malloc(size, !(ti.next.flags() & 1) ? 
BlkAttr.NO_SCAN : 0); + r.length = a.length; + memcpy(r.ptr, a.ptr, size); + } + return *cast(long*)(&r); +} + + +unittest +{ + int[] a; + int[] b; + int i; + + a = new int[3]; + a[0] = 1; a[1] = 2; a[2] = 3; + b = a.dup; + assert(b.length == 3); + for (i = 0; i < 3; i++) + assert(b[i] == i + 1); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/llmath.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/llmath.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,531 @@ +/** + * Support for 64-bit longs. + * + * Copyright: Copyright Digital Mars 1993 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright, Sean Kelly + * + * Copyright Digital Mars 2000 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#include + +#if __cplusplus +extern "C" { +#endif + +struct ClassInfo; +struct Vtbl; + +typedef struct Vtbl +{ + size_t len; + void **vptr; +} Vtbl; + +typedef struct Interface +{ + struct ClassInfo *classinfo; + struct Vtbl vtbl; + int offset; +} Interface; + +typedef struct Object +{ + void **vptr; + void *monitor; +} Object; + +typedef struct ClassInfo +{ + Object object; + + size_t initlen; + void *init; + + size_t namelen; + char *name; + + Vtbl vtbl; + + size_t interfacelen; + Interface *interfaces; + + struct ClassInfo *baseClass; + + void *destructor; + void *invariant; + + int flags; +} ClassInfo; + +typedef struct Throwable +{ + Object object; + + size_t msglen; + char* msg; + + size_t filelen; + char* file; + + size_t line; + + struct Interface *info; + struct Throwable *next; +} Throwable; + +typedef struct Array +{ + size_t length; + void *ptr; +} Array; + +typedef struct Delegate +{ + void *thisptr; + void (*funcptr)(); +} Delegate; + +void _d_monitorenter(Object *h); +void _d_monitorexit(Object *h); + +int _d_isbaseof(ClassInfo *b, ClassInfo *c); +Object *_d_dynamic_cast(Object *o, 
ClassInfo *ci); + +Object * _d_newclass(ClassInfo *ci); +void _d_delclass(Object **p); + +void _d_OutOfMemory(); + +#if __cplusplus +} +#endif diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/memory.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/memory.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,213 @@ +/** + * This module exposes functionality for inspecting and manipulating memory. + * + * Copyright: Copyright Digital Mars 2000 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright + * + * Copyright Digital Mars 2004 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.memset; + +extern (C) +{ + // Functions from the C library. + void *memcpy(void *, void *, size_t); +} + +extern (C): + +short *_memset16(short *p, short value, size_t count) +{ + short *pstart = p; + short *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +int *_memset32(int *p, int value, size_t count) +{ +version (X86) +{ + asm + { + mov EDI,p ; + mov EAX,value ; + mov ECX,count ; + mov EDX,EDI ; + rep ; + stosd ; + mov EAX,EDX ; + } +} +else +{ + int *pstart = p; + int *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} +} + +long *_memset64(long *p, long value, size_t count) +{ + long *pstart = p; + long *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +cdouble *_memset128(cdouble *p, cdouble value, size_t count) +{ + cdouble *pstart = p; + cdouble *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +real *_memset80(real *p, real value, size_t count) +{ + real *pstart = p; + real *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +creal *_memset160(creal *p, creal value, size_t count) +{ + creal *pstart = p; + creal *ptop; + + for (ptop = 
&p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +void *_memsetn(void *p, void *value, int count, size_t sizelem) +{ void *pstart = p; + int i; + + for (i = 0; i < count; i++) + { + memcpy(p, value, sizelem); + p = cast(void *)(cast(char *)p + sizelem); + } + return pstart; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/minit.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/minit.asm Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,85 @@ +;_ minit.asm +; Module initialization support. +; +; Copyright: Copyright Digital Mars 2000 - 2009. +; License: Boost License 1.0. + * Authors: Walter Bright, Sean Kelly + * + * Copyright Digital Mars 2000 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#include +#include +#include + +#if _WIN32 +#elif linux || __APPLE__ +#define USE_PTHREADS 1 +#else +#endif + +#if _WIN32 +#include +#endif + +#if USE_PTHREADS +#include +#endif + +#include "mars.h" + +// This is what the monitor reference in Object points to +typedef struct Monitor +{ + void* impl; // for user-level monitors + Array devt; // for internal monitors + +#if _WIN32 + CRITICAL_SECTION mon; +#endif + +#if USE_PTHREADS + pthread_mutex_t mon; +#endif +} Monitor; + +#define MONPTR(h) (&((Monitor *)(h)->monitor)->mon) + +static volatile int inited; + +/* =============================== Win32 ============================ */ + +#if _WIN32 + +static CRITICAL_SECTION _monitor_critsec; + +void _STI_monitor_staticctor() +{ + if (!inited) + { InitializeCriticalSection(&_monitor_critsec); + inited = 1; + } +} + +void _STD_monitor_staticdtor() +{ + if (inited) + { inited = 0; + DeleteCriticalSection(&_monitor_critsec); + } +} + +void _d_monitor_create(Object *h) +{ + /* + * NOTE: Assume this is only called when h->monitor is null prior to the + * call. 
However, please note that another thread may call this function + * at the same time, so we can not assert this here. Instead, try and + * create a lock, and if one already exists then forget about it. + */ + + //printf("+_d_monitor_create(%p)\n", h); + assert(h); + Monitor *cs = NULL; + EnterCriticalSection(&_monitor_critsec); + if (!h->monitor) + { + cs = (Monitor *)calloc(sizeof(Monitor), 1); + assert(cs); + InitializeCriticalSection(&cs->mon); + h->monitor = (void *)cs; + cs = NULL; + } + LeaveCriticalSection(&_monitor_critsec); + if (cs) + free(cs); + //printf("-_d_monitor_create(%p)\n", h); +} + +void _d_monitor_destroy(Object *h) +{ + //printf("+_d_monitor_destroy(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + DeleteCriticalSection(MONPTR(h)); + free((void *)h->monitor); + h->monitor = NULL; + //printf("-_d_monitor_destroy(%p)\n", h); +} + +/* Enters the critical section of h's monitor. h must be non-NULL and already carry a monitor whose impl field is NULL (asserted below). Always returns 0. */ +int _d_monitor_lock(Object *h) +{ + //printf("+_d_monitor_acquire(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + EnterCriticalSection(MONPTR(h)); + //printf("-_d_monitor_acquire(%p)\n", h); + return 0; /* the function is declared int; do not fall off the end without a value */ +} + +void _d_monitor_unlock(Object *h) +{ + //printf("+_d_monitor_release(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + LeaveCriticalSection(MONPTR(h)); + //printf("-_d_monitor_release(%p)\n", h); +} + +#endif + +/* =============================== linux ============================ */ + +#if USE_PTHREADS + +#ifndef PTHREAD_MUTEX_RECURSIVE +# define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP +#endif + +// Includes attribute fixes from David Friedman's GDC port + +static pthread_mutex_t _monitor_critsec; +static pthread_mutexattr_t _monitors_attr; + +void _STI_monitor_staticctor() +{ + if (!inited) + { + pthread_mutexattr_init(&_monitors_attr); + pthread_mutexattr_settype(&_monitors_attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&_monitor_critsec, 0); + inited = 1; + } +} + +void _STD_monitor_staticdtor() +{ + if (inited) + { inited = 
0; + pthread_mutex_destroy(&_monitor_critsec); + pthread_mutexattr_destroy(&_monitors_attr); + } +} + +void _d_monitor_create(Object *h) +{ + /* + * NOTE: Assume this is only called when h->monitor is null prior to the + * call. However, please note that another thread may call this function + * at the same time, so we can not assert this here. Instead, try and + * create a lock, and if one already exists then forget about it. + */ + + //printf("+_d_monitor_create(%p)\n", h); + assert(h); + Monitor *cs = NULL; + pthread_mutex_lock(&_monitor_critsec); + if (!h->monitor) + { + cs = (Monitor *)calloc(sizeof(Monitor), 1); + assert(cs); + pthread_mutex_init(&cs->mon, & _monitors_attr); + h->monitor = (void *)cs; + cs = NULL; + } + pthread_mutex_unlock(&_monitor_critsec); + if (cs) + free(cs); + //printf("-_d_monitor_create(%p)\n", h); +} + +void _d_monitor_destroy(Object *h) +{ + //printf("+_d_monitor_destroy(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + pthread_mutex_destroy(MONPTR(h)); + free((void *)h->monitor); + h->monitor = NULL; + //printf("-_d_monitor_destroy(%p)\n", h); +} + +/* Locks the pthread mutex of h's monitor. h must be non-NULL and already carry a monitor whose impl field is NULL (asserted below). Returns the pthread_mutex_lock() status, which is 0 on success. */ +int _d_monitor_lock(Object *h) +{ + //printf("+_d_monitor_acquire(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + int status = pthread_mutex_lock(MONPTR(h)); + //printf("-_d_monitor_acquire(%p)\n", h); + return status; /* the function is declared int; do not fall off the end without a value */ +} + +void _d_monitor_unlock(Object *h) +{ + //printf("+_d_monitor_release(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + pthread_mutex_unlock(MONPTR(h)); + //printf("-_d_monitor_release(%p)\n", h); +} + +#endif diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/obj.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/obj.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,34 @@ +/** + * Contains object comparator functions called by generated code. + * + * Copyright: Copyright Digital Mars 2002 - 2009. + * License: Boost License 1.0. 
+ * Authors: Walter Bright, Sean Kelly + * + * Copyright Digital Mars 2000 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ + +module object; + +private +{ + import core.stdc.string; + import core.stdc.stdlib; + // HACK: This versioning is to provide for the different treatment of + // imports in a normal vs. a -lib build. It should really be fixed + // correctly before the next release. + version (D_Ddoc) + import util.string; + else + import rt.util.string; + debug(PRINTF) import core.stdc.stdio; + + extern (C) void onOutOfMemoryError(); + extern (C) Object _d_newclass(ClassInfo ci); +} + +// NOTE: For some reason, this declaration method doesn't work +// in this particular file (and this file only). It must +// be a DMD thing. +//alias typeof(int.sizeof) size_t; +//alias typeof(cast(void*)0 - cast(void*)0) ptrdiff_t; + +version(X86_64) +{ + alias ulong size_t; + alias long ptrdiff_t; +} +else +{ + alias uint size_t; + alias int ptrdiff_t; +} + +alias size_t hash_t; +alias bool equals_t; + +alias immutable(char)[] string; +alias immutable(wchar)[] wstring; +alias immutable(dchar)[] dstring; + +/** + * All D class objects inherit from Object. + */ +class Object +{ + /** + * Convert Object to a human readable string. + */ + string toString() + { + return this.classinfo.name; + } + + /** + * Compute hash function for Object. + */ + hash_t toHash() + { + // BUG: this prevents a compacting GC from working, needs to be fixed + return cast(hash_t)cast(void*)this; + } + + /** + * Compare with another Object obj. 
+ * Returns: + * $(TABLE + * $(TR $(TD this < obj) $(TD < 0)) + * $(TR $(TD this == obj) $(TD 0)) + * $(TR $(TD this > obj) $(TD > 0)) + * ) + */ + int opCmp(Object o) + { + // BUG: this prevents a compacting GC from working, needs to be fixed + //return cast(int)cast(void*)this - cast(int)cast(void*)o; + + throw new Exception("need opCmp for class " ~ this.classinfo.name); + //return this !is o; + } + + /** + * Returns !=0 if this object does have the same contents as obj. + */ + equals_t opEquals(Object o) + { + return this is o; + } + + interface Monitor + { + void lock(); + void unlock(); + } + + /** + * Create instance of class specified by classname. + * The class must either have no constructors or have + * a default constructor. + * Returns: + * null if failed + */ + static Object factory(string classname) + { + auto ci = ClassInfo.find(classname); + if (ci) + { + return ci.create(); + } + return null; + } +} + +/** + * Information about an interface. + * When an object is accessed via an interface, an Interface* appears as the + * first entry in its vtbl. + */ +struct Interface +{ + ClassInfo classinfo; /// .classinfo for this interface (not for containing class) + void*[] vtbl; + ptrdiff_t offset; /// offset to Interface 'this' from Object 'this' +} + +/** + * Runtime type information about a class. Can be retrieved for any class type + * or instance by using the .classinfo property. + * A pointer to this appears as the first entry in the class's vtbl[]. 
+ */ +class ClassInfo : Object +{ + byte[] init; /** class static initializer + * (init.length gives size in bytes of class) + */ + string name; /// class name + void*[] vtbl; /// virtual function pointer table + Interface[] interfaces; /// interfaces this class implements + ClassInfo base; /// base class + void* destructor; + void function(Object) classInvariant; + uint flags; + // 1: // is IUnknown or is derived from IUnknown + // 2: // has no possible pointers into GC memory + // 4: // has offTi[] member + // 8: // has constructors + // 16: // has xgetMembers member + // 32: // has typeinfo member + void* deallocator; + OffsetTypeInfo[] offTi; + void function(Object) defaultConstructor; // default Constructor + const(MemberInfo[]) function(in char[]) xgetMembers; + TypeInfo typeinfo; + + /** + * Search all modules for ClassInfo corresponding to classname. + * Returns: null if not found + */ + static ClassInfo find(in char[] classname) + { + foreach (m; ModuleInfo) + { + //writefln("module %s, %d", m.name, m.localClasses.length); + foreach (c; m.localClasses) + { + //writefln("\tclass %s", c.name); + if (c.name == classname) + return c; + } + } + return null; + } + + /** + * Create instance of Object represented by 'this'. + */ + Object create() + { + if (flags & 8 && !defaultConstructor) + return null; + Object o = _d_newclass(this); + if (flags & 8 && defaultConstructor) + { + defaultConstructor(o); + } + return o; + } + + /** + * Search for all members with the name 'name'. + * If name[] is null, return all members. + */ + const(MemberInfo[]) getMembers(in char[] name) + { + if (flags & 16 && xgetMembers) + return xgetMembers(name); + return null; + } +} + +/** + * Array of pairs giving the offset and type information for each + * member in an aggregate. + */ +struct OffsetTypeInfo +{ + size_t offset; /// Offset of member from start of object + TypeInfo ti; /// TypeInfo for this member +} + +/** + * Runtime type information about a type. 
+ * Can be retrieved for any type using a + * TypeidExpression. + */ +class TypeInfo +{ + override hash_t toHash() + { + hash_t hash; + + foreach (char c; this.toString()) + hash = hash * 9 + c; + return hash; + } + + override int opCmp(Object o) + { + if (this is o) + return 0; + TypeInfo ti = cast(TypeInfo)o; + if (ti is null) + return 1; + return dstrcmp(this.toString(), ti.toString()); + } + + override equals_t opEquals(Object o) + { + /* TypeInfo instances are singletons, but duplicates can exist + * across DLL's. Therefore, comparing for a name match is + * sufficient. + */ + if (this is o) + return true; + TypeInfo ti = cast(TypeInfo)o; + return ti && this.toString() == ti.toString(); + } + + /// Returns a hash of the instance of a type. + hash_t getHash(in void* p) { return cast(hash_t)p; } + + /// Compares two instances for equality. + equals_t equals(in void* p1, in void* p2) { return p1 == p2; } + + /// Compares two instances for <, ==, or >. + int compare(in void* p1, in void* p2) { return 0; } + + /// Returns size of the type. + size_t tsize() { return 0; } + + /// Swaps two instances of the type. + void swap(void* p1, void* p2) + { + size_t n = tsize(); + for (size_t i = 0; i < n; i++) + { + byte t = (cast(byte *)p1)[i]; + (cast(byte*)p1)[i] = (cast(byte*)p2)[i]; + (cast(byte*)p2)[i] = t; + } + } + + /// Get TypeInfo for 'next' type, as defined by what kind of type this is, + /// null if none. 
+ TypeInfo next() { return null; } + + /// Return default initializer, null if default initialize to 0 + void[] init() { return null; } + + /// Get flags for type: 1 means GC should scan for pointers + uint flags() { return 0; } + + /// Get type information on the contents of the type; null if not available + OffsetTypeInfo[] offTi() { return null; } + /// Run the destructor on the object and all its sub-objects + void destroy(void* p) {} + /// Run the postblit on the object and all its sub-objects + void postblit(void* p) {} +} + +class TypeInfo_Typedef : TypeInfo +{ + override string toString() { return name; } + + override equals_t opEquals(Object o) + { + TypeInfo_Typedef c; + return this is o || + ((c = cast(TypeInfo_Typedef)o) !is null && + this.name == c.name && + this.base == c.base); + } + + override hash_t getHash(in void* p) { return base.getHash(p); } + override equals_t equals(in void* p1, in void* p2) { return base.equals(p1, p2); } + override int compare(in void* p1, in void* p2) { return base.compare(p1, p2); } + override size_t tsize() { return base.tsize(); } + override void swap(void* p1, void* p2) { return base.swap(p1, p2); } + + override TypeInfo next() { return base.next(); } + override uint flags() { return base.flags(); } + override void[] init() { return m_init.length ? 
m_init : base.init(); } + + TypeInfo base; + string name; + void[] m_init; +} + +class TypeInfo_Enum : TypeInfo_Typedef +{ + +} + +class TypeInfo_Pointer : TypeInfo +{ + override string toString() { return m_next.toString() ~ "*"; } + + override equals_t opEquals(Object o) + { + TypeInfo_Pointer c; + return this is o || + ((c = cast(TypeInfo_Pointer)o) !is null && + this.m_next == c.m_next); + } + + override hash_t getHash(in void* p) + { + return cast(hash_t)*cast(void**)p; + } + + override equals_t equals(in void* p1, in void* p2) + { + return *cast(void**)p1 == *cast(void**)p2; + } + + override int compare(in void* p1, in void* p2) + { + if (*cast(void**)p1 < *cast(void**)p2) + return -1; + else if (*cast(void**)p1 > *cast(void**)p2) + return 1; + else + return 0; + } + + override size_t tsize() + { + return (void*).sizeof; + } + + override void swap(void* p1, void* p2) + { + void* tmp = *cast(void**)p1; + *cast(void**)p1 = *cast(void**)p2; + *cast(void**)p2 = tmp; + } + + override TypeInfo next() { return m_next; } + override uint flags() { return 1; } + + TypeInfo m_next; +} + +class TypeInfo_Array : TypeInfo +{ + override string toString() { return value.toString() ~ "[]"; } + + override equals_t opEquals(Object o) + { + TypeInfo_Array c; + return this is o || + ((c = cast(TypeInfo_Array)o) !is null && + this.value == c.value); + } + + override hash_t getHash(in void* p) + { + size_t sz = value.tsize(); + hash_t hash = 0; + void[] a = *cast(void[]*)p; + for (size_t i = 0; i < a.length; i++) + hash += value.getHash(a.ptr + i * sz) * 11; + return hash; + } + + override equals_t equals(in void* p1, in void* p2) + { + void[] a1 = *cast(void[]*)p1; + void[] a2 = *cast(void[]*)p2; + if (a1.length != a2.length) + return false; + size_t sz = value.tsize(); + for (size_t i = 0; i < a1.length; i++) + { + if (!value.equals(a1.ptr + i * sz, a2.ptr + i * sz)) + return false; + } + return true; + } + + override int compare(in void* p1, in void* p2) + { + void[] a1 = 
*cast(void[]*)p1;
+        void[] a2 = *cast(void[]*)p2;
+        size_t sz = value.tsize();
+        size_t len = a1.length;
+
+        // Compare the common prefix element by element; the first non-zero
+        // element comparison decides the result.
+        if (a2.length < len)
+            len = a2.length;
+        for (size_t u = 0; u < len; u++)
+        {
+            int result = value.compare(a1.ptr + u * sz, a2.ptr + u * sz);
+            if (result)
+                return result;
+        }
+        // Common prefix is equal: order by length.
+        return cast(int)a1.length - cast(int)a2.length;
+    }
+
+    /// Size of the array reference itself (void[]), not of the referenced data.
+    override size_t tsize()
+    {
+        return (void[]).sizeof;
+    }
+
+    /// Exchanges the two array references stored at p1 and p2.
+    override void swap(void* p1, void* p2)
+    {
+        void[] tmp = *cast(void[]*)p1;
+        *cast(void[]*)p1 = *cast(void[]*)p2;
+        *cast(void[]*)p2 = tmp;
+    }
+
+    TypeInfo value;     // TypeInfo of the element type
+
+    override TypeInfo next()
+    {
+        return value;
+    }
+
+    override uint flags() { return 1; }
+}
+
+/**
+ * Run-time type information for a static array of `len` elements whose
+ * element type is described by `value`.
+ */
+class TypeInfo_StaticArray : TypeInfo
+{
+    /// Renders as "<element type>[<len>]".
+    override string toString()
+    {
+        char[10] tmp = void;
+        return cast(string)(value.toString() ~ "[" ~ tmp.intToString(len) ~ "]");
+    }
+
+    /// Equal when o is the same object, or a TypeInfo_StaticArray with the
+    /// same length and an equal element TypeInfo.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_StaticArray c;
+        return this is o ||
+               ((c = cast(TypeInfo_StaticArray)o) !is null &&
+                this.len == c.len &&
+                this.value == c.value);
+    }
+
+    /// Sum of the element hashes of the len elements starting at p.
+    override hash_t getHash(in void* p)
+    {
+        size_t sz = value.tsize();
+        hash_t hash = 0;
+        for (size_t i = 0; i < len; i++)
+            hash += value.getHash(p + i * sz);
+        return hash;
+    }
+
+    /// True when every pair of corresponding elements compares equal.
+    override equals_t equals(in void* p1, in void* p2)
+    {
+        size_t sz = value.tsize();
+
+        for (size_t u = 0; u < len; u++)
+        {
+            if (!value.equals(p1 + u * sz, p2 + u * sz))
+                return false;
+        }
+        return true;
+    }
+
+    /// Element-wise comparison; returns the first non-zero element result,
+    /// or 0 when all len elements compare equal.
+    override int compare(in void* p1, in void* p2)
+    {
+        size_t sz = value.tsize();
+
+        for (size_t u = 0; u < len; u++)
+        {
+            int result = value.compare(p1 + u * sz, p2 + u * sz);
+            if (result)
+                return result;
+        }
+        return 0;
+    }
+
+    /// Total size in bytes: len elements of value.tsize() bytes each.
+    override size_t tsize()
+    {
+        return len * value.tsize();
+    }
+
+    /// Exchanges the two static arrays at p1 and p2, one element
+    /// (value.tsize() bytes) at a time, through a scratch buffer.
+    override void swap(void* p1, void* p2)
+    {
+        void* tmp;
+        size_t sz = value.tsize();
+        ubyte[16] buffer;
+        void* pbuffer;
+
+        // Small elements use the stack buffer; larger ones get a heap
+        // scratch area that is released after the loop.
+        if (sz < buffer.sizeof)
+            tmp = buffer.ptr;
+        else
+            tmp = pbuffer = (new void[sz]).ptr;
+
+        // u is an element index: the loop body scales it by sz to obtain the
+        // byte offset, so it must advance one element per iteration.
+        // Stepping by sz skipped every element whose index was not a
+        // multiple of sz.
+        for (size_t u = 0; u < len; u++)
+        {   size_t o = u * sz;              // u scaled by the element size
+            memcpy(tmp, p1 + o, sz);
+            memcpy(p1 + o, p2 + o, sz);
+            memcpy(p2 + o, tmp, sz);
+        }
+        // Release the heap scratch buffer if one was allocated.
+        if (pbuffer)
+            delete pbuffer;
+    }
+
+    // These three simply forward to the element TypeInfo.
+    override void[] init() { return value.init(); }
+    override TypeInfo next() { return value; }
+    override uint flags() { return value.flags(); }
+
+    /// Calls value.destroy on each of the len elements, last element first.
+    override void destroy(void* p)
+    {
+        auto sz = value.tsize();
+        p += sz * len;
+        foreach (i; 0 .. len)
+        {
+            p -= sz;
+            value.destroy(p);
+        }
+    }
+
+    /// Calls value.postblit on each of the len elements, first element first.
+    override void postblit(void* p)
+    {
+        auto sz = value.tsize();
+        foreach (i; 0 .. len)
+        {
+            value.postblit(p);
+            p += sz;
+        }
+    }
+
+    TypeInfo value;     // TypeInfo of the element type
+    size_t len;         // number of elements
+}
+
+/// Run-time type information for an associative array type.
+class TypeInfo_AssociativeArray : TypeInfo
+{
+    /// Renders as "<value type>[<key type>]"; next() returns `value`.
+    override string toString()
+    {
+        return cast(string)(next.toString() ~ "[" ~ key.toString() ~ "]");
+    }
+
+    /// Equal when o is the same object, or a TypeInfo_AssociativeArray with
+    /// equal key and value TypeInfos.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_AssociativeArray c;
+        return this is o ||
+               ((c = cast(TypeInfo_AssociativeArray)o) !is null &&
+                this.key == c.key &&
+                this.value == c.value);
+    }
+
+    // BUG: need to add the rest of the functions
+
+    /// Size of an associative-array reference (taken from char[int]).
+    override size_t tsize()
+    {
+        return (char[int]).sizeof;
+    }
+
+    override TypeInfo next() { return value; }
+    override uint flags() { return 1; }
+
+    TypeInfo value;     // TypeInfo of the value type
+    TypeInfo key;       // TypeInfo of the key type
+}
+
+/// Run-time type information for a function type.
+class TypeInfo_Function : TypeInfo
+{
+    /// Renders as "<next>()".
+    override string toString()
+    {
+        return cast(string)(next.toString() ~ "()");
+    }
+
+    /// Equal when o is the same object, or a TypeInfo_Function with an
+    /// equal `next`.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Function c;
+        return this is o ||
+               ((c = cast(TypeInfo_Function)o) !is null &&
+                this.next == c.next);
+    }
+
+    // BUG: need to add the rest of the functions
+
+    override size_t tsize()
+    {
+        return 0;       // no size for functions
+    }
+
+    TypeInfo next;      // presumably the return type's TypeInfo -- TODO confirm
+}
+
+/// Run-time type information for a delegate type.
+class TypeInfo_Delegate : TypeInfo
+{
+    /// Renders as "<next> delegate()".
+    override string toString()
+    {
+        return cast(string)(next.toString() ~ " delegate()");
+    }
+
+    /// Equal when o is the same object, or a TypeInfo_Delegate with an
+    /// equal `next`.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Delegate c;
+        return this is o ||
+               ((c = cast(TypeInfo_Delegate)o) !is null &&
+                this.next == c.next);
+    }
+
+    // BUG: need to add the rest of the functions
+
+    /// Size of a delegate value (taken from `int delegate()`).
+    override size_t tsize()
+    {
+        alias int delegate() dg;
+        return dg.sizeof;
+    }
+
+    override uint flags() { return 1; }
+
+    TypeInfo next;
+}
+
+/**
+ * Run-time type information for a class type; `info` is the ClassInfo of
+ * the class being described.
+ */
+class TypeInfo_Class : TypeInfo
+{
+    /// Name of the described class.
+    override string toString() { return info.name; }
+
+    /// Equal when o is the same object, or a TypeInfo_Class describing a
+    /// class with the same name.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Class c;
+        // Compare against c.info (the class c describes).  c.classinfo is the
+        // ClassInfo of the TypeInfo_Class object itself, so comparing with
+        // its name could never match the described class's name.
+        return this is o ||
+               ((c = cast(TypeInfo_Class)o) !is null &&
+                this.info.name == c.info.name);
+    }
+
+    /// Hash of the object referenced at p; 0 for a null reference.
+    override hash_t getHash(in void* p)
+    {
+        Object o = *cast(Object*)p;
+        return o ? o.toHash() : 0;
+    }
+
+    /// True when both references are identical, or the first is non-null
+    /// and its opEquals accepts the second.
+    override equals_t equals(in void* p1, in void* p2)
+    {
+        Object o1 = *cast(Object*)p1;
+        Object o2 = *cast(Object*)p2;
+
+        return (o1 is o2) || (o1 && o1.opEquals(o2));
+    }
+
+    /// Three-way comparison of the referenced objects via opCmp.
+    override int compare(in void* p1, in void* p2)
+    {
+        Object o1 = *cast(Object*)p1;
+        Object o2 = *cast(Object*)p2;
+        int c = 0;
+
+        // Regard null references as always being "less than"
+        if (o1 !is o2)
+        {
+            if (o1)
+            {
+                if (!o2)
+                    c = 1;
+                else
+                    c = o1.opCmp(o2);
+            }
+            else
+                c = -1;
+        }
+        return c;
+    }
+
+    /// Size of a class reference.
+    override size_t tsize()
+    {
+        return Object.sizeof;
+    }
+
+    override uint flags() { return 1; }
+
+    /// Offset/type table of the described class; null unless bit 4 of
+    /// info.flags is set.
+    override OffsetTypeInfo[] offTi()
+    {
+        return (info.flags & 4) ?
info.offTi : null;
+    }
+
+    ClassInfo info;
+}
+
+/**
+ * Run-time type information for an interface type; `info` is the ClassInfo
+ * of the interface being described.
+ */
+class TypeInfo_Interface : TypeInfo
+{
+    /// Name of the described interface.
+    override string toString() { return info.name; }
+
+    /// Equal when o is the same object, or a TypeInfo_Interface describing
+    /// an interface with the same name.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Interface c;
+        // Compare against c.info (the interface c describes).  c.classinfo is
+        // the ClassInfo of the TypeInfo_Interface object itself, so its name
+        // could never match the described interface's name.
+        return this is o ||
+               ((c = cast(TypeInfo_Interface)o) !is null &&
+                this.info.name == c.info.name);
+    }
+
+    /// Hash of the object behind the interface reference stored at p.
+    override hash_t getHash(in void* p)
+    {
+        // Read the Interface descriptor through the interface reference and
+        // subtract its offset to recover the Object reference.
+        Interface* pi = **cast(Interface ***)*cast(void**)p;
+        Object o = cast(Object)(*cast(void**)p - pi.offset);
+        assert(o);
+        return o.toHash();
+    }
+
+    /// True when the two underlying objects are equal, or the first is
+    /// non-null and its opCmp against the second returns 0.
+    override equals_t equals(in void* p1, in void* p2)
+    {
+        Interface* pi = **cast(Interface ***)*cast(void**)p1;
+        Object o1 = cast(Object)(*cast(void**)p1 - pi.offset);
+        pi = **cast(Interface ***)*cast(void**)p2;
+        Object o2 = cast(Object)(*cast(void**)p2 - pi.offset);
+
+        return o1 == o2 || (o1 && o1.opCmp(o2) == 0);
+    }
+
+    /// Three-way comparison of the two underlying objects via opCmp.
+    override int compare(in void* p1, in void* p2)
+    {
+        Interface* pi = **cast(Interface ***)*cast(void**)p1;
+        Object o1 = cast(Object)(*cast(void**)p1 - pi.offset);
+        pi = **cast(Interface ***)*cast(void**)p2;
+        Object o2 = cast(Object)(*cast(void**)p2 - pi.offset);
+        int c = 0;
+
+        // Regard null references as always being "less than"
+        if (o1 != o2)
+        {
+            if (o1)
+            {
+                if (!o2)
+                    c = 1;
+                else
+                    c = o1.opCmp(o2);
+            }
+            else
+                c = -1;
+        }
+        return c;
+    }
+
+    /// Size of an interface reference (same as an Object reference).
+    override size_t tsize()
+    {
+        return Object.sizeof;
+    }
+
+    override uint flags() { return 1; }
+
+    ClassInfo info;
+}
+
+/**
+ * Run-time type information for a struct type.  Optional function pointers
+ * (xtoHash, xopEquals, ...) are used when set; otherwise the raw bytes of
+ * the value are used.
+ */
+class TypeInfo_Struct : TypeInfo
+{
+    override string toString() { return name; }
+
+    /// Equal when o is the same object, or a TypeInfo_Struct with the same
+    /// name and the same initializer length.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Struct s;
+        return this is o ||
+               ((s = cast(TypeInfo_Struct)o) !is null &&
+                this.name == s.name &&
+                this.init.length == s.init.length);
+    }
+
+    /// Hash of the struct at p: xtoHash when set, otherwise a byte-wise
+    /// hash over init.length bytes.
+    override hash_t getHash(in void* p)
+    {
+        assert(p);
+        if (xtoHash)
+        {
+            debug(PRINTF) printf("getHash() using xtoHash\n");
+            return (*xtoHash)(p);
+        }
+        else
+        {
+            hash_t h;
+            debug(PRINTF) printf("getHash() using default hash\n");
+            // A sorry hash algorithm.
+            // Should use the one for strings.
+            // BUG: relies on the GC not moving objects
+            // Folds init.length raw bytes of the value into h.
+            auto q = cast(const(ubyte)*)p;
+            for (size_t i = 0; i < init.length; i++)
+            {
+                h = h * 9 + *q;
+                q++;
+            }
+            return h;
+        }
+    }
+
+    /// Equality of two struct values: identical or both-null pointers are
+    /// equal, a single null is unequal, then xopEquals when set, otherwise
+    /// a bitwise comparison of init.length bytes.
+    override equals_t equals(in void* p1, in void* p2)
+    {
+        if (p1 == p2)
+            return true;
+        else if (!p1 || !p2)
+            return false;
+        else if (xopEquals)
+            return (*xopEquals)(p1, p2);
+        else
+            // BUG: relies on the GC not moving objects
+            return memcmp(p1, p2, init.length) == 0;
+    }
+
+    /// Three-way comparison of two struct values: xopCmp when set,
+    /// otherwise memcmp over init.length bytes.
+    override int compare(in void* p1, in void* p2)
+    {
+        // Regard null references as always being "less than"
+        if (p1 != p2)
+        {
+            if (p1)
+            {
+                if (!p2)
+                    return true;    // converts to 1: non-null sorts after null
+                else if (xopCmp)
+                    // NOTE(review): the arguments are passed as (p2, p1);
+                    // presumably this matches the hook's calling
+                    // convention -- confirm before changing.
+                    return (*xopCmp)(p2, p1);
+                else
+                    // BUG: relies on the GC not moving objects
+                    return memcmp(p1, p2, init.length);
+            }
+            else
+                return -1;
+        }
+        return 0;
+    }
+
+    /// Size of the struct, taken from the length of its initializer.
+    override size_t tsize()
+    {
+        return init.length;
+    }
+
+    override void[] init() { return m_init; }
+
+    override uint flags() { return m_flags; }
+
+    /// Calls the destructor hook on the value at p when one is set.
+    override void destroy(void* p)
+    {
+        if (xdtor)
+            (*xdtor)(p);
+    }
+
+    /// Calls the postblit hook on the value at p when one is set.
+    override void postblit(void* p)
+    {
+        if (xpostblit)
+            (*xpostblit)(p);
+    }
+
+    string name;
+    void[] m_init;      // initializer; init.ptr == null if 0 initialize
+
+    // Optional per-type hooks; a null pointer selects the default behaviour
+    // in the methods above.
+    hash_t   function(in void*)           xtoHash;
+    equals_t function(in void*, in void*) xopEquals;
+    int      function(in void*, in void*) xopCmp;
+    char[]   function(in void*)           xtoString;
+
+    uint m_flags;
+
+    const(MemberInfo[]) function(in char[]) xgetMembers;
+    void function(void*)                    xdtor;
+    void function(void*)                    xpostblit;
+}
+
+/// Run-time type information for a type tuple.
+class TypeInfo_Tuple : TypeInfo
+{
+    TypeInfo[] elements;
+
+    /// Renders as "(<e0>,<e1>,...)".
+    override string toString()
+    {
+        string s = "(";
+        foreach (i, element; elements)
+        {
+            if (i)
+                s ~= ',';
+            s ~= element.toString();
+        }
+        s ~= ")";
+        return s;
+    }
+
+    /// Equal when o is the same object, or a TypeInfo_Tuple with the same
+    /// number of elements and pairwise-equal element TypeInfos.
+    override equals_t opEquals(Object o)
+    {
+        if (this is o)
+            return true;
+
+        auto t = cast(TypeInfo_Tuple)o;
+        if (t && elements.length == t.elements.length)
+        {
+            for (size_t i = 0; i < elements.length; i++)
+            {
+                if
(elements[i] != t.elements[i]) + return false; + } + return true; + } + return false; + } + + override hash_t getHash(in void* p) + { + assert(0); + } + + override equals_t equals(in void* p1, in void* p2) + { + assert(0); + } + + override int compare(in void* p1, in void* p2) + { + assert(0); + } + + override size_t tsize() + { + assert(0); + } + + override void swap(void* p1, void* p2) + { + assert(0); + } + + override void destroy(void* p) + { + assert(0); + } + + override void postblit(void* p) + { + assert(0); + } +} + +class TypeInfo_Const : TypeInfo +{ + override string toString() + { + return cast(string) ("const(" ~ base.toString() ~ ")"); + } + + override equals_t opEquals(Object o) { return base.opEquals(o); } + override hash_t getHash(in void *p) { return base.getHash(p); } + override equals_t equals(in void *p1, in void *p2) { return base.equals(p1, p2); } + override int compare(in void *p1, in void *p2) { return base.compare(p1, p2); } + override size_t tsize() { return base.tsize(); } + override void swap(void *p1, void *p2) { return base.swap(p1, p2); } + + override TypeInfo next() { return base.next(); } + override uint flags() { return base.flags(); } + override void[] init() { return base.init(); } + + TypeInfo base; +} + +class TypeInfo_Invariant : TypeInfo_Const +{ + override string toString() + { + return cast(string) ("immutable(" ~ base.toString() ~ ")"); + } +} + +class TypeInfo_Shared : TypeInfo_Const +{ + override string toString() + { + return cast(string) ("shared(" ~ base.toString() ~ ")"); + } +} + +abstract class MemberInfo +{ + string name(); +} + +class MemberInfo_field : MemberInfo +{ + this(string name, TypeInfo ti, size_t offset) + { + m_name = name; + m_typeinfo = ti; + m_offset = offset; + } + + override string name() { return m_name; } + TypeInfo typeInfo() { return m_typeinfo; } + size_t offset() { return m_offset; } + + string m_name; + TypeInfo m_typeinfo; + size_t m_offset; +} + +class MemberInfo_function : MemberInfo +{ 
+    this(string name, TypeInfo ti, void* fp, uint flags)
+    {
+        m_name = name;
+        m_typeinfo = ti;
+        m_fp = fp;
+        m_flags = flags;
+    }
+
+    override string name() { return m_name; }
+    TypeInfo typeInfo() { return m_typeinfo; }
+    void* fp() { return m_fp; }
+    uint flags() { return m_flags; }
+
+    string   m_name;
+    TypeInfo m_typeinfo;
+    void*    m_fp;
+    uint     m_flags;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Throwable
+///////////////////////////////////////////////////////////////////////////////
+
+
+/**
+ * Base class of everything that can be thrown.  Carries a message, an
+ * optional source location, an optional trace captured at construction,
+ * and an optional chained Throwable.
+ */
+class Throwable : Object
+{
+    /// Iterable trace produced by the installed trace handler.
+    interface TraceInfo
+    {
+        int opApply(int delegate(inout char[]));
+    }
+
+    string      msg;
+    string      file;
+    size_t      line;
+    TraceInfo   info;
+    Throwable   next;
+
+    /// Stores the message and chained Throwable and captures the trace.
+    this(string msg, Throwable next = null)
+    {
+        this.msg = msg;
+        this.next = next;
+        this.info = traceContext();
+    }
+
+    /// As above, additionally recording the source file and line.
+    this(string msg, string file, size_t line, Throwable next = null)
+    {
+        // The delegated constructor already captures the trace; calling
+        // traceContext() again here would run the handler twice and throw
+        // the first result away.
+        this(msg, next);
+        this.file = file;
+        this.line = line;
+    }
+
+    /// Formats this Throwable and every chained one as
+    /// "Class@file(line): msg" (or "Class: msg" without a file), each
+    /// followed by its trace lines when a trace is present.
+    override string toString()
+    {
+        char[10] tmp = void;
+        char[]   buf;
+
+        for (Throwable e = this; e !is null; e = e.next)
+        {
+            if (e.file)
+            {
+               buf ~= e.classinfo.name ~ "@" ~ e.file ~ "(" ~ tmp.intToString(e.line) ~ "): " ~ e.msg;
+            }
+            else
+            {
+               buf ~= e.classinfo.name ~ ": " ~ e.msg;
+            }
+            if (e.info)
+            {
+                buf ~= "\n----------------";
+                foreach (t; e.info)
+                    buf ~= "\n" ~ t;
+            }
+            if (e.next)
+                buf ~= "\n";
+        }
+        return cast(string) buf;
+    }
+}
+
+
+alias Throwable.TraceInfo function(void* ptr = null) TraceHandler;
+private __gshared TraceHandler traceHandler = null;
+
+
+/**
+ * Overrides the default trace handler with a user-supplied version.
+ *
+ * Params:
+ *  h = The new trace handler.  Set to null to use the default handler.
+ */
+extern (C) void rt_setTraceHandler(TraceHandler h)
+{
+    traceHandler = h;
+}
+
+
+/**
+ * This function will be called when an exception is constructed.
The + * user-supplied trace handler will be called if one has been supplied, + * otherwise no trace will be generated. + * + * Params: + * ptr = A pointer to the location from which to generate the trace, or null + * if the trace should be generated from within the trace handler + * itself. + * + * Returns: + * An object describing the current calling context or null if no handler is + * supplied. + */ +Throwable.TraceInfo traceContext(void* ptr = null) +{ + if (traceHandler is null) + return null; + return traceHandler(ptr); +} + + +class Exception : Throwable +{ + this(string msg, Throwable next = null) + { + super(msg, next); + } + + this(string msg, string file, size_t line, Throwable next = null) + { + super(msg, file, line, next); + } +} + + +class Error : Throwable +{ + this(string msg, Throwable next = null) + { + super(msg, next); + } + + this(string msg, string file, size_t line, Throwable next = null) + { + super(msg, file, line, next); + } +} + + +/////////////////////////////////////////////////////////////////////////////// +// ModuleInfo +/////////////////////////////////////////////////////////////////////////////// + + +enum +{ + MIctorstart = 1, // we've started constructing it + MIctordone = 2, // finished construction + MIstandalone = 4, // module ctor does not depend on other module + // ctors being done first + MIhasictor = 8, // has ictor member +} + + +class ModuleInfo +{ + string name; + ModuleInfo[] importedModules; + ClassInfo[] localClasses; + uint flags; + + void function() ctor; // module static constructor (order dependent) + void function() dtor; // module static destructor + void function() unitTest; // module unit tests + + void* xgetMembers; // module getMembers() function + + void function() ictor; // module static constructor (order independent) + + void*[4] reserved; // for future expansion + + static int opApply(int delegate(inout ModuleInfo) dg) + { + int ret = 0; + + foreach (m; _moduleinfo_array) + { + ret = dg(m); + if 
(ret) + break; + } + return ret; + } +} + + +// Windows: this gets initialized by minit.asm +// Posix: this gets initialized in _moduleCtor() +extern (C) __gshared ModuleInfo[] _moduleinfo_array; + + +version (linux) +{ + // This linked list is created by a compiler generated function inserted + // into the .ctor list by the compiler. + struct ModuleReference + { + ModuleReference* next; + ModuleInfo mod; + } + + extern (C) __gshared ModuleReference* _Dmodule_ref; // start of linked list +} + +version (FreeBSD) +{ + // This linked list is created by a compiler generated function inserted + // into the .ctor list by the compiler. + struct ModuleReference + { + ModuleReference* next; + ModuleInfo mod; + } + + extern (C) __gshared ModuleReference* _Dmodule_ref; // start of linked list +} + +version (Solaris) +{ + // This linked list is created by a compiler generated function inserted + // into the .ctor list by the compiler. + struct ModuleReference + { + ModuleReference* next; + ModuleInfo mod; + } + + extern (C) __gshared ModuleReference* _Dmodule_ref; // start of linked list +} + +version (OSX) +{ + extern (C) + { + extern __gshared void* _minfo_beg; + extern __gshared void* _minfo_end; + } +} + +__gshared ModuleInfo[] _moduleinfo_dtors; +__gshared uint _moduleinfo_dtors_i; + +// Register termination function pointers +extern (C) int _fatexit(void*); + +/** + * Initialize the modules. 
+ */
+
+// Builds _moduleinfo_array for the current platform, allocates the
+// destructor list, then runs the order-independent constructors followed
+// by the order-dependent ones.
+extern (C) void _moduleCtor()
+{
+    debug(PRINTF) printf("_moduleCtor()\n");
+    version (linux)
+    {
+        // Two passes over the _Dmodule_ref linked list: count, then copy.
+        int len = 0;
+        ModuleReference *mr;
+
+        for (mr = _Dmodule_ref; mr; mr = mr.next)
+            len++;
+        _moduleinfo_array = new ModuleInfo[len];
+        len = 0;
+        for (mr = _Dmodule_ref; mr; mr = mr.next)
+        {   _moduleinfo_array[len] = mr.mod;
+            len++;
+        }
+    }
+
+    version (FreeBSD)
+    {
+        // Same two-pass copy as the linux branch.
+        int len = 0;
+        ModuleReference *mr;
+
+        for (mr = _Dmodule_ref; mr; mr = mr.next)
+            len++;
+        _moduleinfo_array = new ModuleInfo[len];
+        len = 0;
+        for (mr = _Dmodule_ref; mr; mr = mr.next)
+        {   _moduleinfo_array[len] = mr.mod;
+            len++;
+        }
+    }
+
+    version (Solaris)
+    {
+        // Same two-pass copy as the linux branch.
+        int len = 0;
+        ModuleReference *mr;
+
+        for (mr = _Dmodule_ref; mr; mr = mr.next)
+            len++;
+        _moduleinfo_array = new ModuleInfo[len];
+        len = 0;
+        for (mr = _Dmodule_ref; mr; mr = mr.next)
+        {   _moduleinfo_array[len] = mr.mod;
+            len++;
+        }
+    }
+
+    version (OSX)
+    {
+        /* The ModuleInfo references are stored in the special segment
+         * __minfodata, which is bracketed by the segments __minfo_beg
+         * and __minfo_end. The variables _minfo_beg and _minfo_end
+         * are of zero size and are in the two bracketing segments,
+         * respectively.
+         */
+        size_t length = cast(ModuleInfo*)&_minfo_end - cast(ModuleInfo*)&_minfo_beg;
+        _moduleinfo_array = (cast(ModuleInfo*)&_minfo_beg)[0 .. length];
+        debug printf("moduleinfo: ptr = %p, length = %d\n", _moduleinfo_array.ptr, _moduleinfo_array.length);
+
+        debug foreach (m; _moduleinfo_array)
+        {
+            //printf("\t%p\n", m);
+            printf("\t%.*s\n", m.name);
+        }
+    }
+
+    version (Windows)
+    {
+        // Ensure module destructors also get called on program termination
+        //_fatexit(&_STD_moduleDtor);
+    }
+
+    // One destructor slot per module; _moduleCtor2 fills it in construction
+    // order as modules finish constructing.
+    _moduleinfo_dtors = new ModuleInfo[_moduleinfo_array.length];
+    debug(PRINTF) printf("_moduleinfo_dtors = x%x\n", cast(void*)_moduleinfo_dtors);
+    _moduleIndependentCtors();
+    _moduleCtor2(_moduleinfo_array, 0);
+}
+
+// Runs the ictor of every non-null module that has the MIhasictor flag and
+// a non-null ictor pointer.
+extern (C) void _moduleIndependentCtors()
+{
+    debug(PRINTF) printf("_moduleIndependentCtors()\n");
+    foreach (m; _moduleinfo_array)
+    {
+        if (m && m.flags & MIhasictor && m.ictor)
+        {
+            (*m.ictor)();
+        }
+    }
+}
+
+// Recursively runs the static constructors of the modules in mi, visiting
+// each module's imports before its own ctor.  A module met again while its
+// construction is still in progress (MIctorstart) raises a cyclic-dependency
+// Exception unless `skip` is non-zero or the module is MIstandalone.
+void _moduleCtor2(ModuleInfo[] mi, int skip)
+{
+    debug(PRINTF) printf("_moduleCtor2(): %d modules\n", mi.length);
+    for (uint i = 0; i < mi.length; i++)
+    {
+        ModuleInfo m = mi[i];
+
+        debug(PRINTF) printf("\tmodule[%d] = '%p'\n", i, m);
+        if (!m)
+            continue;
+        debug(PRINTF) printf("\tmodule[%d] = '%.*s'\n", i, m.name);
+        if (m.flags & MIctordone)
+            continue;
+        debug(PRINTF) printf("\tmodule[%d] = '%.*s', m = x%x\n", i, m.name, m);
+
+        if (m.ctor || m.dtor)
+        {
+            if (m.flags & MIctorstart)
+            {   if (skip || m.flags & MIstandalone)
+                    continue;
+                    throw new Exception("Cyclic dependency in module " ~ m.name);
+            }
+
+            m.flags |= MIctorstart;
+            _moduleCtor2(m.importedModules, 0);
+            if (m.ctor)
+                (*m.ctor)();
+            m.flags &= ~MIctorstart;
+            m.flags |= MIctordone;
+
+            // Now that construction is done, register the destructor
+            //printf("\tadding module dtor x%x\n", m);
+            assert(_moduleinfo_dtors_i < _moduleinfo_dtors.length);
+            _moduleinfo_dtors[_moduleinfo_dtors_i++] = m;
+        }
+        else
+        {
+            // No ctor/dtor of its own: mark done first, then walk its
+            // imports with skip set so in-progress modules are passed over.
+            m.flags |= MIctordone;
+            _moduleCtor2(m.importedModules, 1);
+        }
+    }
+}
+
+/**
+ * Destruct the modules.
+ */ + +// Starting the name with "_STD" means under Posix a pointer to the +// function gets put in the .dtors segment. + +extern (C) void _moduleDtor() +{ + debug(PRINTF) printf("_moduleDtor(): %d modules\n", _moduleinfo_dtors_i); + + for (uint i = _moduleinfo_dtors_i; i-- != 0;) + { + ModuleInfo m = _moduleinfo_dtors[i]; + + debug(PRINTF) printf("\tmodule[%d] = '%.*s', x%x\n", i, m.name, m); + if (m.dtor) + { + (*m.dtor)(); + } + } + debug(PRINTF) printf("_moduleDtor() done\n"); +} + +/////////////////////////////////////////////////////////////////////////////// +// Monitor +/////////////////////////////////////////////////////////////////////////////// + +alias Object.Monitor IMonitor; +alias void delegate(Object) DEvent; + +// NOTE: The dtor callback feature is only supported for monitors that are not +// supplied by the user. The assumption is that any object with a user- +// supplied monitor may have special storage or lifetime requirements and +// that as a result, storing references to local objects within Monitor +// may not be safe or desirable. Thus, devt is only valid if impl is +// null. 
+struct Monitor
+{
+    IMonitor impl;      // user-supplied monitor, or null for the built-in one
+    /* internal */
+    DEvent[] devt;      // dispose-event callbacks; only used when impl is null
+    /* stuff */
+}
+
+// Reads the monitor pointer from the second pointer-sized slot of the object.
+Monitor* getMonitor(Object h)
+{
+    return cast(Monitor*) (cast(void**) h)[1];
+}
+
+// Stores m in the second pointer-sized slot of the object.
+void setMonitor(Object h, Monitor* m)
+{
+    (cast(void**) h)[1] = m;
+}
+
+extern (C) void _d_monitor_create(Object);
+extern (C) void _d_monitor_destroy(Object);
+extern (C) void _d_monitor_lock(Object);
+extern (C) int  _d_monitor_unlock(Object);
+
+// Detaches the monitor of h, if any.  For a built-in monitor (impl is null)
+// the dispose events are fired and the monitor destroyed; for a
+// user-supplied one the monitor object is deleted only when det is set and
+// it is not h itself.
+extern (C) void _d_monitordelete(Object h, bool det)
+{
+    Monitor* m = getMonitor(h);
+
+    if (m !is null)
+    {
+        IMonitor i = m.impl;
+        if (i is null)
+        {
+            _d_monitor_devt(m, h);
+            _d_monitor_destroy(h);
+            setMonitor(h, null);
+            return;
+        }
+        if (det && (cast(void*) i) !is (cast(void*) h))
+            delete i;
+        setMonitor(h, null);
+    }
+}
+
+// Locks the monitor of h, creating the built-in monitor first when the
+// object has none.
+extern (C) void _d_monitorenter(Object h)
+{
+    Monitor* m = getMonitor(h);
+
+    if (m is null)
+    {
+        _d_monitor_create(h);
+        m = getMonitor(h);
+    }
+
+    IMonitor i = m.impl;
+
+    if (i is null)
+    {
+        _d_monitor_lock(h);
+        return;
+    }
+    i.lock();
+}
+
+// Unlocks the monitor of h.
+// NOTE(review): m is dereferenced without a null check -- presumably callers
+// always pair this with _d_monitorenter; confirm.
+extern (C) void _d_monitorexit(Object h)
+{
+    Monitor* m = getMonitor(h);
+    IMonitor i = m.impl;
+
+    if (i is null)
+    {
+        _d_monitor_unlock(h);
+        return;
+    }
+    i.unlock();
+}
+
+// Fires every registered dispose event for h, then frees the event array.
+extern (C) void _d_monitor_devt(Monitor* m, Object h)
+{
+    if (m.devt.length)
+    {
+        DEvent[] devt;
+
+        // Take the list and clear it under the object's lock; the callbacks
+        // themselves run after the lock is released.
+        synchronized (h)
+        {
+            devt = m.devt;
+            m.devt = null;
+        }
+        foreach (v; devt)
+        {
+            if (v)
+                v(h);
+        }
+        free(devt.ptr);
+    }
+}
+
+// Registers e to be called when h is disposed.  Reuses a free (null) slot or
+// an existing identical entry; otherwise grows the array by four slots.
+extern (C) void rt_attachDisposeEvent(Object h, DEvent e)
+{
+    synchronized (h)
+    {
+        Monitor* m = getMonitor(h);
+        assert(m.impl is null);
+
+        foreach (inout v; m.devt)
+        {
+            if (v is null || v == e)
+            {
+                v = e;
+                return;
+            }
+        }
+
+        auto len = m.devt.length + 4; // grow by 4 elements
+        auto pos = m.devt.length;     // insert position
+        auto p = realloc(m.devt.ptr, DEvent.sizeof * len);
+        if (!p)
+            onOutOfMemoryError();
+        m.devt = (cast(DEvent*)p)[0 .. len];
+        m.devt[pos+1 ..
len] = null; + m.devt[pos] = e; + } +} + +extern (C) void rt_detachDisposeEvent(Object h, DEvent e) +{ + synchronized (h) + { + Monitor* m = getMonitor(h); + assert(m.impl is null); + + foreach (p, v; m.devt) + { + if (v == e) + { + memmove(&m.devt[p], + &m.devt[p+1], + (m.devt.length - p - 1) * DEvent.sizeof); + m.devt[$ - 1] = null; + return; + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/posix.mak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/posix.mak Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,82 @@ +# Makefile to build the compiler runtime D library for Linux +# Designed to work with GNU make +# Targets: +# make +# Same as make all +# make debug +# Build the debug version of the library +# make release +# Build the release version of the library +# make doc +# Generate documentation +# make clean +# Delete all files created by build process + +# Essentials + +LIBDIR=../../../lib +DOCDIR=../../../doc +IMPDIR=../../../import +LIBBASENAME=libdruntime-rt-dmd.a +MODULES= +BUILDS=debug release unittest + +MODULES_BASE=aaA aApply aApplyR adi alloca arrayassign arraybyte \ + arraycast arraycat arraydouble arrayfloat arrayint arrayreal \ + arrayshort cast_ cmath2 cover deh2 dmain2 invariant invariant_ \ + lifetime llmath memory memset obj object_ qsort switch_ trace +# NOTE: trace.o and cover.o are not necessary for a successful build +# as both are used for debugging features (profiling and coverage) +# NOTE: a pre-compiled minit.obj has been provided in dmd for Win32 and +# minit.asm is not used by dmd for Linux +# NOTE: deh.o is only needed for Win32, Linux uses deh2.o +MODULES_UTIL=$(addprefix util/,console cpuid ctype string utf) +MODULES_TI=$(addprefix typeinfo/ti_,AC Acdouble Acfloat Acreal Adouble \ + Afloat Ag Aint Along Areal Ashort byte C cdouble cfloat char creal \ + dchar delegate double float idouble ifloat int ireal long ptr real \ + short ubyte uint ulong ushort void wchar) +C_SRCS=complex.c 
critical.c memory_osx.c monitor.c + +# Symbols + +DMD=dmd +DOCFLAGS=-version=DDoc +DFLAGS_release=-d -release -O -inline -w -nofloat +DFLAGS_debug=-d -g -w -nofloat +DFLAGS_unittest=$(DFLAGS_release) -unittest +CFLAGS_release=-m32 -O +CFLAGS_debug=-m32 -g +CFLAGS_unittest=$(CFLAGS_release) + +# Derived symbols + +C_OBJS=$(addsuffix .o,$(basename $(C_SRCS))) +AS_OBJS=$(addsuffix .o,$(basename $(AS_SRCS))) +ALL_MODULES=$(MODULES_BASE) $(MODULES_UTIL) $(MODULES_TI) +D_SRCS=$(addsuffix .d,$(ALL_MODULES)) +ALLLIBS=$(addsuffix /$(LIBBASENAME),$(addprefix $(LIBDIR)/,$(BUILDS))) + +# Patterns + +$(LIBDIR)/%/$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_SRCS) + $(CC) -c $(CFLAGS_$*) $(C_SRCS) + $(DMD) $(DFLAGS_$*) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) + rm $(C_OBJS) $(AS_OBJS) + +$(DOCDIR)/%.html : %.d + $(DMD) -c -d -o- -Df$@ $< + +$(IMPDIR)/%.di : %.d + $(DMD) -c -d -o- -Hf$@ $< + +# Rulez + +all : $(BUILDS) doc + +debug : $(LIBDIR)/debug/$(LIBBASENAME) $(IMPORTS) +release : $(LIBDIR)/release/$(LIBBASENAME) $(IMPORTS) +unittest : $(LIBDIR)/unittest/$(LIBBASENAME) $(IMPORTS) +#doc : $(DOCS) + +clean : + rm -f $(IMPORTS) $(DOCS) $(ALLLIBS) diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/qsort.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/qsort.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,158 @@ +/* + Portions of this file are: + Copyright Prototronics, 1987 + Totem Lake P.O. 8117 + Kirkland, Washington 98034 + (206) 820-1972 + Licensed to Digital Mars. + + June 11, 1987 from Ray Gardner's + Denver, Colorado) public domain version + + Use qsort2.d instead of this file if a redistributable version of + _adSort() is required. 
+*/ + +module rt.qsort; + +/* +** Sorts an array starting at base, of length nbr_elements, each +** element of size width_bytes, ordered via compare_function; which +** is called as (*comp_fp)(ptr_to_element1, ptr_to_element2) +** and returns < 0 if element1 < element2, 0 if element1 = element2, +** > 0 if element1 > element2. Most of the refinements are due to +** R. Sedgewick. See "Implementing Quicksort Programs", Comm. ACM, +** Oct. 1978, and Corrigendum, Comm. ACM, June 1979. +*/ + +//debug=qsort; // uncomment to turn on debugging printf's + + +struct Array +{ + size_t length; + void* ptr; +} + + +private const int _maxspan = 7; // subarrays of _maxspan or fewer elements + // will be sorted by a simple insertion sort + +/* Adjust _maxspan according to relative cost of a swap and a compare. Reduce +_maxspan (not less than 1) if a swap is very expensive such as when you have +an array of large structures to be sorted, rather than an array of pointers to +structures. The default value is optimized for a high cost for compares. 
*/
+
+
+// Sorts the elements of `a` in place using ti.compare / ti.swap:
+// quicksort for partitions larger than _maxspan elements, insertion sort
+// for the rest, with an explicit stack of pending partitions.
+// Returns the bits of `a` reinterpreted as a long.
+// NOTE(review): width and thresh are uint while ti.tsize() is declared as
+// size_t, and pointer differences are cast to uint -- looks like this
+// assumes a 32-bit target; confirm.
+extern (C) long _adSort(Array a, TypeInfo ti)
+{
+    byte*   base;
+    byte*[40] stack;            // stack
+    byte**  sp;                 // stack pointer
+    byte*   i, j, limit;        // scan and limit pointers
+    uint    thresh;             // size of _maxspan elements in bytes
+    uint    width = ti.tsize();
+
+    base = cast(byte *)a.ptr;
+    thresh = _maxspan * width;              // init threshold
+    sp = stack.ptr;                         // init stack pointer
+    limit = base + a.length * width;        // pointer past end of array
+    while (1)                               // repeat until done then return
+    {
+        while (limit - base > thresh)       // if more than _maxspan elements
+        {
+            //swap middle, base
+            ti.swap((cast(uint)(limit - base) >> 1) -
+                   (((cast(uint)(limit - base) >> 1)) % width) + base, base);
+
+            i = base + width;               // i scans from left to right
+            j = limit - width;              // j scans from right to left
+
+            if (ti.compare(i, j) > 0)       // Sedgewick's
+                ti.swap(i, j);              //    three-element sort
+            if (ti.compare(base, j) > 0)    // sets things up
+                ti.swap(base, j);           // so that
+            if (ti.compare(i, base) > 0)    // *i <= *base <= *j
+                ti.swap(i, base);           // *base is the pivot element
+
+            while (1)
+            {
+                do                          // move i right until *i >= pivot
+                    i += width;
+                while (ti.compare(i, base) < 0);
+                do                          // move j left until *j <= pivot
+                    j -= width;
+                while (ti.compare(j, base) > 0);
+                if (i > j)                  // break loop if pointers crossed
+                    break;
+                ti.swap(i, j);              // else swap elements, keep scanning
+            }
+            ti.swap(base, j);               // move pivot into correct place
+            if (j - base > limit - i)       // if left subarray is larger...
+            {
+                sp[0] = base;               // stack left subarray base
+                sp[1] = j;                  //    and limit
+                base = i;                   // sort the right subarray
+            }
+            else                            // else right subarray is larger
+            {
+                sp[0] = i;                  // stack right subarray base
+                sp[1] = limit;              //    and limit
+                limit = j;                  // sort the left subarray
+            }
+            sp += 2;                        // increment stack pointer
+            assert(sp < cast(byte**)stack + stack.length);
+        }
+
+        // Insertion sort on remaining subarray
+        i = base + width;
+        while (i < limit)
+        {
+            j = i;
+            while (j > base && ti.compare(j - width, j) > 0)
+            {
+                ti.swap(j - width, j);
+                j -= width;
+            }
+            i += width;
+        }
+
+        if (sp > stack.ptr)                 // if any entries on stack...
+        {
+            sp -= 2;                        // pop the base and limit
+            base = sp[0];
+            limit = sp[1];
+        }
+        else                                // else stack empty, all done
+            return *cast(long*)(&a);
+    }
+    assert(0);
+}
+
+
+// Sorts a ten-element int array containing duplicates and a negative value
+// and checks that the result is non-decreasing.
+unittest
+{
+    debug(qsort) printf("array.sort.unittest()\n");
+
+    int a[] = new int[10];
+
+    a[0] = 23;
+    a[1] = 1;
+    a[2] = 64;
+    a[3] = 5;
+    a[4] = 6;
+    a[5] = 5;
+    a[6] = 17;
+    a[7] = 3;
+    a[8] = 0;
+    a[9] = -1;
+
+    a.sort;
+
+    for (int i = 0; i < a.length - 1; i++)
+    {
+        //printf("i = %d", i);
+        //printf(" %d %d\n", a[i], a[i + 1]);
+        assert(a[i] <= a[i + 1]);
+    }
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/qsort2.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/dmd/qsort2.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,71 @@
+/**
+ * This is a public domain version of qsort.d.  All it does is call C's
+ * qsort(), but runs a little slower since it needs to synchronize a global
+ * variable.
+ *
+ * Copyright: Copyright Digital Mars 2000 - 2009.
+ * License: = 0); + assert(ca.length >= 0); + + // Make sure table[] is sorted correctly + int j; + + for (j = 1; j < table.length; j++) + { + int len1 = table[j - 1].length; + int len2 = table[j].length; + + assert(len1 <= len2); + if (len1 == len2) + { + int ci; + + ci = memcmp(table[j - 1].ptr, table[j].ptr, len1); + assert(ci < 0); // ci==0 means a duplicate + } + } +} +out (result) +{ + int i; + int cj; + + //printf("out _d_switch_string()\n"); + if (result == -1) + { + // Not found + for (i = 0; i < table.length; i++) + { + if (table[i].length == ca.length) + { cj = memcmp(table[i].ptr, ca.ptr, ca.length); + assert(cj != 0); + } + } + } + else + { + assert(0 <= result && result < table.length); + for (i = 0; 1; i++) + { + assert(i < table.length); + if (table[i].length == ca.length) + { + cj = memcmp(table[i].ptr, ca.ptr, ca.length); + if (cj == 0) + { + assert(i == result); + break; + } + } + } + } +} +body +{ + //printf("body _d_switch_string(%.*s)\n", ca); + int low; + int high; + int mid; + int c; + char[] pca; + + low = 0; + high = table.length; + + version (none) + { + // Print table + printf("ca[] = '%s'\n", cast(char *)ca); + for (mid = 0; mid < high; mid++) + { + pca = table[mid]; + printf("table[%d] = %d, '%.*s'\n", mid, pca.length, pca); + } + } + if (high && + ca.length >= table[0].length && + ca.length <= table[high - 1].length) + { + // Looking for 0 length string, which would only be at the beginning + if (ca.length == 0) + return 0; + + char c1 = ca[0]; + + // Do binary search + while (low < high) + { + mid = (low + high) >> 1; + pca = table[mid]; + c = ca.length - pca.length; + if (c == 0) + { + c = cast(ubyte)c1 - cast(ubyte)pca[0]; + if (c == 0) + { + c = memcmp(ca.ptr, pca.ptr, ca.length); + if (c == 0) + { //printf("found %d\n", mid); + return mid; + } + } + } + if (c < 0) + { + high = mid; + } + else + { + low = mid + 1; + } + } + } + + //printf("not found\n"); + return -1; // not found +} + +unittest +{ + switch (cast(char []) "c") + { 
+ case "coo": + default: + break; + } +} + +/********************************** + * Same thing, but for wide chars. + */ + +int _d_switch_ustring(wchar[][] table, wchar[] ca) +in +{ + //printf("in _d_switch_ustring()\n"); + assert(table.length >= 0); + assert(ca.length >= 0); + + // Make sure table[] is sorted correctly + int j; + + for (j = 1; j < table.length; j++) + { + int len1 = table[j - 1].length; + int len2 = table[j].length; + + assert(len1 <= len2); + if (len1 == len2) + { + int c; + + c = memcmp(table[j - 1].ptr, table[j].ptr, len1 * wchar.sizeof); + assert(c < 0); // c==0 means a duplicate + } + } +} +out (result) +{ + int i; + int c; + + //printf("out _d_switch_string()\n"); + if (result == -1) + { + // Not found + for (i = 0; i < table.length; i++) + { + if (table[i].length == ca.length) + { c = memcmp(table[i].ptr, ca.ptr, ca.length * wchar.sizeof); + assert(c != 0); + } + } + } + else + { + assert(0 <= result && result < table.length); + for (i = 0; 1; i++) + { + assert(i < table.length); + if (table[i].length == ca.length) + { + c = memcmp(table[i].ptr, ca.ptr, ca.length * wchar.sizeof); + if (c == 0) + { + assert(i == result); + break; + } + } + } + } +} +body +{ + //printf("body _d_switch_ustring()\n"); + int low; + int high; + int mid; + int c; + wchar[] pca; + + low = 0; + high = table.length; + +/* + // Print table + wprintf("ca[] = '%.*s'\n", ca); + for (mid = 0; mid < high; mid++) + { + pca = table[mid]; + wprintf("table[%d] = %d, '%.*s'\n", mid, pca.length, pca); + } +*/ + + // Do binary search + while (low < high) + { + mid = (low + high) >> 1; + pca = table[mid]; + c = ca.length - pca.length; + if (c == 0) + { + c = memcmp(ca.ptr, pca.ptr, ca.length * wchar.sizeof); + if (c == 0) + { //printf("found %d\n", mid); + return mid; + } + } + if (c < 0) + { + high = mid; + } + else + { + low = mid + 1; + } + } + //printf("not found\n"); + return -1; // not found +} + + +unittest +{ + switch (cast(wchar []) "c") + { + case "coo": + default: + 
break; + } +} + + +/********************************** + * Same thing, but for dchar (UTF-32) strings. + * + * Binary-searches table for ca. + * Params: + * table = the case strings, ordered by length and then by memcmp() + * order, without duplicates (asserted by the in contract) + * ca = the string to look up + * Returns: + * index of the entry of table equal to ca, or -1 if there is none + */ + +int _d_switch_dstring(dchar[][] table, dchar[] ca) +in +{ + //printf("in _d_switch_dstring()\n"); + assert(table.length >= 0); + assert(ca.length >= 0); + + // Make sure table[] is sorted correctly + int j; + + for (j = 1; j < table.length; j++) + { + int len1 = table[j - 1].length; + int len2 = table[j].length; + + assert(len1 <= len2); + if (len1 == len2) + { + int c; + + c = memcmp(table[j - 1].ptr, table[j].ptr, len1 * dchar.sizeof); + assert(c < 0); // c==0 means a duplicate + } + } +} +out (result) +{ + int i; + int c; + + //printf("out _d_switch_dstring()\n"); + if (result == -1) + { + // Not found + for (i = 0; i < table.length; i++) + { + if (table[i].length == ca.length) + { c = memcmp(table[i].ptr, ca.ptr, ca.length * dchar.sizeof); + assert(c != 0); + } + } + } + else + { + assert(0 <= result && result < table.length); + for (i = 0; 1; i++) + { + assert(i < table.length); + if (table[i].length == ca.length) + { + c = memcmp(table[i].ptr, ca.ptr, ca.length * dchar.sizeof); + if (c == 0) + { + assert(i == result); + break; + } + } + } + } +} +body +{ + //printf("body _d_switch_dstring()\n"); + int low; + int high; + int mid; + int c; + dchar[] pca; + + low = 0; + high = table.length; + +/* + // Print table + wprintf("ca[] = '%.*s'\n", ca); + for (mid = 0; mid < high; mid++) + { + pca = table[mid]; + wprintf("table[%d] = %d, '%.*s'\n", mid, pca.length, pca); + } +*/ + + // Do binary search + while (low < high) + { + mid = (low + high) >> 1; + pca = table[mid]; + c = ca.length - pca.length; + if (c == 0) + { + c = memcmp(ca.ptr, pca.ptr, ca.length * dchar.sizeof); + if (c == 0) + { //printf("found %d\n", mid); + return mid; + } + } + if (c < 0) + { + high = mid; + } + else + { + low = mid + 1; + } + } + //printf("not found\n"); + return -1; // not found +} + + +unittest +{ + switch (cast(dchar []) "c") + { + case "coo": + default: + break; + } +} diff -r
7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/tls.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/tls.S Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,44 @@ +/** + * Contains support code for thread-local storage. + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = count) + { uint u2; + uint c2; + + u2 = (u + 1 < num) ? u + 1 : u; + c2 = base[u2].count; + if (c2 < count) + c2 = count; + trace_place(base[u].sym,c2); + } + else + break; + } + + // Clean up + trace_free(base); + } +} + +///////////////////////////////////// +// Initialize and terminate. + +static this() +{ + trace_init(); +} + +static ~this() +{ + trace_term(); +} + +/////////////////////////////////// +// Report results. +// Also compute nsymbols. + +static void trace_report(Symbol* s) +{ SymPair* sp; + uint count; + + //printf("trace_report()\n"); + while (s) + { nsymbols++; + if (s.Sl) + trace_report(s.Sl); + fprintf(fplog,"------------------\n"); + count = 0; + for (sp = s.Sfanin; sp; sp = sp.next) + { + fprintf(fplog,"\t%5d\t%.*s\n", sp.count, sp.sym.Sident); + count += sp.count; + } + fprintf(fplog,"%.*s\t%u\t%lld\t%lld\n",s.Sident,count,s.totaltime,s.functime); + for (sp = s.Sfanout; sp; sp = sp.next) + { + fprintf(fplog,"\t%5d\t%.*s\n",sp.count,sp.sym.Sident); + } + s = s.Sr; + } +} + +//////////////////////////////////// +// Allocate and fill array of symbols. + +static void trace_array(Symbol *s) +{ + __gshared uint u; + + if (!psymbols) + { u = 0; + psymbols = cast(Symbol **)trace_malloc((Symbol *).sizeof * nsymbols); + } + while (s) + { + psymbols[u++] = s; + trace_array(s.Sl); + s = s.Sr; + } +} + + +////////////////////////////////////// +// Qsort() comparison routine for array of pointers to Symbol's. + +static int symbol_cmp(in void* e1, in void* e2) +{ Symbol** ps1; + Symbol** ps2; + timer_t diff; + + ps1 = cast(Symbol **)e1; + ps2 = cast(Symbol **)e2; + + diff = (*ps2).functime - (*ps1).functime; + return (diff == 0) ? 
0 : ((diff > 0) ? 1 : -1); +} + + +/////////////////////////////////// +// Report function timings + +static void trace_times(Symbol* root) +{ uint u; + timer_t freq; + + // Sort array + qsort(psymbols, nsymbols, (Symbol *).sizeof, &symbol_cmp); + + // Print array + QueryPerformanceFrequency(&freq); + fprintf(fplog,"\n======== Timer Is %lld Ticks/Sec, Times are in Microsecs ========\n\n",freq); + fprintf(fplog," Num Tree Func Per\n"); + fprintf(fplog," Calls Time Time Call\n\n"); + for (u = 0; u < nsymbols; u++) + { Symbol* s = psymbols[u]; + timer_t tl,tr; + timer_t fl,fr; + timer_t pl,pr; + timer_t percall; + SymPair* sp; + uint calls; + char[] id; + + version (Windows) + { + char* p = (s.Sident ~ '\0').ptr; + p = unmangle_ident(p); + if (p) + id = p[0 .. strlen(p)]; + } + if (!id) + id = s.Sident; + calls = 0; + for (sp = s.Sfanin; sp; sp = sp.next) + calls += sp.count; + if (calls == 0) + calls = 1; + +version (all) +{ + tl = (s.totaltime * 1000000) / freq; + fl = (s.functime * 1000000) / freq; + percall = s.functime / calls; + pl = (s.functime * 1000000) / calls / freq; + + fprintf(fplog,"%7d%12lld%12lld%12lld %.*s\n", + calls,tl,fl,pl,id); +} +else +{ + tl = s.totaltime / freq; + tr = ((s.totaltime - tl * freq) * 10000000) / freq; + + fl = s.functime / freq; + fr = ((s.functime - fl * freq) * 10000000) / freq; + + percall = s.functime / calls; + pl = percall / freq; + pr = ((percall - pl * freq) * 10000000) / freq; + + fprintf(fplog,"%7d\t%3lld.%07lld\t%3lld.%07lld\t%3lld.%07lld\t%.*s\n", + calls,tl,tr,fl,fr,pl,pr,id); +} + if (id !is s.Sident) + free(id.ptr); + } +} + + +/////////////////////////////////// +// Initialize. + +static void trace_init() +{ + if (!trace_inited) + { + trace_inited = 1; + + { // See if we can determine the overhead. 
+ uint u; + timer_t starttime; + timer_t endtime; + Stack *st; + + st = trace_tos; + trace_tos = null; + QueryPerformanceCounter(&starttime); + for (u = 0; u < 100; u++) + { + asm + { + call _trace_pro_n ; + db 0 ; + call _trace_epi_n ; + } + } + QueryPerformanceCounter(&endtime); + trace_ohd = (endtime - starttime) / u; + //printf("trace_ohd = %lld\n",trace_ohd); + if (trace_ohd > 0) + trace_ohd--; // round down + trace_tos = st; + } + } +} + +///////////////////////////////// +// Terminate. + +void trace_term() +{ + //printf("trace_term()\n"); + if (trace_inited == 1) + { Stack *n; + + trace_inited = 2; + + // Free remainder of the stack + while (trace_tos) + { + n = trace_tos.prev; + stack_free(trace_tos); + trace_tos = n; + } + + while (stack_freelist) + { + n = stack_freelist.prev; + stack_free(stack_freelist); + stack_freelist = n; + } + + // Merge in data from any existing file + trace_merge(); + + // Report results + fplog = fopen(trace_logfilename.ptr, "w"); + if (fplog) + { nsymbols = 0; + trace_report(root); + trace_array(root); + trace_times(root); + fclose(fplog); + } + + // Output function link order + fpdef = fopen(trace_deffilename.ptr,"w"); + if (fpdef) + { fprintf(fpdef,"\nFUNCTIONS\n"); + trace_order(root); + fclose(fpdef); + } + + trace_free(psymbols); + psymbols = null; + } +} + +///////////////////////////////// +// Our storage allocator. + +static void *trace_malloc(size_t nbytes) +{ void *p; + + p = malloc(nbytes); + if (!p) + exit(EXIT_FAILURE); + return p; +} + +static void trace_free(void *p) +{ + free(p); +} + +////////////////////////////////////////////// +// Look up the symbol named id in the binary tree rooted at root, +// inserting a new zeroed Symbol if it is not there yet. +// The new Symbol keeps its own trace_malloc'd copy of id, so the +// caller may free or reuse the storage behind id afterwards. +// Returns: +// the existing or the newly inserted Symbol + +static Symbol* trace_addsym(char[] id) +{ + Symbol** parent; + Symbol* rover; + Symbol* s; + int cmp; + char c; + + //printf("trace_addsym('%s',%d)\n",p,len); + parent = &root; + rover = *parent; + while (rover !is null) // while we haven't run out of tree + { + cmp = dstrcmp(id, rover.Sident); + if (cmp == 0) + { + return rover; + } + parent = (cmp < 0) ? 
/* if we go down left side */ + &(rover.Sl) : /* then get left child */ + &(rover.Sr); /* else get right child */ + rover = *parent; /* get child */ + } + /* not in table, so insert into table */ + s = cast(Symbol *)trace_malloc(Symbol.sizeof); + memset(s,0,Symbol.sizeof); + /* Copy the name: trace_merge() passes slices of a line buffer that it + * frees on its next iteration, so storing id itself would leave + * Sident dangling. One extra byte keeps the allocation non-empty + * and the copy NUL terminated. + */ + char* name = cast(char *)trace_malloc(id.length + 1); + memcpy(name, id.ptr, id.length); + name[id.length] = 0; + s.Sident = name[0 .. id.length]; + *parent = s; // link new symbol into tree + return s; +} + +/*********************************** + * Add symbol s with count to SymPair list. + */ + +static void trace_sympair_add(SymPair** psp, Symbol* s, uint count) +{ SymPair* sp; + + for (; 1; psp = &sp.next) + { + sp = *psp; + if (!sp) + { + sp = cast(SymPair *)trace_malloc(SymPair.sizeof); + sp.sym = s; + sp.count = 0; + sp.next = null; + *psp = sp; + break; + } + else if (sp.sym == s) + { + break; + } + } + sp.count += count; +} + +////////////////////////////////////////////// +// + +static void trace_pro(char[] id) +{ + Stack* n; + Symbol* s; + timer_t starttime; + timer_t t; + + QueryPerformanceCounter(&starttime); + if (id.length == 0) + return; + if (!trace_inited) + trace_init(); // initialize package + n = stack_malloc(); + n.prev = trace_tos; + trace_tos = n; + s = trace_addsym(id); + trace_tos.sym = s; + if (trace_tos.prev) + { + Symbol* prev; + int i; + + // Accumulate Sfanout and Sfanin + prev = trace_tos.prev.sym; + trace_sympair_add(&prev.Sfanout,s,1); + trace_sympair_add(&s.Sfanin,prev,1); + } + QueryPerformanceCounter(&t); + trace_tos.starttime = starttime; + trace_tos.ohd = trace_ohd + t - starttime; + trace_tos.subtime = 0; + //printf("trace_tos.ohd=%lld, trace_ohd=%lld + t=%lld - starttime=%lld\n", + // trace_tos.ohd,trace_ohd,t,starttime); +} + +///////////////////////////////////////// +// + +static void trace_epi() +{ Stack* n; + timer_t endtime; + timer_t t; + timer_t ohd; + + //printf("trace_epi()\n"); + if (trace_tos) + { + timer_t starttime; + timer_t totaltime; + + QueryPerformanceCounter(&endtime); + starttime = trace_tos.starttime; + totaltime = endtime - starttime - trace_tos.ohd; + if 
(totaltime < 0) + { //printf("endtime=%lld - starttime=%lld - trace_tos.ohd=%lld < 0\n", + // endtime,starttime,trace_tos.ohd); + totaltime = 0; // round off error, just make it 0 + } + + // totaltime is time spent in this function + all time spent in + // subfunctions - bookkeeping overhead. + trace_tos.sym.totaltime += totaltime; + + //if (totaltime < trace_tos.subtime) + //printf("totaltime=%lld < trace_tos.subtime=%lld\n",totaltime,trace_tos.subtime); + trace_tos.sym.functime += totaltime - trace_tos.subtime; + ohd = trace_tos.ohd; + n = trace_tos.prev; + stack_free(trace_tos); + trace_tos = n; + if (n) + { QueryPerformanceCounter(&t); + n.ohd += ohd + t - endtime; + n.subtime += totaltime; + //printf("n.ohd = %lld\n",n.ohd); + } + } +} + + +////////////////////////// FILE INTERFACE ///////////////////////// + +///////////////////////////////////// +// Read line from file fp. +// Returns: +// trace_malloc'd line buffer +// null if end of file + +static char* trace_readline(FILE* fp) +{ int c; + int dim; + int i; + char *buf; + + //printf("trace_readline(%p)\n", fp); + i = 0; + dim = 0; + buf = null; + while (1) + { + if (i == dim) + { char *p; + + dim += 80; + p = cast(char *)trace_malloc(dim); + memcpy(p,buf,i); + trace_free(buf); + buf = p; + } + c = fgetc(fp); + switch (c) + { + case EOF: + if (i == 0) + { trace_free(buf); + return null; + } + case '\n': + goto L1; + default: + break; + } + buf[i] = cast(char)c; + i++; + } +L1: + buf[i] = 0; + //printf("line '%s'\n",buf); + return buf; +} + +////////////////////////////////////// +// Skip space + +static char *skipspace(char *p) +{ + while (isspace(*p)) + p++; + return p; +} + +//////////////////////////////////////////////////////// +// Merge in profiling data from existing file. 
+ +static void trace_merge() +{ FILE *fp; + char *buf; + char *p; + uint count; + Symbol *s; + SymPair *sfanin; + SymPair **psp; + + if (trace_logfilename && (fp = fopen(trace_logfilename.ptr,"r")) !is null) + { + buf = null; + sfanin = null; + psp = &sfanin; + while (1) + { + trace_free(buf); + buf = trace_readline(fp); + if (!buf) + break; + switch (*buf) + { + case '=': // ignore rest of file + trace_free(buf); + goto L1; + case ' ': + case '\t': // fan in or fan out line + count = strtoul(buf,&p,10); + if (p == buf) // if invalid conversion + continue; + p = skipspace(p); + if (!*p) + continue; + s = trace_addsym(p[0 .. strlen(p)]); + trace_sympair_add(psp,s,count); + break; + default: + if (!isalpha(*buf)) + { + if (!sfanin) + psp = &sfanin; + continue; // regard unrecognized line as separator + } + case '?': + case '_': + case '$': + case '@': + p = buf; + while (isgraph(*p)) + p++; + *p = 0; + //printf("trace_addsym('%s')\n",buf); + s = trace_addsym(buf[0 .. strlen(buf)]); + if (s.Sfanin) + { SymPair *sp; + + for (; sfanin; sfanin = sp) + { + trace_sympair_add(&s.Sfanin,sfanin.sym,sfanin.count); + sp = sfanin.next; + trace_free(sfanin); + } + } + else + { s.Sfanin = sfanin; + } + sfanin = null; + psp = &s.Sfanout; + + { timer_t t; + + p++; + count = strtoul(p,&p,10); + t = cast(long)strtoull(p,&p,10); + s.totaltime += t; + t = cast(long)strtoull(p,&p,10); + s.functime += t; + } + break; + } + } + L1: + fclose(fp); + } +} + +////////////////////////// COMPILER INTERFACE ///////////////////// + +///////////////////////////////////////////// +// Function called by trace code in function prolog. 
+ +void _trace_pro_n() +{ + /* Length of string is either: + * db length + * ascii string + * or: + * db 0x0FF + * db 0 + * dw length + * ascii string + */ + + version (OSX) + { // 16 byte align stack + asm + { naked ; + pushad ; + mov ECX,8*4[ESP] ; + xor EAX,EAX ; + mov AL,[ECX] ; + cmp AL,0xFF ; + jne L1 ; + cmp byte ptr 1[ECX],0 ; + jne L1 ; + mov AX,2[ECX] ; + add 8*4[ESP],3 ; + add ECX,3 ; + L1: inc EAX ; + inc ECX ; + add 8*4[ESP],EAX ; + dec EAX ; + sub ESP,4 ; + push ECX ; + push EAX ; + call trace_pro ; + add ESP,12 ; + popad ; + ret ; + } + } + else + { + asm + { naked ; + pushad ; + mov ECX,8*4[ESP] ; + xor EAX,EAX ; + mov AL,[ECX] ; + cmp AL,0xFF ; + jne L1 ; + cmp byte ptr 1[ECX],0 ; + jne L1 ; + mov AX,2[ECX] ; + add 8*4[ESP],3 ; + add ECX,3 ; + L1: inc EAX ; + inc ECX ; + add 8*4[ESP],EAX ; + dec EAX ; + push ECX ; + push EAX ; + call trace_pro ; + add ESP,8 ; + popad ; + ret ; + } + } +} + +///////////////////////////////////////////// +// Function called by trace code in function epilog. 
+ + +void _trace_epi_n() +{ + version (OSX) + { // 16 byte align stack + asm + { naked ; + pushad ; + sub ESP,12 ; + } + trace_epi(); + asm + { + add ESP,12 ; + popad ; + ret ; + } + } + else + { + asm + { naked ; + pushad ; + } + trace_epi(); + asm + { + popad ; + ret ; + } + } +} + + +version (Windows) +{ + extern (Windows) + { + export int QueryPerformanceCounter(timer_t *); + export int QueryPerformanceFrequency(timer_t *); + } +} +else version (X86) +{ + extern (D) + { + void QueryPerformanceCounter(timer_t* ctr) + { + asm + { naked ; + mov ECX,EAX ; + rdtsc ; + mov [ECX],EAX ; + mov 4[ECX],EDX ; + ret ; + } + } + + void QueryPerformanceFrequency(timer_t* freq) + { + *freq = 3579545; + } + } +} +else +{ + static assert(0); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_AC.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_AC.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,107 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: 0) + c = 1; + return c; + } + + override size_t tsize() + { + return (Object[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(Object); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Acdouble.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Acdouble.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,94 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (cdouble[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(cdouble); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Acfloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Acfloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,92 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (cfloat[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(cfloat); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Acreal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Acreal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,95 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (creal[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(creal); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Adouble.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Adouble.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,104 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (double[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(double); + } +} + +// idouble[] + +class TypeInfo_Ap : TypeInfo_Ad +{ + override string toString() { return "idouble[]"; } + + override TypeInfo next() + { + return typeid(idouble); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Afloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Afloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,103 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (float[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(float); + } +} + +// ifloat[] + +class TypeInfo_Ao : TypeInfo_Af +{ + override string toString() { return "ifloat[]"; } + + override TypeInfo next() + { + return typeid(ifloat); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Ag.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Ag.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,228 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (byte[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(byte); + } +} + + +// ubyte[] + +class TypeInfo_Ah : TypeInfo_Ag +{ + override string toString() { return "ubyte[]"; } + + override int compare(in void* p1, in void* p2) + { + char[] s1 = *cast(char[]*)p1; + char[] s2 = *cast(char[]*)p2; + + return dstrcmp(s1, s2); + } + + override TypeInfo next() + { + return typeid(ubyte); + } +} + +// void[] + +class TypeInfo_Av : TypeInfo_Ah +{ + override string toString() { return "void[]"; } + + override TypeInfo next() + { + return typeid(void); + } +} + +// bool[] + +class TypeInfo_Ab : TypeInfo_Ah +{ + override string toString() { return "bool[]"; } + + override TypeInfo next() + { + return typeid(bool); + } +} + +// char[] + +class TypeInfo_Aa : TypeInfo_Ag +{ + override string toString() { return "char[]"; } + + override hash_t getHash(in void* p) + { char[] s = *cast(char[]*)p; + hash_t hash = 0; + +version (all) +{ + foreach (char c; s) + hash = hash * 11 + c; +} +else +{ + size_t len = s.length; + char *str = s; + + while (1) + { + switch (len) + { + case 0: + return hash; + + case 1: + hash *= 9; + hash += *cast(ubyte *)str; + return hash; + + case 2: + hash *= 9; + hash += *cast(ushort *)str; + return hash; + + case 3: + hash *= 9; + hash += (*cast(ushort *)str << 8) + + (cast(ubyte *)str)[2]; + return hash; + + default: + hash *= 9; + hash += *cast(uint *)str; + str += 4; + len -= 4; + break; + } + } +} + return hash; + } + + override TypeInfo next() + { + return typeid(char); + } +} + +// string + +class TypeInfo_Aya : TypeInfo_Aa +{ + override string toString() { return "immutable(char)[]"; } + + override TypeInfo next() + { + return typeid(immutable(char)); + } +} + diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Aint.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/druntime/src/compiler/dmd/typeinfo/ti_Aint.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,140 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (int[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(int); + } +} + +unittest +{ + int[][] a = [[5,3,8,7], [2,5,3,8,7]]; + a.sort; + assert(a == [[2,5,3,8,7], [5,3,8,7]]); + + a = [[5,3,8,7], [5,3,8]]; + a.sort; + assert(a == [[5,3,8], [5,3,8,7]]); +} + +// uint[] + +class TypeInfo_Ak : TypeInfo_Ai +{ + override string toString() { return "uint[]"; } + + /** + * Orders two uint[] values element by element; when one array is a + * prefix of the other, the shorter one sorts first. + * Params: + * p1 = pointer to the first uint[] + * p2 = pointer to the second uint[] + * Returns: + * -1, 0 or 1 when *p1 is less than, equal to or greater than *p2 + */ + override int compare(in void* p1, in void* p2) + { + uint[] s1 = *cast(uint[]*)p1; + uint[] s2 = *cast(uint[]*)p2; + size_t len = s1.length; + + if (s2.length < len) + len = s2.length; + for (size_t u = 0; u < len; u++) + { + /* Compare rather than subtract: s1[u] - s2[u] wraps around in + * uint, so its int conversion can have the wrong sign + * (e.g. uint.max - 0 yields -1). + */ + if (s1[u] < s2[u]) + return -1; + else if (s1[u] > s2[u]) + return 1; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override TypeInfo next() + { + return typeid(uint); + } +} + +// dchar[] + +class TypeInfo_Aw : TypeInfo_Ak +{ + override string toString() { return "dchar[]"; } + + override TypeInfo next() + { + return typeid(dchar); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Along.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Along.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,120 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009.
+ * License: s2[u]) + return 1; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (long[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(long); + } +} + + +// ulong[] + +class TypeInfo_Am : TypeInfo_Al +{ + override string toString() { return "ulong[]"; } + + override int compare(in void* p1, in void* p2) + { + ulong[] s1 = *cast(ulong[]*)p1; + ulong[] s2 = *cast(ulong[]*)p2; + size_t len = s1.length; + + if (s2.length < len) + len = s2.length; + for (size_t u = 0; u < len; u++) + { + if (s1[u] < s2[u]) + return -1; + else if (s1[u] > s2[u]) + return 1; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override TypeInfo next() + { + return typeid(ulong); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Areal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Areal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,105 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (real[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(real); + } +} + +// ireal[] + +class TypeInfo_Aj : TypeInfo_Ae +{ + override string toString() { return "ireal[]"; } + + override TypeInfo next() + { + return typeid(ireal); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_Ashort.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_Ashort.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,143 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (short[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(short); + } +} + + +// ushort[] + +class TypeInfo_At : TypeInfo_As +{ + override string toString() { return "ushort[]"; } + + override int compare(in void* p1, in void* p2) + { + ushort[] s1 = *cast(ushort[]*)p1; + ushort[] s2 = *cast(ushort[]*)p2; + size_t len = s1.length; + + if (s2.length < len) + len = s2.length; + for (size_t u = 0; u < len; u++) + { + int result = s1[u] - s2[u]; + if (result) + return result; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override TypeInfo next() + { + return typeid(ushort); + } +} + +// wchar[] + +class TypeInfo_Au : TypeInfo_At +{ + override string toString() { return "wchar[]"; } + + override TypeInfo next() + { + return typeid(wchar); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_C.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_C.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,63 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: f2.re) + result = 1; + else if (f1.im < f2.im) + result = -1; + else if (f1.im > f2.im) + result = 1; + else + result = 0; + return result; + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(cdouble *)p1, *cast(cdouble *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(cdouble *)p1, *cast(cdouble *)p2); + } + + override size_t tsize() + { + return cdouble.sizeof; + } + + override void swap(void *p1, void *p2) + { + cdouble t; + + t = *cast(cdouble *)p1; + *cast(cdouble *)p1 = *cast(cdouble *)p2; + *cast(cdouble *)p2 = t; + } + + override void[] init() + { static immutable cdouble r; + + return (cast(cdouble *)&r)[0 .. 
1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_cfloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_cfloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,76 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: f2.re) + result = 1; + else if (f1.im < f2.im) + result = -1; + else if (f1.im > f2.im) + result = 1; + else + result = 0; + return result; + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(cfloat *)p1, *cast(cfloat *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(cfloat *)p1, *cast(cfloat *)p2); + } + + override size_t tsize() + { + return cfloat.sizeof; + } + + override void swap(void *p1, void *p2) + { + cfloat t; + + t = *cast(cfloat *)p1; + *cast(cfloat *)p1 = *cast(cfloat *)p2; + *cast(cfloat *)p2 = t; + } + + override void[] init() + { static immutable cfloat r; + + return (cast(cfloat *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_char.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_char.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,55 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: f2.re) + result = 1; + else if (f1.im < f2.im) + result = -1; + else if (f1.im > f2.im) + result = 1; + else + result = 0; + return result; + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(creal *)p1, *cast(creal *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(creal *)p1, *cast(creal *)p2); + } + + override size_t tsize() + { + return creal.sizeof; + } + + override void swap(void *p1, void *p2) + { + creal t; + + t = *cast(creal *)p1; + *cast(creal *)p1 = *cast(creal *)p2; + *cast(creal *)p2 = t; + } + + override void[] init() + { static immutable creal r; + + return (cast(creal *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_dchar.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_dchar.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,55 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright + * + * Copyright Digital Mars 2004 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.typeinfo.ti_double; + +// double + +class TypeInfo_d : TypeInfo +{ + override string toString() { return "double"; } + + override hash_t getHash(in void* p) + { + return (cast(uint *)p)[0] + (cast(uint *)p)[1]; + } + + static equals_t _equals(double f1, double f2) + { + return f1 == f2 || + (f1 !<>= f1 && f2 !<>= f2); + } + + static int _compare(double d1, double d2) + { + if (d1 !<>= d2) // if either are NaN + { + if (d1 !<>= d1) + { if (d2 !<>= d2) + return 0; + return -1; + } + return 1; + } + return (d1 == d2) ? 0 : ((d1 < d2) ? 
-1 : 1); + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(double *)p1, *cast(double *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(double *)p1, *cast(double *)p2); + } + + override size_t tsize() + { + return double.sizeof; + } + + override void swap(void *p1, void *p2) + { + double t; + + t = *cast(double *)p1; + *cast(double *)p1 = *cast(double *)p2; + *cast(double *)p2 = t; + } + + override void[] init() + { static immutable double r; + + return (cast(double *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_float.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_float.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,75 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: = f1 && f2 !<>= f2); + } + + static int _compare(float d1, float d2) + { + if (d1 !<>= d2) // if either are NaN + { + if (d1 !<>= d1) + { if (d2 !<>= d2) + return 0; + return -1; + } + return 1; + } + return (d1 == d2) ? 0 : ((d1 < d2) ? -1 : 1); + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(float *)p1, *cast(float *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(float *)p1, *cast(float *)p2); + } + + override size_t tsize() + { + return float.sizeof; + } + + override void swap(void *p1, void *p2) + { + float t; + + t = *cast(float *)p1; + *cast(float *)p1 = *cast(float *)p2; + *cast(float *)p2 = t; + } + + override void[] init() + { static immutable float r; + + return (cast(float *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_idouble.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_idouble.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,22 @@ +/** + * TypeInfo support code. 
+ * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: *cast(int*) p2) + return 1; + return 0; + } + + override size_t tsize() + { + return int.sizeof; + } + + override void swap(void *p1, void *p2) + { + int t; + + t = *cast(int *)p1; + *cast(int *)p1 = *cast(int *)p2; + *cast(int *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_ireal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_ireal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,22 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: *cast(long *)p2) + return 1; + return 0; + } + + override size_t tsize() + { + return long.sizeof; + } + + override void swap(void *p1, void *p2) + { + long t; + + t = *cast(long *)p1; + *cast(long *)p1 = *cast(long *)p2; + *cast(long *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_ptr.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_ptr.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,57 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright + * + * Copyright Digital Mars 2004 - 2009. + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.typeinfo.ti_real; + +// real + +class TypeInfo_e : TypeInfo +{ + override string toString() { return "real"; } + + override hash_t getHash(in void* p) + { + return (cast(uint *)p)[0] + (cast(uint *)p)[1] + (cast(ushort *)p)[4]; + } + + static equals_t _equals(real f1, real f2) + { + return f1 == f2 || + (f1 !<>= f1 && f2 !<>= f2); + } + + static int _compare(real d1, real d2) + { + if (d1 !<>= d2) // if either are NaN + { + if (d1 !<>= d1) + { if (d2 !<>= d2) + return 0; + return -1; + } + return 1; + } + return (d1 == d2) ? 0 : ((d1 < d2) ? -1 : 1); + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(real *)p1, *cast(real *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(real *)p1, *cast(real *)p2); + } + + override size_t tsize() + { + return real.sizeof; + } + + override void swap(void *p1, void *p2) + { + real t; + + t = *cast(real *)p1; + *cast(real *)p1 = *cast(real *)p2; + *cast(real *)p2 = t; + } + + override void[] init() + { static immutable real r; + + return (cast(real *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_short.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_short.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,49 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: *cast(uint*) p2) + return 1; + return 0; + } + + override size_t tsize() + { + return uint.sizeof; + } + + override void swap(void *p1, void *p2) + { + int t; + + t = *cast(uint *)p1; + *cast(uint *)p1 = *cast(uint *)p2; + *cast(uint *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_ulong.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_ulong.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,53 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: *cast(ulong *)p2) + return 1; + return 0; + } + + override size_t tsize() + { + return ulong.sizeof; + } + + override void swap(void *p1, void *p2) + { + ulong t; + + t = *cast(ulong *)p1; + *cast(ulong *)p1 = *cast(ulong *)p2; + *cast(ulong *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/typeinfo/ti_ushort.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/typeinfo/ti_ushort.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,49 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: =0x48 && x<=0x80) + || x==0x86 || x==0x87 + || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E)){ + datacache[level].lineSize = 64; + } else datacache[level].lineSize = 32; + } + } + } + + uint[4] a; + bool firstTime = true; + // On a multi-core system, this could theoretically fail, but it's only used + // for old single-core CPUs. + uint numinfos = 1; + do { + asm { + mov EAX, 2; + cpuid; + mov a, EAX; + mov a+4, EBX; + mov a+8, ECX; + mov a+12, EDX; + } + if (firstTime) { + if (a[0]==0x0000_7001 && a[3]==0x80 && a[1]==0 && a[2]==0) { + // Cyrix MediaGX MMXEnhanced returns: EAX= 00007001, EDX=00000080. 
+                // These are NOT standard Intel values
+                // (TLB = 32 entry, 4 way associative, 4K pages)
+                // (L1 cache = 16K, 4way, linesize16)
+                datacache[0].size=8;
+                datacache[0].associativity=4;
+                datacache[0].lineSize=16;
+                return;
+            }
+            // lsb of a is how many times to loop.
+            numinfos = a[0] & 0xFF;
+            // and otherwise it should be ignored
+            a[0] &= 0xFFFF_FF00;
+            firstTime = false;
+        }
+        for (int c=0; c<4;++c) {
+            // high bit set == no info.
+            if (a[c] & 0x8000_0000) continue;
+            decipherCpuid2(cast(ubyte)(a[c] & 0xFF));
+            decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF));
+            decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF));
+            decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF));
+        }
+    } while (--numinfos);
+}
+
+/**
+ * CPUID4: "Deterministic cache parameters" leaf.
+ *
+ * Executes CPUID leaf 4 once per sub-leaf (ECX = 0, 1, 2, ...) until the
+ * cache-type field (EAX bits 4:0) reads 0. Every sub-leaf is used to raise
+ * maxCores; only entries of type 1 or 3 (data and unified caches) are
+ * recorded in datacache[]. Sizes are stored in Kb.
+ */
+void getcacheinfoCPUID4()
+{
+    int cachenum = 0;
+    for(;;) {
+        uint a, b, number_of_sets;
+        asm {
+            mov EAX, 4;
+            mov ECX, cachenum;
+            cpuid;
+            mov a, EAX;
+            mov b, EBX;
+            mov number_of_sets, ECX;
+        }
+        ++cachenum;
+        if ((a&0x1F)==0) break; // no more caches
+        uint numthreads = ((a>>14) & 0xFFF)  + 1;
+        uint numcores = ((a>>26) & 0x3F) + 1;
+        if (numcores > maxCores) maxCores = numcores;
+        if ((a&0x1F)!=1 && ((a&0x1F)!=3)) continue; // we only want data & unified caches
+
+        ++number_of_sets;
+        ubyte level = cast(ubyte)(((a>>5)&7)-1);
+        // Valid indices are 0 .. datacache.length-1, so the test must be >=
+        // (the previous '>' let level == datacache.length index past the end).
+        // A level field of 0 wraps to 255 in the ubyte and is rejected as well.
+        if (level >= datacache.length) continue; // ignore deep caches
+        datacache[level].associativity = a & 0x200 ? ubyte.max :cast(ubyte)((b>>22)+1);
+        datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size
+        uint line_partitions = ((b >> 12)& 0x3FF) + 1;
+        // Size = number of sets * associativity * cachelinesize * linepartitions
+        // and must convert to Kb, also dividing by the number of hyperthreads using this cache.
+        ulong sz = (datacache[level].associativity< ubyte.max)? number_of_sets *
+            datacache[level].associativity : number_of_sets;
+        datacache[level].size = cast(uint)(
+                (sz * datacache[level].lineSize * line_partitions ) / (numthreads *1024));
+        // Same 5-bit cache-type mask as the tests above (was 0xF; the result is
+        // unchanged because only types 1 and 3 reach this point).
+        if (level == 0 && (a&0x1F)==3) {
+            // Halve the size for unified L1 caches
+            datacache[level].size/=2;
+        }
+    }
+}
+
+/**
+ * CPUID8000_0005 & 6
+ *
+ * Fills datacache[0] from ECX of extended leaf 0x8000_0005 and, when
+ * extended leaf 0x8000_0006 is available, datacache[1] from ECX (L2) and
+ * datacache[2] from EDX (L3) of that leaf.
+ */
+void getAMDcacheinfo()
+{
+    uint c5, c6, d6;
+    asm {
+        mov EAX, 0x8000_0005; // L1 cache
+        cpuid;
+        // EAX has L1_TLB_4M.
+        // EBX has L1_TLB_4K
+        // EDX has L1 instruction cache
+        mov c5, ECX;
+    }
+
+    datacache[0].size = ( (c5>>24) & 0xFF);
+    datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF);
+    datacache[0].lineSize = c5 & 0xFF;
+
+    if (max_extended_cpuid >= 0x8000_0006) {
+        // AMD K6-III or K6-2+ or later.
+        ubyte numcores = 1;
+        if (max_extended_cpuid >=0x8000_0008) {
+            asm {
+                mov EAX, 0x8000_0008;
+                cpuid;
+                mov numcores, CL;
+            }
+            ++numcores;
+            if (numcores>maxCores) maxCores = numcores;
+        }
+        asm {
+            mov EAX, 0x8000_0006; // L2/L3 cache
+            cpuid;
+            mov c6, ECX; // L2 cache info
+            mov d6, EDX; // L3 cache info
+        }
+
+        // Maps the 4-bit associativity code from the CPUID result to a way count.
+        immutable ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ];
+        datacache[1].size = (c6>>16) & 0xFFFF;
+        datacache[1].associativity = assocmap[(c6>>12)&0xF];
+        datacache[1].lineSize = c6 & 0xFF;
+
+        // The L3 cache value is TOTAL, not per core.
+        datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1.
+ datacache[2].associativity = assocmap[(d6>>12)&0xF]; + datacache[2].lineSize = d6 & 0xFF; + } +} + + +void cpuidX86() +{ + char * venptr = vendorID.ptr; + asm { + mov EAX, 0; + cpuid; + mov max_cpuid, EAX; + mov EAX, venptr; + mov [EAX], EBX; + mov [EAX + 4], EDX; + mov [EAX + 8], ECX; + mov EAX, 0x8000_0000; + cpuid; + mov max_extended_cpuid, EAX; + } + + probablyIntel = vendorID == "GenuineIntel"; + probablyAMD = vendorID == "AuthenticAMD"; + uint a, b, c, d; + uint apic = 0; // brand index, apic id + asm { + mov EAX, 1; // model, stepping + cpuid; + mov a, EAX; + mov apic, EBX; + mov miscfeatures, ECX; + mov features, EDX; + } + amdfeatures = 0; + amdmiscfeatures = 0; + if (max_extended_cpuid >= 0x8000_0001) { + asm { + mov EAX, 0x8000_0001; + cpuid; + mov amdmiscfeatures, ECX; + mov amdfeatures, EDX; + } + } + // Try to detect fraudulent vendorIDs + if (amd3dnow) probablyIntel = false; + + stepping = a & 0xF; + uint fbase = (a >> 8) & 0xF; + uint mbase = (a >> 4) & 0xF; + family = ((fbase == 0xF) || (fbase == 0)) ? fbase + (a >> 20) & 0xFF : fbase; + model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) ) ? 
+ mbase + ((a >> 12) & 0xF0) : mbase; + + if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) { + // determine max number of cores for AMD + asm { + mov EAX, 0x8000_0008; + cpuid; + mov c, ECX; + } + uint apicsize = (c>>12) & 0xF; + if (apicsize == 0) { + // use legacy method + if (hyperThreadingBit) maxCores = c & 0xFF; + else maxCores = 1; + } else { + // maxcores = 2^ apicsize + maxCores = 1; + while (apicsize) { maxCores<<=1; --apicsize; } + } + } + + if (max_extended_cpuid >= 0x8000_0004) { + char *procptr = processorNameBuffer.ptr; + asm { + push ESI; + mov ESI, procptr; + mov EAX, 0x8000_0002; + cpuid; + mov [ESI], EAX; + mov [ESI+4], EBX; + mov [ESI+8], ECX; + mov [ESI+12], EDX; + mov EAX, 0x8000_0003; + cpuid; + mov [ESI+16], EAX; + mov [ESI+20], EBX; + mov [ESI+24], ECX; + mov [ESI+28], EDX; + mov EAX, 0x8000_0004; + cpuid; + mov [ESI+32], EAX; + mov [ESI+36], EBX; + mov [ESI+40], ECX; + mov [ESI+44], EDX; + pop ESI; + } + // Intel P4 and PM pad at front with spaces. + // Other CPUs pad at end with nulls. + int start = 0, end = 0; + while (processorNameBuffer[start] == ' ') { ++start; } + while (processorNameBuffer[$-end-1] == 0) { ++end; } + processorName = processorNameBuffer[start..$-end]; + } else { + processorName[] = "Unknown CPU"; + } + // Determine cache sizes + + // Intel docs specify that they return 0 for 0x8000_0005. + // AMD docs do not specify the behaviour for 0004 and 0002. + // Centaur/VIA and most other manufacturers use the AMD method, + // except Cyrix MediaGX MMX Enhanced uses their OWN form of CPUID2! + // NS Geode GX1 provides CyrixCPUID2 _and_ does the same wrong behaviour + // for CPUID80000005. But Geode GX uses the AMD method + + // Deal with Geode GX1 - make it same as MediaGX MMX. + if (max_extended_cpuid==0x8000_0005 && max_cpuid==2) { + max_extended_cpuid = 0x8000_0004; + } + // Therefore, we try the AMD method unless it's an Intel chip. + // If we still have no info, try the Intel methods. 
+ datacache[0].size = 0; + if (max_cpuid<2 || !probablyIntel) { + if (max_extended_cpuid >= 0x8000_0005) { + getAMDcacheinfo(); + } else if (probablyAMD) { + // According to AMDProcRecognitionAppNote, this means CPU + // K5 model 0, or Am5x86 (model 4), or Am4x86DX4 (model 4) + // Am5x86 has 16Kb 4-way unified data & code cache. + datacache[0].size = 8; + datacache[0].associativity = 4; + datacache[0].lineSize = 32; + } else { + // Some obscure CPU. + // Values for Cyrix 6x86MX (family 6, model 0) + datacache[0].size = 64; + datacache[0].associativity = 4; + datacache[0].lineSize = 32; + } + } + if ((datacache[0].size == 0) && max_cpuid>=4) { + getcacheinfoCPUID4(); + } + if ((datacache[0].size == 0) && max_cpuid>=2) { + getcacheinfoCPUID2(); + } + if (datacache[0].size == 0) { + // Pentium, PMMX, late model 486, or an obscure CPU + if (mmx) { // Pentium MMX. Also has 8kB code cache. + datacache[0].size = 16; + datacache[0].associativity = 4; + datacache[0].lineSize = 32; + } else { // Pentium 1 (which also has 8kB code cache) + // or 486. + // Cyrix 6x86: 16, 4way, 32 linesize + datacache[0].size = 8; + datacache[0].associativity = 2; + datacache[0].lineSize = 32; + } + } + if (hyperThreadingBit) maxThreads = (apic>>>16) & 0xFF; + else maxThreads = maxCores; +} + +// Return true if the cpuid instruction is supported. +// BUG(WONTFIX): Returns false for Cyrix 6x86 and 6x86L. They will be treated as 486 machines. 
+bool hasCPUID()
+{
+    uint flags;
+    asm {
+        // Read EFLAGS, flip bit 21 (mask 0x0020_0000), write it back and
+        // read it again; 'flags' ends up holding before XOR after.
+        pushfd;
+        pop EAX;
+        mov flags, EAX;
+        xor EAX, 0x0020_0000;
+        push EAX;
+        popfd;
+        pushfd;
+        pop EAX;
+        xor flags, EAX;
+    }
+    // The bit differs only if the processor let us change it.
+    return (flags & 0x0020_0000) !=0;
+}
+
+} else { // inline asm X86
+
+    // No inline assembler available: report "no CPUID" and use fixed L1 values.
+    bool hasCPUID() { return false; }
+
+    void cpuidX86()
+    {
+            datacache[0].size = 8;
+            datacache[0].associativity = 2;
+            datacache[0].lineSize = 32;
+    }
+}
+
+// TODO: Implement this function with OS support
+// Currently hard-wires cputype to PPC603 and fills datacache[] from the
+// per-model tables below.
+void cpuidPPC()
+{
+    enum :int  { PPC601, PPC603, PPC603E, PPC604,
+                 PPC604E, PPC620, PPCG3, PPCG4, PPCG5 };
+
+    // TODO:
+    // asm { mfpvr; } returns the CPU version but unfortunately it can
+    // only be used in kernel mode. So OS support is required.
+    int cputype = PPC603;
+
+    // 601 has a 8KB combined data & code L1 cache.
+    uint sizes[] = [4, 8, 16, 16, 32, 32, 32, 32, 64];
+    ubyte ways[] = [8, 2,  4,  4,  4,  8,  8,  8,  8];
+    uint L2size[]= [0, 0,  0,  0,  0,  0,  0,  256,  512];
+    uint L3size[]= [0, 0,  0,  0,  0,  0,  0,  2048,  0];
+
+    datacache[0].size = sizes[cputype];
+    datacache[0].associativity = ways[cputype];
+    datacache[0].lineSize = (cputype==PPCG5)? 128 :
+        (cputype == PPC620 || cputype == PPCG3)? 64 : 32;
+    datacache[1].size = L2size[cputype];
+    datacache[2].size = L3size[cputype];
+    datacache[1].lineSize = datacache[0].lineSize;
+    datacache[2].lineSize = datacache[0].lineSize;
+}
+
+// TODO: Implement this function with OS support
+void cpuidSparc()
+{
+    // UltraSparcIIi : L1 = 16,  2way. L2 = 512, 4 way.
+    // UltraSparcIII : L1 = 64,  4way. L2= 4096 or 8192.
+    // UltraSparcIIIi: L1 = 64,  4way. L2= 1024, 4 way
+    // UltraSparcIV  : L1 = 64,  4way. L2 = 16*1024.
+    // UltraSparcIV+ : L1 = 64,  4way. L2 = 2048, L3=32*1024.
+    // Sparc64V      : L1 = 128, 2way. L2 = 4096 4way.
+}
+
+
+/**
+ * Module constructor: runs the CPU detection once at startup.
+ *
+ * Calls cpuidX86() when hasCPUID() reports support, falls back to
+ * Pentium-1-like L1 values if no L1 size was found, then marks every cache
+ * level that is still empty as covering the full address space and sets
+ * numCacheLevels to the deepest level that has real data.
+ */
+static this()
+{
+    if (hasCPUID()) {
+        cpuidX86();
+    } else {
+        // it's a 386 or 486, or a Cyrix 6x86.
+        //Probably still has an external cache.
+    }
+    if (datacache[0].size==0) {
+            // Guess same as Pentium 1.
+            datacache[0].size = 8;
+            datacache[0].associativity = 2;
+            datacache[0].lineSize = 32;
+    }
+    numCacheLevels = 1;
+    // And now fill up all the unused levels with full memory space.
+    for (int i=1; i< datacache.length; ++i) {
+        if (datacache[i].size==0) {
+            // Set all remaining levels of cache equal to full address space.
+            datacache[i].size = uint.max/1024;
+            datacache[i].associativity = 1;
+            datacache[i].lineSize = datacache[i-1].lineSize;
+        } else numCacheLevels = i+1;
+    }
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/util/ctype.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/dmd/util/ctype.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,92 @@
+/**
+ * Simple ASCII char classification functions.
+ *
+ * Copyright: Copyright Digital Mars 2004 - 2009.
+ * License: Boost License 1.0.
+ * Authors: Sean Kelly
+ *
+ * Copyright Sean Kelly 2005 - 2009.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+module rt.util.string;
+
+private import core.stdc.string;
+
+/**
+ * Formats val as decimal digits, written right-aligned into buf.
+ *
+ * Params:
+ *  buf = scratch buffer; must be longer than 9 chars (asserted).
+ *  val = value to format.
+ * Returns:
+ *  The slice of buf, ending at buf's end, that holds the digits.
+ */
+char[] intToString( char[] buf, uint val )
+{
+    assert( buf.length > 9 );
+    auto p = buf.ptr + buf.length;
+
+    // Emit digits from least to most significant, filling buf backwards.
+    do
+    {
+        *--p = cast(char)(val % 10 + '0');
+    } while( val /= 10 );
+
+    return buf[p - buf.ptr .. $];
+}
+
+
+/**
+ * Three-way comparison of two char arrays, byte by byte.
+ *
+ * The common prefix is compared with memcmp; if it differs, that result
+ * decides. If the common prefix is equal, the shorter array orders first
+ * and arrays of equal length compare equal.
+ *
+ * Returns:
+ *  A negative value if s1 orders before s2, 0 if they are equal, a positive
+ *  value if s1 orders after s2.
+ */
+int dstrcmp( in char[] s1, in char[] s2 )
+{
+    auto len = s1.length;
+    if( s2.length < len )
+        len = s2.length;
+
+    // A difference inside the common prefix decides the order.  (Previously
+    // an equal prefix returned 0 and a differing one was ordered by length.)
+    auto ret = memcmp( s1.ptr, s2.ptr, len );
+    if( ret != 0 )
+        return ret;
+
+    // Equal prefix: length is the tie-break.
+    if( s1.length == s2.length )
+        return 0;
+    return s1.length > s2.length ? 1 : -1;
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/util/utf.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/dmd/util/utf.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,901 @@
+/********************************************
+ * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
+ *
+ * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
+ * wchar type.
+ * For Posix systems, the C wchar_t type is UTF-32 and corresponds to + * the D utf.dchar type. + * + * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). + * + * See_Also: + * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)
+ * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)
+ * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) + * Macros: + * WIKI = Phobos/StdUtf + * + * Copyright: Copyright Digital Mars 2003 - 2009. + * License: = 0xD800 && u <= 0xDBFF); +} + +/** + * stride() returns the length of a UTF-32 sequence starting at index i + * in string s. + * Returns: The return value will always be 1. + */ +uint stride(in dchar[] s, size_t i) +{ + return 1; +} + +/******************************************* + * Given an index i into an array of characters s[], + * and assuming that index i is at the start of a UTF character, + * determine the number of UCS characters up to that index i. + */ + +size_t toUCSindex(in char[] s, size_t i) +{ + size_t n; + size_t j; + + for (j = 0; j < i; ) + { + j += stride(s, j); + n++; + } + if (j > i) + { + onUnicodeError("invalid UTF-8 sequence", j); + } + return n; +} + +/** ditto */ +size_t toUCSindex(in wchar[] s, size_t i) +{ + size_t n; + size_t j; + + for (j = 0; j < i; ) + { + j += stride(s, j); + n++; + } + if (j > i) + { + onUnicodeError("invalid UTF-16 sequence", j); + } + return n; +} + +/** ditto */ +size_t toUCSindex(in dchar[] s, size_t i) +{ + return i; +} + +/****************************************** + * Given a UCS index n into an array of characters s[], return the UTF index. + */ + +size_t toUTFindex(in char[] s, size_t n) +{ + size_t i; + + while (n--) + { + uint j = UTF8stride[s[i]]; + if (j == 0xFF) + onUnicodeError("invalid UTF-8 sequence", i); + i += j; + } + return i; +} + +/** ditto */ +size_t toUTFindex(in wchar[] s, size_t n) +{ + size_t i; + + while (n--) + { wchar u = s[i]; + + i += 1 + (u >= 0xD800 && u <= 0xDBFF); + } + return i; +} + +/** ditto */ +size_t toUTFindex(in dchar[] s, size_t n) +{ + return n; +} + +/* =================== Decode ======================= */ + +/*************** + * Decodes and returns character starting at s[idx]. idx is advanced past the + * decoded character. 
If the character is not well formed, a UtfException is + * thrown and idx remains unchanged. + */ +dchar decode(in char[] s, inout size_t idx) + in + { + assert(idx >= 0 && idx < s.length); + } + out (result) + { + assert(isValidDchar(result)); + } + body + { + size_t len = s.length; + dchar V; + size_t i = idx; + char u = s[i]; + + if (u & 0x80) + { uint n; + char u2; + + /* The following encodings are valid, except for the 5 and 6 byte + * combinations: + * 0xxxxxxx + * 110xxxxx 10xxxxxx + * 1110xxxx 10xxxxxx 10xxxxxx + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + for (n = 1; ; n++) + { + if (n > 4) + goto Lerr; // only do the first 4 of 6 encodings + if (((u << n) & 0x80) == 0) + { + if (n == 1) + goto Lerr; + break; + } + } + + // Pick off (7 - n) significant bits of B from first byte of octet + V = cast(dchar)(u & ((1 << (7 - n)) - 1)); + + if (i + (n - 1) >= len) + goto Lerr; // off end of string + + /* The following combinations are overlong, and illegal: + * 1100000x (10xxxxxx) + * 11100000 100xxxxx (10xxxxxx) + * 11110000 1000xxxx (10xxxxxx 10xxxxxx) + * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) + * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) + */ + u2 = s[i + 1]; + if ((u & 0xFE) == 0xC0 || + (u == 0xE0 && (u2 & 0xE0) == 0x80) || + (u == 0xF0 && (u2 & 0xF0) == 0x80) || + (u == 0xF8 && (u2 & 0xF8) == 0x80) || + (u == 0xFC && (u2 & 0xFC) == 0x80)) + goto Lerr; // overlong combination + + for (uint j = 1; j != n; j++) + { + u = s[i + j]; + if ((u & 0xC0) != 0x80) + goto Lerr; // trailing bytes are 10xxxxxx + V = (V << 6) | (u & 0x3F); + } + if (!isValidDchar(V)) + goto Lerr; + i += n; + } + else + { + V = cast(dchar) u; + i++; + } + + idx = i; + return V; + + Lerr: + onUnicodeError("invalid UTF-8 sequence", i); + return V; // dummy return + } + +unittest +{ size_t i; + dchar c; + + debug(utf) printf("utf.decode.unittest\n"); + + static 
s1 = "abcd"c; + i = 0; + c = decode(s1, i); + assert(c == cast(dchar)'a'); + assert(i == 1); + c = decode(s1, i); + assert(c == cast(dchar)'b'); + assert(i == 2); + + static s2 = "\xC2\xA9"c; + i = 0; + c = decode(s2, i); + assert(c == cast(dchar)'\u00A9'); + assert(i == 2); + + static s3 = "\xE2\x89\xA0"c; + i = 0; + c = decode(s3, i); + assert(c == cast(dchar)'\u2260'); + assert(i == 3); + + static s4 = + [ "\xE2\x89"c[], // too short + "\xC0\x8A", + "\xE0\x80\x8A", + "\xF0\x80\x80\x8A", + "\xF8\x80\x80\x80\x8A", + "\xFC\x80\x80\x80\x80\x8A", + ]; + + for (int j = 0; j < s4.length; j++) + { + try + { + i = 0; + c = decode(s4[j], i); + assert(0); + } + catch (Object o) + { + i = 23; + } + assert(i == 23); + } +} + +/** ditto */ + +dchar decode(in wchar[] s, inout size_t idx) + in + { + assert(idx >= 0 && idx < s.length); + } + out (result) + { + assert(isValidDchar(result)); + } + body + { + string msg; + dchar V; + size_t i = idx; + uint u = s[i]; + + if (u & ~0x7F) + { if (u >= 0xD800 && u <= 0xDBFF) + { uint u2; + + if (i + 1 == s.length) + { msg = "surrogate UTF-16 high value past end of string"; + goto Lerr; + } + u2 = s[i + 1]; + if (u2 < 0xDC00 || u2 > 0xDFFF) + { msg = "surrogate UTF-16 low value out of range"; + goto Lerr; + } + u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); + i += 2; + } + else if (u >= 0xDC00 && u <= 0xDFFF) + { msg = "unpaired surrogate UTF-16 value"; + goto Lerr; + } + else if (u == 0xFFFE || u == 0xFFFF) + { msg = "illegal UTF-16 value"; + goto Lerr; + } + else + i++; + } + else + { + i++; + } + + idx = i; + return cast(dchar)u; + + Lerr: + onUnicodeError(msg, i); + return cast(dchar)u; // dummy return + } + +/** ditto */ + +dchar decode(in dchar[] s, inout size_t idx) + in + { + assert(idx >= 0 && idx < s.length); + } + body + { + size_t i = idx; + dchar c = s[i]; + + if (!isValidDchar(c)) + goto Lerr; + idx = i + 1; + return c; + + Lerr: + onUnicodeError("invalid UTF-32 value", i); + return c; // dummy return + } + + +/* 
=================== Encode ======================= */ + +/******************************* + * Encodes character c and appends it to array s[]. + */ +void encode(inout char[] s, dchar c) + in + { + assert(isValidDchar(c)); + } + body + { + char[] r = s; + + if (c <= 0x7F) + { + r ~= cast(char) c; + } + else + { + char[4] buf; + uint L; + + if (c <= 0x7FF) + { + buf[0] = cast(char)(0xC0 | (c >> 6)); + buf[1] = cast(char)(0x80 | (c & 0x3F)); + L = 2; + } + else if (c <= 0xFFFF) + { + buf[0] = cast(char)(0xE0 | (c >> 12)); + buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); + buf[2] = cast(char)(0x80 | (c & 0x3F)); + L = 3; + } + else if (c <= 0x10FFFF) + { + buf[0] = cast(char)(0xF0 | (c >> 18)); + buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); + buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); + buf[3] = cast(char)(0x80 | (c & 0x3F)); + L = 4; + } + else + { + assert(0); + } + r ~= buf[0 .. L]; + } + s = r; + } + +unittest +{ + debug(utf) printf("utf.encode.unittest\n"); + + char[] s = "abcd".dup; + encode(s, cast(dchar)'a'); + assert(s.length == 5); + assert(s == "abcda"); + + encode(s, cast(dchar)'\u00A9'); + assert(s.length == 7); + assert(s == "abcda\xC2\xA9"); + //assert(s == "abcda\u00A9"); // BUG: fix compiler + + encode(s, cast(dchar)'\u2260'); + assert(s.length == 10); + assert(s == "abcda\xC2\xA9\xE2\x89\xA0"); +} + +/** ditto */ + +void encode(inout wchar[] s, dchar c) + in + { + assert(isValidDchar(c)); + } + body + { + wchar[] r = s; + + if (c <= 0xFFFF) + { + r ~= cast(wchar) c; + } + else + { + wchar[2] buf; + + buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); + buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); + r ~= buf; + } + s = r; + } + +/** ditto */ +void encode(inout dchar[] s, dchar c) + in + { + assert(isValidDchar(c)); + } + body + { + s ~= c; + } + +/** +Returns the code length of $(D c) in the encoding using $(D C) as a +code point. The code is returned in character count, not in bytes. 
+ */
+
+ubyte codeLength(C)(dchar c)
+{
+    static if (C.sizeof == 1)
+    {
+        // 8-bit code units: one to four units, by magnitude of c
+        if (c <= 0x7F)
+            return 1;
+        if (c <= 0x7FF)
+            return 2;
+        if (c <= 0xFFFF)
+            return 3;
+        if (c <= 0x10FFFF)
+            return 4;
+        assert(false);
+    }
+    else static if (C.sizeof == 2)
+    {
+        // 16-bit code units: one unit up to 0xFFFF, otherwise two
+        if (c <= 0xFFFF)
+            return 1;
+        return 2;
+    }
+    else
+    {
+        // 32-bit code units: always exactly one
+        static assert(C.sizeof == 4);
+        return 1;
+    }
+}
+
+/* =================== Validation ======================= */
+
+/***********************************
+ * Walks the whole of s with decode() to check that it is well formed.
+ * $(D S) can be an array of $(D char), $(D wchar), or $(D dchar).
+ * Malformed input is reported by decode(), which this module documents as
+ * throwing a $(D UtfException). Use to check all untrusted input for
+ * correctness.
+ */
+void validate(S)(in S s)
+{
+    auto end = s.length;
+    size_t pos = 0;
+
+    // decode() advances pos past each character it accepts
+    while (pos < end)
+        decode(s, pos);
+}
+
+/* =================== Conversion to UTF8 ======================= */
+
+/***********************************
+ * Encodes the single character c as UTF-8 into buf.
+ * Returns: the leading slice of buf that was written (1 to 4 chars).
+ */
+char[] toUTF8(char[4] buf, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        size_t used;    // number of chars stored in buf
+
+        if (c <= 0x7F)
+        {
+            buf[0] = cast(char) c;
+            used = 1;
+        }
+        else if (c <= 0x7FF)
+        {
+            buf[0] = cast(char)(0xC0 | (c >> 6));
+            buf[1] = cast(char)(0x80 | (c & 0x3F));
+            used = 2;
+        }
+        else if (c <= 0xFFFF)
+        {
+            buf[0] = cast(char)(0xE0 | (c >> 12));
+            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            buf[2] = cast(char)(0x80 | (c & 0x3F));
+            used = 3;
+        }
+        else if (c <= 0x10FFFF)
+        {
+            buf[0] = cast(char)(0xF0 | (c >> 18));
+            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
+            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            buf[3] = cast(char)(0x80 | (c & 0x3F));
+            used = 4;
+        }
+        else
+        {
+            // beyond the last code point
+            assert(0);
+        }
+        return buf[0 .. used];
+    }
+
+/*******************
+ * Encodes string s into UTF-8 and returns the encoded string.
+ */
+string toUTF8(string s)
+    in
+    {
+        // Already UTF-8: only checked for well-formedness, never copied.
+        validate(s);
+    }
+    body
+    {
+        return s;
+    }
+
+/** ditto */
+string toUTF8(in wchar[] s)
+{
+    char[] r;
+    size_t i;
+    size_t slen = s.length;
+
+    r.length = slen;
+
+    for (i = 0; i < slen; i++)
+    {
+        wchar c = s[i];
+
+        if (c <= 0x7F)
+            r[i] = cast(char)c;         // fast path for ascii
+        else
+        {
+            // First non-ASCII unit: keep the i chars copied so far and let
+            // foreach decode the rest of s into code points.
+            r.length = i;
+            // Named d so that it does not shadow the wchar c declared above
+            // (the dchar[] overload uses the same name).
+            foreach (dchar d; s[i .. slen])
+            {
+                encode(r, d);
+            }
+            break;
+        }
+    }
+    return cast(string)r;
+}
+
+/** ditto */
+string toUTF8(in dchar[] s)
+{
+    char[] r;
+    size_t i;
+    size_t slen = s.length;
+
+    r.length = slen;
+
+    for (i = 0; i < slen; i++)
+    {
+        dchar c = s[i];
+
+        if (c <= 0x7F)
+            r[i] = cast(char)c;         // fast path for ascii
+        else
+        {
+            // First non-ASCII character: keep the i chars copied so far and
+            // append the encoding of everything from position i onwards.
+            r.length = i;
+            foreach (dchar d; s[i .. slen])
+            {
+                encode(r, d);
+            }
+            break;
+        }
+    }
+    return cast(string)r;
+}
+
+/* =================== Conversion to UTF16 ======================= */
+
+/****************
+ * Encodes the single character c as UTF-16 into buf.
+ * Returns: the leading slice of buf that was written; one wchar when
+ * c <= 0xFFFF, otherwise two (0xD800-based unit, then 0xDC00-based unit).
+ */
+wchar[] toUTF16(wchar[2] buf, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        if (c <= 0xFFFF)
+        {
+            buf[0] = cast(wchar) c;
+            return buf[0 .. 1];
+        }
+        else
+        {
+            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
+            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
+            return buf[0 .. 2];
+        }
+    }
+
+/****************
+ * Encodes string s into UTF-16 and returns the encoded string.
+ * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
+ * an LPWSTR or LPCWSTR argument.
+ */ +wstring toUTF16(in char[] s) +{ + wchar[] r; + size_t slen = s.length; + + r.length = slen; + r.length = 0; + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c <= 0x7F) + { + i++; + r ~= cast(wchar)c; + } + else + { + c = decode(s, i); + encode(r, c); + } + } + return cast(wstring)r; +} + +alias const(wchar)* wptr; +/** ditto */ +wptr toUTF16z(in char[] s) +{ + wchar[] r; + size_t slen = s.length; + + r.length = slen + 1; + r.length = 0; + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c <= 0x7F) + { + i++; + r ~= cast(wchar)c; + } + else + { + c = decode(s, i); + encode(r, c); + } + } + r ~= "\000"; + return r.ptr; +} + +/** ditto */ +wstring toUTF16(wstring s) + in + { + validate(s); + } + body + { + return s; + } + +/** ditto */ +wstring toUTF16(in dchar[] s) +{ + wchar[] r; + size_t slen = s.length; + + r.length = slen; + r.length = 0; + for (size_t i = 0; i < slen; i++) + { + encode(r, s[i]); + } + return cast(wstring)r; +} + +/* =================== Conversion to UTF32 ======================= */ + +/***** + * Encodes string s into UTF-32 and returns the encoded string. + */ +dstring toUTF32(in char[] s) +{ + dchar[] r; + size_t slen = s.length; + size_t j = 0; + + r.length = slen; // r[] will never be longer than s[] + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c >= 0x80) + c = decode(s, i); + else + i++; // c is ascii, no need for decode + r[j++] = c; + } + return cast(dstring)r[0 .. j]; +} + +/** ditto */ +dstring toUTF32(in wchar[] s) +{ + dchar[] r; + size_t slen = s.length; + size_t j = 0; + + r.length = slen; // r[] will never be longer than s[] + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c >= 0x80) + c = decode(s, i); + else + i++; // c is ascii, no need for decode + r[j++] = c; + } + return cast(dstring)r[0 .. 
j]; +} + +/** ditto */ +dstring toUTF32(dstring s) + in + { + validate(s); + } + body + { + return s; + } + +/* ================================ tests ================================== */ + +unittest +{ + debug(utf) printf("utf.toUTF.unittest\n"); + + auto c = "hello"c[]; + auto w = toUTF16(c); + assert(w == "hello"); + auto d = toUTF32(c); + assert(d == "hello"); + + c = toUTF8(w); + assert(c == "hello"); + d = toUTF32(w); + assert(d == "hello"); + + c = toUTF8(d); + assert(c == "hello"); + w = toUTF16(d); + assert(w == "hello"); + + + c = "hel\u1234o"; + w = toUTF16(c); + assert(w == "hel\u1234o"); + d = toUTF32(c); + assert(d == "hel\u1234o"); + + c = toUTF8(w); + assert(c == "hel\u1234o"); + d = toUTF32(w); + assert(d == "hel\u1234o"); + + c = toUTF8(d); + assert(c == "hel\u1234o"); + w = toUTF16(d); + assert(w == "hel\u1234o"); + + + c = "he\U0010AAAAllo"; + w = toUTF16(c); + //foreach (wchar c; w) printf("c = x%x\n", c); + //foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c); + assert(w == "he\U0010AAAAllo"); + d = toUTF32(c); + assert(d == "he\U0010AAAAllo"); + + c = toUTF8(w); + assert(c == "he\U0010AAAAllo"); + d = toUTF32(w); + assert(d == "he\U0010AAAAllo"); + + c = toUTF8(d); + assert(c == "he\U0010AAAAllo"); + w = toUTF16(d); + assert(w == "he\U0010AAAAllo"); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/dmd/win32.mak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/dmd/win32.mak Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,175 @@ +# Makefile to build the D runtime library core components for Posix +# Designed to work with GNU make +# Targets: +# make +# Same as make all +# make debug +# Build the debug version of the library +# make release +# Build the release version of the library +# make doc +# Generate documentation +# make clean +# Delete all files created by build process + +# Essentials + +LIBDIR=..\..\..\lib +DOCDIR=..\..\..\doc +IMPDIR=..\..\..\import 
+LIBBASENAME=druntime_rt_dmd.lib +#MODULES=bitop exception memory runtime thread vararg \ +# $(addprefix sync/,barrier condition config exception mutex rwmutex semaphore) +BUILDS=debug release unittest + +MODULES_BASE= \ + aaA.d \ + aApply.d \ + aApplyR.d \ + adi.d \ + arrayassign.d \ + arraybyte.d \ + arraycast.d \ + arraycat.d \ + arraydouble.d \ + arrayfloat.d \ + arrayint.d \ + arrayreal.d \ + arrayshort.d \ + cast_.d \ + cover.d \ + dmain2.d \ + invariant.d \ + invariant_.d \ + lifetime.d \ + memory.d \ + memset.d \ + obj.d \ + object_.d \ + qsort.d \ + switch_.d \ + trace.d +# NOTE: trace.d and cover.d are not necessary for a successful build +# as both are used for debugging features (profiling and coverage) +# NOTE: a pre-compiled minit.obj has been provided in dmd for Win32 and +# minit.asm is not used by dmd for Linux + +MODULES_UTIL= \ + util\console.d \ + util\cpuid.d \ + util\ctype.d \ + util\string.d \ + util\utf.d + +MODULES_TI= \ + typeinfo\ti_AC.d \ + typeinfo\ti_Acdouble.d \ + typeinfo\ti_Acfloat.d \ + typeinfo\ti_Acreal.d \ + typeinfo\ti_Adouble.d \ + typeinfo\ti_Afloat.d \ + typeinfo\ti_Ag.d \ + typeinfo\ti_Aint.d \ + typeinfo\ti_Along.d \ + typeinfo\ti_Areal.d \ + typeinfo\ti_Ashort.d \ + typeinfo\ti_byte.d \ + typeinfo\ti_C.d \ + typeinfo\ti_cdouble.d \ + typeinfo\ti_cfloat.d \ + typeinfo\ti_char.d \ + typeinfo\ti_creal.d \ + typeinfo\ti_dchar.d \ + typeinfo\ti_delegate.d \ + typeinfo\ti_double.d \ + typeinfo\ti_float.d \ + typeinfo\ti_idouble.d \ + typeinfo\ti_ifloat.d \ + typeinfo\ti_int.d \ + typeinfo\ti_ireal.d \ + typeinfo\ti_long.d \ + typeinfo\ti_ptr.d \ + typeinfo\ti_real.d \ + typeinfo\ti_short.d \ + typeinfo\ti_ubyte.d \ + typeinfo\ti_uint.d \ + typeinfo\ti_ulong.d \ + typeinfo\ti_ushort.d \ + typeinfo\ti_void.d \ + typeinfo\ti_wchar.d + +C_SRCS=complex.c critical.c deh.c monitor.c + +# Symbols + +CC=dmc +DMD=dmd +DOCFLAGS=-version=DDoc +DFLAGS_release=-d -release -O -inline -w -nofloat +DFLAGS_debug=-d -g -w -nofloat 
+DFLAGS_unittest=$(DFLAGS_release) -unittest +CFLAGS_release=-mn -6 -r +CFLAGS_debug=-g -mn -6 -r +CFLAGS_unittest=$(CFLAGS_release) + +# Derived symbols + +C_OBJS=complex.obj critical.obj deh.obj monitor.obj +AS_OBJS=minit.obj +ALL_MODULES=$(MODULES_BASE) $(MODULES_UTIL) $(MODULES_TI) +D_SRCS=$(ALL_MODULES) +ALLLIBS=\ + $(LIBDIR)\debug\$(LIBBASENAME) \ + $(LIBDIR)\release\$(LIBBASENAME) \ + $(LIBDIR)\unittest\$(LIBBASENAME) + +# Patterns + +#$(LIBDIR)\%\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) +# $(CC) -c $(CFLAGS_$*) $(C_SRCS) +# $(DMD) $(DFLAGS_$*) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) +# del $(C_OBJS) + +#$(DOCDIR)\%.html : %.d +# $(DMD) -c -d -o- -Df$@ $< + +#$(IMPDIR)\%.di : %.d +# $(DMD) -c -d -o- -Hf$@ $< + +# Patterns - debug + +$(LIBDIR)\debug\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) + $(CC) -c $(CFLAGS_debug) $(C_SRCS) + $(DMD) $(DFLAGS_debug) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) + del $(C_OBJS) + +# Patterns - release + +$(LIBDIR)\release\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) + $(CC) -c $(CFLAGS_release) $(C_SRCS) + $(DMD) $(DFLAGS_release) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) + del $(C_OBJS) + +# Patterns - unittest + +$(LIBDIR)\unittest\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) + $(CC) -c $(CFLAGS_unittest) $(C_SRCS) + $(DMD) $(DFLAGS_unittest) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) + del $(C_OBJS) + +# Patterns - asm + +minit.obj : minit.asm + $(CC) -c $** + +# Rulez + +all : $(BUILDS) + +debug : $(LIBDIR)\debug\$(LIBBASENAME) $(IMPORTS) +release : $(LIBDIR)\release\$(LIBBASENAME) $(IMPORTS) +unittest : $(LIBDIR)\unittest\$(LIBBASENAME) $(IMPORTS) +#doc : $(DOCS) + +clean : + del $(IMPORTS) $(DOCS) $(ALLLIBS) diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/aApply.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/aApply.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,386 @@ +/** + * This code handles decoding UTF strings for foreach loops. 
There are 6 + * combinations of conversions between char, wchar, and dchar, and 2 of each + * of those. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: > 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar)(((d - 0x10000) & 0x3FF) + 0xDC00); + } + } + else + i++; + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} + +extern (C) int _aApplywc1(wchar[] aa, dg_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplywc1(), len = %d\n", len); + for (i = 0; i < len; ) + { dchar d; + wchar w; + char c; + + w = aa[i]; + if (w & ~0x7F) + { + char[4] buf; + + d = decode(aa, i); + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)w; + i++; + } + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} + +extern (C) int _aApplydc1(dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplydc1(), len = %d\n", aa.length); + foreach (dchar d; aa) + { + char c; + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { + c = cast(char)d; + } + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} + +extern (C) int _aApplydw1(dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplydw1(), len = %d\n", aa.length); + foreach (dchar d; aa) + { + wchar w; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar)((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar)(((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} + + +/****************************************************************************/ + +// dg is D, but _aApplycd2() is 
C +extern (D) typedef int delegate(void *, void *) dg2_t; + +extern (C) int _aApplycd2(char[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplycd2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + + d = aa[i]; + if (d & 0x80) + { + n = i; + d = decode(aa, n); + n -= i; + } + else + n = 1; + result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} + +extern (C) int _aApplywd2(wchar[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplywd2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + + d = aa[i]; + if (d & ~0x7F) + { + n = i; + d = decode(aa, n); + n -= i; + } + else + n = 1; + result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} + +extern (C) int _aApplycw2(char[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplycw2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + wchar w; + + w = aa[i]; + if (w & 0x80) + { n = i; + d = decode(aa, n); + n -= i; + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&i, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + } + else + n = 1; + result = dg(&i, cast(void *)&w); + if (result) + break; + } + return result; +} + +extern (C) int _aApplywc2(wchar[] aa, dg2_t dg) +{ int result; + size_t i; + size_t n; + size_t len = aa.length; + + debug(apply) printf("_aApplywc2(), len = %d\n", len); + for (i = 0; i < len; i += n) + { dchar d; + wchar w; + char c; + + w = aa[i]; + if (w & ~0x7F) + { + char[4] buf; + + n = i; + d = decode(aa, n); + n -= i; + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)w; + n = 1; + } + 
result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} + +extern (C) int _aApplydc2(dchar[] aa, dg2_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplydc2(), len = %d\n", len); + for (i = 0; i < len; i++) + { dchar d; + char c; + + d = aa[i]; + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)d; + } + result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} + +extern (C) int _aApplydw2(dchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplydw2(), len = %d\n", aa.length); + foreach (size_t i, dchar d; aa) + { + wchar w; + auto j = i; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&j, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(&j, cast(void *)&w); + if (result) + break; + } + return result; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/aApplyR.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/aApplyR.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,957 @@ +/** + * This code handles decoding UTF strings for foreach_reverse loops. There are + * 6 combinations of conversions between char, wchar, and dchar, and 2 of each + * of those. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: >= 1; + c = aa[i]; + } + d |= (c & m) << j; + } + result = dg(cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcd1.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(dchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(dchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == '\U00100456'); break; + case 2: assert(d == '\u1234'); break; + case 3: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ + +extern (C) int _aApplyRwd1(in wchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRwd1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = ((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + result = dg(cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwd1.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(dchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(dchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == '\U00100456'); break; + case 2: assert(d == '\u1234'); break; + case 3: assert(d == 'a'); break; 
+ default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ + +extern (C) int _aApplyRcw1(in char[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRcw1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + wchar w; + + i--; + w = aa[i]; + if (w & 0x80) + { char c = cast(char)w; + uint j; + uint m = 0x3F; + d = 0; + while ((c & 0xC0) != 0xC0) + { if (i == 0) + onUnicodeError("Invalid UTF-8 sequence", 0); + i--; + d |= (c & 0x3F) << j; + j += 6; + m >>= 1; + c = aa[i]; + } + d |= (c & m) << j; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + } + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcw1.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(wchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(wchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xDBC1); break; + case 2: assert(d == 0xDC56); break; + case 3: assert(d == 0x1234); break; + case 4: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} + +/*****************************/ + +extern (C) int _aApplyRwc1(in wchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRwc1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + char c; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = 
((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + c = cast(char)d; + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwc1.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(char d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(char d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xF4); break; + case 2: assert(d == 0x80); break; + case 3: assert(d == 0x91); break; + case 4: assert(d == 0x96); break; + case 5: assert(d == 0xE1); break; + case 6: assert(d == 0x88); break; + case 7: assert(d == 0xB4); break; + case 8: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ + +extern (C) int _aApplyRdc1(in dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRdc1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0;) + { dchar d = aa[--i]; + char c; + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { + c = cast(char)d; + } + result = dg(cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdc1.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(char d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: 
assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(char d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xF4); break; + case 2: assert(d == 0x80); break; + case 3: assert(d == 0x91); break; + case 4: assert(d == 0x96); break; + case 5: assert(d == 0xE1); break; + case 6: assert(d == 0x88); break; + case 7: assert(d == 0xB4); break; + case 8: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ + +extern (C) int _aApplyRdw1(in dchar[] aa, dg_t dg) +{ int result; + + debug(apply) printf("_aApplyRdw1(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d = aa[--i]; + wchar w; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdw1.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(wchar d; s) + { + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(wchar d; s) + { + //printf("i = %d, d = %x\n", i, d); + switch (i) + { + case 0: assert(d == 'b'); break; + case 1: assert(d == 0xDBC1); break; + case 2: assert(d == 0xDC56); break; + case 3: assert(d == 0x1234); break; + case 4: assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} + + 
+/****************************************************************************/ +/* 2 argument versions */ + +// dg is D, but _aApplyRcd2() is C +extern (D) typedef int delegate(void *, void *) dg2_t; + +extern (C) int _aApplyRcd2(in char[] aa, dg2_t dg) +{ int result; + size_t i; + size_t len = aa.length; + + debug(apply) printf("_aApplyRcd2(), len = %d\n", len); + for (i = len; i != 0; ) + { dchar d; + + i--; + d = aa[i]; + if (d & 0x80) + { char c = cast(char)d; + uint j; + uint m = 0x3F; + d = 0; + while ((c & 0xC0) != 0xC0) + { if (i == 0) + onUnicodeError("Invalid UTF-8 sequence", 0); + i--; + d |= (c & 0x3F) << j; + j += 6; + m >>= 1; + c = aa[i]; + } + d |= (c & m) << j; + } + result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcd2.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(k, dchar d; s) + { + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, dchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(d == 'b'); assert(k == 8); break; + case 1: assert(d == '\U00100456'); assert(k == 4); break; + case 2: assert(d == '\u1234'); assert(k == 1); break; + case 3: assert(d == 'a'); assert(k == 0); break; + default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ + +extern (C) int _aApplyRwd2(in wchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRwd2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = ((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + 
result = dg(&i, cast(void *)&d); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwd2.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(k, dchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, dchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 4); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == '\U00100456'); break; + case 2: assert(k == 1); assert(d == '\u1234'); break; + case 3: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 4); +} + +/*****************************/ + +extern (C) int _aApplyRcw2(in char[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRcw2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + wchar w; + + i--; + w = aa[i]; + if (w & 0x80) + { char c = cast(char)w; + uint j; + uint m = 0x3F; + d = 0; + while ((c & 0xC0) != 0xC0) + { if (i == 0) + onUnicodeError("Invalid UTF-8 sequence", 0); + i--; + d |= (c & 0x3F) << j; + j += 6; + m >>= 1; + c = aa[i]; + } + d |= (c & m) << j; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&i, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + } + result = dg(&i, cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRcw2.unittest\n"); + + auto s = "hello"c[]; + int i; + + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + 
switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 8); assert(d == 'b'); break; + case 1: assert(k == 4); assert(d == 0xDBC1); break; + case 2: assert(k == 4); assert(d == 0xDC56); break; + case 3: assert(k == 1); assert(d == 0x1234); break; + case 4: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} + +/*****************************/ + +extern (C) int _aApplyRwc2(in wchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRwc2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d; + char c; + + i--; + d = aa[i]; + if (d >= 0xDC00 && d <= 0xDFFF) + { if (i == 0) + onUnicodeError("Invalid UTF-16 sequence", 0); + i--; + d = ((aa[i] - 0xD7C0) << 10) + (d - 0xDC00); + } + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + c = cast(char)d; + result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRwc2.unittest\n"); + + auto s = "hello"w[]; + int i; + + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + 
case 0: assert(k == 4); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == 0xF4); break; + case 2: assert(k == 2); assert(d == 0x80); break; + case 3: assert(k == 2); assert(d == 0x91); break; + case 4: assert(k == 2); assert(d == 0x96); break; + case 5: assert(k == 1); assert(d == 0xE1); break; + case 6: assert(k == 1); assert(d == 0x88); break; + case 7: assert(k == 1); assert(d == 0xB4); break; + case 8: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ + +extern (C) int _aApplyRdc2(in dchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRdc2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d = aa[--i]; + char c; + + if (d & ~0x7F) + { + char[4] buf; + + auto b = toUTF8(buf, d); + foreach (char c2; b) + { + result = dg(&i, cast(void *)&c2); + if (result) + return result; + } + continue; + } + else + { c = cast(char)d; + } + result = dg(&i, cast(void *)&c); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdc2.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, char d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 3); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == 0xF4); break; + case 2: assert(k == 2); assert(d == 0x80); break; + case 3: assert(k == 2); assert(d == 0x91); break; + case 4: assert(k == 2); assert(d == 0x96); break; + case 5: assert(k == 1); assert(d == 0xE1); break; + case 6: assert(k == 1); assert(d 
== 0x88); break; + case 7: assert(k == 1); assert(d == 0xB4); break; + case 8: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 9); +} + +/*****************************/ + +extern (C) int _aApplyRdw2(in dchar[] aa, dg2_t dg) +{ int result; + + debug(apply) printf("_aApplyRdw2(), len = %d\n", aa.length); + for (size_t i = aa.length; i != 0; ) + { dchar d = aa[--i]; + wchar w; + + if (d <= 0xFFFF) + w = cast(wchar) d; + else + { + w = cast(wchar) ((((d - 0x10000) >> 10) & 0x3FF) + 0xD800); + result = dg(&i, cast(void *)&w); + if (result) + break; + w = cast(wchar) (((d - 0x10000) & 0x3FF) + 0xDC00); + } + result = dg(&i, cast(void *)&w); + if (result) + break; + } + return result; +} + +unittest +{ + debug(apply) printf("_aApplyRdw2.unittest\n"); + + auto s = "hello"d[]; + int i; + + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + assert(k == 4 - i); + switch (i) + { + case 0: assert(d == 'o'); break; + case 1: assert(d == 'l'); break; + case 2: assert(d == 'l'); break; + case 3: assert(d == 'e'); break; + case 4: assert(d == 'h'); break; + default: assert(0); + } + i++; + } + assert(i == 5); + + s = "a\u1234\U00100456b"; + i = 0; + foreach_reverse(k, wchar d; s) + { + //printf("i = %d, k = %d, d = %x\n", i, k, d); + switch (i) + { + case 0: assert(k == 3); assert(d == 'b'); break; + case 1: assert(k == 2); assert(d == 0xDBC1); break; + case 2: assert(k == 2); assert(d == 0xDC56); break; + case 3: assert(k == 1); assert(d == 0x1234); break; + case 4: assert(k == 0); assert(d == 'a'); break; + default: assert(0); + } + i++; + } + assert(i == 5); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/aaA.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/aaA.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,872 @@ +/** + * Implementation of associative arrays. + * + * Copyright: Copyright Digital Mars 2000 - 2009. 
+ * License: aa.a.b.length * 4) + { + //printf("rehash\n"); + _aaRehash(aa,keyti); + } + +Lret: + return cast(void *)(e + 1) + keysize; +} + + +/************************************************* + * Get pointer to value in associative array indexed by key. + * Returns null if it is not already there. + */ + +void* _aaGetRvalue(AA aa, TypeInfo keyti, size_t valuesize, ...) +{ + //printf("_aaGetRvalue(valuesize = %u)\n", valuesize); + if (!aa.a) + return null; + + auto pkey = cast(void *)(&valuesize + 1); + auto keysize = aligntsize(keyti.tsize()); + auto len = aa.a.b.length; + + if (len) + { + auto key_hash = keyti.getHash(pkey); + //printf("hash = %d\n", key_hash); + size_t i = key_hash % len; + auto e = aa.a.b[i]; + while (e !is null) + { + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + return cast(void *)(e + 1) + keysize; + e = (c < 0) ? e.left : e.right; + } + else + e = (key_hash < e.hash) ? e.left : e.right; + } + } + return null; // not found, caller will throw exception +} + + +/************************************************* + * Determine if key is in aa. + * Returns: + * null not in aa + * !=null in aa, return pointer to value + */ + +void* _aaIn(AA aa, TypeInfo keyti, ...) +in +{ +} +out (result) +{ + //assert(result == 0 || result == 1); +} +body +{ + if (aa.a) + { + auto pkey = cast(void *)(&keyti + 1); + + //printf("_aaIn(), .length = %d, .ptr = %x\n", aa.a.length, cast(uint)aa.a.ptr); + auto len = aa.a.b.length; + + if (len) + { + auto key_hash = keyti.getHash(pkey); + //printf("hash = %d\n", key_hash); + size_t i = key_hash % len; + auto e = aa.a.b[i]; + while (e !is null) + { + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + return cast(void *)(e + 1) + aligntsize(keyti.tsize()); + e = (c < 0) ? e.left : e.right; + } + else + e = (key_hash < e.hash) ? 
e.left : e.right; + } + } + } + + // Not found + return null; +} + +/************************************************* + * Delete key entry in aa[]. + * If key is not in aa[], do nothing. + */ + +void _aaDel(AA aa, TypeInfo keyti, ...) +{ + auto pkey = cast(void *)(&keyti + 1); + aaA *e; + + if (aa.a && aa.a.b.length) + { + auto key_hash = keyti.getHash(pkey); + //printf("hash = %d\n", key_hash); + size_t i = key_hash % aa.a.b.length; + auto pe = &aa.a.b[i]; + while ((e = *pe) !is null) // null means not found + { + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + { + if (!e.left && !e.right) + { + *pe = null; + } + else if (e.left && !e.right) + { + *pe = e.left; + e.left = null; + } + else if (!e.left && e.right) + { + *pe = e.right; + e.right = null; + } + else + { + *pe = e.left; + e.left = null; + do + pe = &(*pe).right; + while (*pe); + *pe = e.right; + e.right = null; + } + + aa.a.nodes--; + gc_free(e); + break; + } + pe = (c < 0) ? &e.left : &e.right; + } + else + pe = (key_hash < e.hash) ? &e.left : &e.right; + } + } +} + + +/******************************************** + * Produce array of values from aa. + */ + +ArrayRet_t _aaValues(AA aa, size_t keysize, size_t valuesize) +in +{ + assert(keysize == aligntsize(keysize)); +} +body +{ + size_t resi; + Array a; + + void _aaValues_x(aaA* e) + { + do + { + memcpy(a.ptr + resi * valuesize, + cast(byte*)e + aaA.sizeof + keysize, + valuesize); + resi++; + if (e.left) + { if (!e.right) + { e = e.left; + continue; + } + _aaValues_x(e.left); + } + e = e.right; + } while (e !is null); + } + + if (aa.a) + { + a.length = _aaLen(aa); + a.ptr = cast(byte*) gc_malloc(a.length * valuesize, + valuesize < (void*).sizeof ? BlkAttr.NO_SCAN : 0); + resi = 0; + foreach (e; aa.a.b) + { + if (e) + _aaValues_x(e); + } + assert(resi == a.length); + } + return *cast(ArrayRet_t*)(&a); +} + + +/******************************************** + * Rehash an array. 
+ */ + +void* _aaRehash(AA* paa, TypeInfo keyti) +in +{ + //_aaInvAh(paa); +} +out (result) +{ + //_aaInvAh(result); +} +body +{ + BB newb; + + void _aaRehash_x(aaA* olde) + { + while (1) + { + auto left = olde.left; + auto right = olde.right; + olde.left = null; + olde.right = null; + + aaA *e; + + //printf("rehash %p\n", olde); + auto key_hash = olde.hash; + size_t i = key_hash % newb.b.length; + auto pe = &newb.b[i]; + while ((e = *pe) !is null) + { + //printf("\te = %p, e.left = %p, e.right = %p\n", e, e.left, e.right); + assert(e.left != e); + assert(e.right != e); + if (key_hash == e.hash) + { + auto c = keyti.compare(olde + 1, e + 1); + assert(c != 0); + pe = (c < 0) ? &e.left : &e.right; + } + else + pe = (key_hash < e.hash) ? &e.left : &e.right; + } + *pe = olde; + + if (right) + { + if (!left) + { olde = right; + continue; + } + _aaRehash_x(right); + } + if (!left) + break; + olde = left; + } + } + + //printf("Rehash\n"); + if (paa.a) + { + auto aa = paa.a; + auto len = _aaLen(*paa); + if (len) + { size_t i; + + for (i = 0; i < prime_list.length - 1; i++) + { + if (len <= prime_list[i]) + break; + } + len = prime_list[i]; + newb.b = new aaA*[len]; + + foreach (e; aa.b) + { + if (e) + _aaRehash_x(e); + } + delete aa.b; + + newb.nodes = aa.nodes; + newb.keyti = aa.keyti; + } + + *paa.a = newb; + _aaBalance(paa); + } + return (*paa).a; +} + +/******************************************** + * Balance an array. + */ + +void _aaBalance(AA* paa) +{ + //printf("_aaBalance()\n"); + if (paa.a) + { + aaA*[16] tmp; + aaA*[] array = tmp; + auto aa = paa.a; + foreach (j, e; aa.b) + { + /* Temporarily store contents of bucket in array[] + */ + size_t k = 0; + void addToArray(aaA* e) + { + while (e) + { addToArray(e.left); + if (k == array.length) + array.length = array.length * 2; + array[k++] = e; + e = e.right; + } + } + addToArray(e); + /* The contents of the bucket are now sorted into array[]. + * Rebuild the tree. 
+ */ + void buildTree(aaA** p, size_t x1, size_t x2) + { + if (x1 >= x2) + *p = null; + else + { auto mid = (x1 + x2) >> 1; + *p = array[mid]; + buildTree(&(*p).left, x1, mid); + buildTree(&(*p).right, mid + 1, x2); + } + } + auto p = &aa.b[j]; + buildTree(p, 0, k); + } + } +} +/******************************************** + * Produce array of N byte keys from aa. + */ + +ArrayRet_t _aaKeys(AA aa, size_t keysize) +{ + byte[] res; + size_t resi; + + void _aaKeys_x(aaA* e) + { + do + { + memcpy(&res[resi * keysize], cast(byte*)(e + 1), keysize); + resi++; + if (e.left) + { if (!e.right) + { e = e.left; + continue; + } + _aaKeys_x(e.left); + } + e = e.right; + } while (e !is null); + } + + auto len = _aaLen(aa); + if (!len) + return 0; + res = (cast(byte*) gc_malloc(len * keysize, + !(aa.a.keyti.flags() & 1) ? BlkAttr.NO_SCAN : 0))[0 .. len * keysize]; + resi = 0; + foreach (e; aa.a.b) + { + if (e) + _aaKeys_x(e); + } + assert(resi == len); + + Array a; + a.length = len; + a.ptr = res.ptr; + return *cast(ArrayRet_t*)(&a); +} + + +/********************************************** + * 'apply' for associative arrays - to support foreach + */ + +// dg is D, but _aaApply() is C +extern (D) typedef int delegate(void *) dg_t; + +int _aaApply(AA aa, size_t keysize, dg_t dg) +in +{ + assert(aligntsize(keysize) == keysize); +} +body +{ int result; + + //printf("_aaApply(aa = x%llx, keysize = %d, dg = x%llx)\n", aa.a, keysize, dg); + + int treewalker(aaA* e) + { int result; + + do + { + //printf("treewalker(e = %p, dg = x%llx)\n", e, dg); + result = dg(cast(void *)(e + 1) + keysize); + if (result) + break; + if (e.right) + { if (!e.left) + { + e = e.right; + continue; + } + result = treewalker(e.right); + if (result) + break; + } + e = e.left; + } while (e); + + return result; + } + + if (aa.a) + { + foreach (e; aa.a.b) + { + if (e) + { + result = treewalker(e); + if (result) + break; + } + } + } + return result; +} + +// dg is D, but _aaApply2() is C +extern (D) typedef int 
delegate(void *, void *) dg2_t; + +int _aaApply2(AA aa, size_t keysize, dg2_t dg) +in +{ + assert(aligntsize(keysize) == keysize); +} +body +{ int result; + + //printf("_aaApply(aa = x%llx, keysize = %d, dg = x%llx)\n", aa.a, keysize, dg); + + int treewalker(aaA* e) + { int result; + + do + { + //printf("treewalker(e = %p, dg = x%llx)\n", e, dg); + result = dg(cast(void *)(e + 1), cast(void *)(e + 1) + keysize); + if (result) + break; + if (e.right) + { if (!e.left) + { + e = e.right; + continue; + } + result = treewalker(e.right); + if (result) + break; + } + e = e.left; + } while (e); + + return result; + } + + if (aa.a) + { + foreach (e; aa.a.b) + { + if (e) + { + result = treewalker(e); + if (result) + break; + } + } + } + return result; +} + + +/*********************************** + * Construct an associative array of type ti from + * length pairs of key/value pairs. + */ + +extern (C) +BB* _d_assocarrayliteralT(TypeInfo_AssociativeArray ti, size_t length, ...) +{ + auto valuesize = ti.next.tsize(); // value size + auto keyti = ti.key; + auto keysize = keyti.tsize(); // key size + BB* result; + + //printf("_d_assocarrayliteralT(keysize = %d, valuesize = %d, length = %d)\n", keysize, valuesize, length); + //printf("tivalue = %.*s\n", ti.next.classinfo.name); + if (length == 0 || valuesize == 0 || keysize == 0) + { + ; + } + else + { + va_list q; + va_start!(size_t)(q, length); + + result = new BB(); + result.keyti = keyti; + size_t i; + + for (i = 0; i < prime_list.length - 1; i++) + { + if (length <= prime_list[i]) + break; + } + auto len = prime_list[i]; + result.b = new aaA*[len]; + + size_t keystacksize = (keysize + int.sizeof - 1) & ~(int.sizeof - 1); + size_t valuestacksize = (valuesize + int.sizeof - 1) & ~(int.sizeof - 1); + + size_t keytsize = aligntsize(keysize); + + for (size_t j = 0; j < length; j++) + { void* pkey = q; + q += keystacksize; + void* pvalue = q; + q += valuestacksize; + aaA* e; + + auto key_hash = keyti.getHash(pkey); + //printf("hash 
= %d\n", key_hash); + i = key_hash % len; + auto pe = &result.b[i]; + while (1) + { + e = *pe; + if (!e) + { + // Not found, create new elem + //printf("create new one\n"); + e = cast(aaA *) cast(void*) new void[aaA.sizeof + keytsize + valuesize]; + memcpy(e + 1, pkey, keysize); + e.hash = key_hash; + *pe = e; + result.nodes++; + break; + } + if (key_hash == e.hash) + { + auto c = keyti.compare(pkey, e + 1); + if (c == 0) + break; + pe = (c < 0) ? &e.left : &e.right; + } + else + pe = (key_hash < e.hash) ? &e.left : &e.right; + } + memcpy(cast(void *)(e + 1) + keytsize, pvalue, valuesize); + } + + va_end(q); + } + return result; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/adi.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/adi.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,603 @@ +/** + * Implementation of dynamic array property support routines. + * + * Copyright: Copyright Digital Mars 2000 - 2009. + * License: = lo); + } + if (lo == hi) + break; + + debug(adi) printf("\tstridelo = %d, stridehi = %d\n", stridelo, stridehi); + if (stridelo == stridehi) + { + + memcpy(tmp.ptr, lo, stridelo); + memcpy(lo, hi, stridelo); + memcpy(hi, tmp.ptr, stridelo); + lo += stridelo; + hi--; + continue; + } + + /* Shift the whole array. 
This is woefully inefficient + */ + memcpy(tmp.ptr, hi, stridehi); + memcpy(tmplo.ptr, lo, stridelo); + memmove(lo + stridehi, lo + stridelo , (hi - lo) - stridelo); + memcpy(lo, tmp.ptr, stridehi); + memcpy(hi + stridehi - stridelo, tmplo.ptr, stridelo); + + lo += stridehi; + hi = hi - 1 + (stridehi - stridelo); + } + } + return *cast(long*)(&a); +} + +unittest +{ + auto a = "abcd"c[]; + + auto r = a.dup.reverse; + //writefln(r); + assert(r == "dcba"); + + a = "a\u1235\u1234c"; + //writefln(a); + r = a.dup.reverse; + //writefln(r); + assert(r == "c\u1234\u1235a"); + + a = "ab\u1234c"; + //writefln(a); + r = a.dup.reverse; + //writefln(r); + assert(r == "c\u1234ba"); + + a = "\u3026\u2021\u3061\n"; + r = a.dup.reverse; + assert(r == "\n\u3061\u2021\u3026"); +} + + +/********************************************** + * Reverse array of wchars. + * Handled separately because embedded multiword encodings should not be + * reversed. + */ + +extern (C) long _adReverseWchar(wchar[] a) +{ + if (a.length > 1) + { + wchar[2] tmp; + wchar* lo = a.ptr; + wchar* hi = &a[length - 1]; + + while (lo < hi) + { auto clo = *lo; + auto chi = *hi; + + if ((clo < 0xD800 || clo > 0xDFFF) && + (chi < 0xD800 || chi > 0xDFFF)) + { + *lo = chi; + *hi = clo; + lo++; + hi--; + continue; + } + + int stridelo = 1 + (clo >= 0xD800 && clo <= 0xDBFF); + + int stridehi = 1; + if (chi >= 0xDC00 && chi <= 0xDFFF) + { + chi = *--hi; + stridehi++; + assert(hi >= lo); + } + if (lo == hi) + break; + + if (stridelo == stridehi) + { int stmp; + + assert(stridelo == 2); + assert(stmp.sizeof == 2 * (*lo).sizeof); + stmp = *cast(int*)lo; + *cast(int*)lo = *cast(int*)hi; + *cast(int*)hi = stmp; + lo += stridelo; + hi--; + continue; + } + + /* Shift the whole array. 
This is woefully inefficient + */ + memcpy(tmp.ptr, hi, stridehi * wchar.sizeof); + memcpy(hi + stridehi - stridelo, lo, stridelo * wchar.sizeof); + memmove(lo + stridehi, lo + stridelo , (hi - (lo + stridelo)) * wchar.sizeof); + memcpy(lo, tmp.ptr, stridehi * wchar.sizeof); + + lo += stridehi; + hi = hi - 1 + (stridehi - stridelo); + } + } + return *cast(long*)(&a); +} + +unittest +{ + wstring a = "abcd"; + + auto r = a.dup.reverse; + assert(r == "dcba"); + + a = "a\U00012356\U00012346c"; + r = a.dup.reverse; + assert(r == "c\U00012346\U00012356a"); + + a = "ab\U00012345c"; + r = a.dup.reverse; + assert(r == "c\U00012345ba"); +} + + +/********************************************** + * Support for array.reverse property. + */ + +extern (C) long _adReverse(Array a, size_t szelem) +out (result) +{ + assert(result is *cast(long*)(&a)); +} +body +{ + if (a.length >= 2) + { + byte* tmp; + byte[16] buffer; + + void* lo = a.ptr; + void* hi = a.ptr + (a.length - 1) * szelem; + + tmp = buffer.ptr; + if (szelem > 16) + { + //version (Windows) + tmp = cast(byte*) alloca(szelem); + //else + //tmp = gc_malloc(szelem); + } + + for (; lo < hi; lo += szelem, hi -= szelem) + { + memcpy(tmp, lo, szelem); + memcpy(lo, hi, szelem); + memcpy(hi, tmp, szelem); + } + + version (Windows) + { + } + else + { + //if (szelem > 16) + // BUG: bad code is generate for delete pointer, tries + // to call delclass. 
+                //gc_free(tmp);
+        }
+    }
+    return *cast(long*)(&a);
+}
+
+unittest
+{
+    debug(adi) printf("array.reverse.unittest\n");
+
+    int[] a = new int[5];
+    int[] b;
+    size_t i;
+
+    for (i = 0; i < 5; i++)
+        a[i] = i;
+    b = a.reverse;
+    assert(b is a);             // reversed in place, same slice returned
+    for (i = 0; i < 5; i++)
+        assert(a[i] == 4 - i);
+
+    struct X20
+    {   // More than 16 bytes in size
+        int a;
+        int b, c, d, e;
+    }
+
+    X20[] c = new X20[5];
+    X20[] d;
+
+    for (i = 0; i < 5; i++)
+    {   c[i].a = i;
+        c[i].e = 10;
+    }
+    d = c.reverse;
+    assert(d is c);
+    for (i = 0; i < 5; i++)
+    {
+        assert(c[i].a == 4 - i);
+        assert(c[i].e == 10);
+    }
+}
+
+/**********************************************
+ * Sort array of chars.
+ * The array is converted with toUTF32, the resulting dchars are sorted,
+ * and each one is converted back with toUTF8 and written into a in
+ * sorted order. Arrays of length 0 or 1 are left untouched.
+ * Returns:
+ *      The slice a read back through a long pointer.
+ */
+
+extern (C) long _adSortChar(char[] a)
+{
+    if (a.length > 1)
+    {
+        dstring da = toUTF32(a);
+        da.sort;
+        size_t i = 0;           // write position in a
+        foreach (dchar d; da)
+        {   char[4] buf;
+            auto t = toUTF8(buf, d);
+            a[i .. i + t.length] = t[];
+            i += t.length;
+        }
+        delete da;
+    }
+    return *cast(long*)(&a);
+}
+
+/**********************************************
+ * Sort array of wchars.
+ * Same scheme as _adSortChar: convert with toUTF32, sort the dchars, and
+ * write each one back into a with toUTF16.
+ * Returns:
+ *      The slice a read back through a long pointer.
+ */
+
+extern (C) long _adSortWchar(wchar[] a)
+{
+    if (a.length > 1)
+    {
+        dstring da = toUTF32(a);
+        da.sort;
+        size_t i = 0;           // write position in a
+        foreach (dchar d; da)
+        {   wchar[2] buf;
+            auto t = toUTF16(buf, d);
+            a[i .. i + t.length] = t[];
+            i += t.length;
+        }
+        delete da;
+    }
+    return *cast(long*)(&a);
+}
+
+/***************************************
+ * Support for array equality test.
+ * Returns:
+ *      1       equal
+ *      0       not equal
+ */
+
+extern (C) int _adEq(Array a1, Array a2, TypeInfo ti)
+{
+    debug(adi) printf("_adEq(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
+    if (a1.length != a2.length)
+        return 0; // not equal
+    auto sz = ti.tsize();       // used below as the element stride in bytes
+    auto p1 = a1.ptr;
+    auto p2 = a2.ptr;
+
+    if (sz == 1)
+        // We should really have a ti.isPOD() check for this
+        return (memcmp(p1, p2, a1.length) == 0);
+
+    // General case: element-wise comparison through ti.equals.
+    for (size_t i = 0; i < a1.length; i++)
+    {
+        if (!ti.equals(p1 + i * sz, p2 + i * sz))
+            return 0; // not equal
+    }
+    return 1; // equal
+}
+
+// Variant of _adEq: after the length check the two array descriptors
+// themselves (&a1, &a2) are handed to ti.equals. Returns 1 or 0 like _adEq.
+extern (C) int _adEq2(Array a1, Array a2, TypeInfo ti)
+{
+    debug(adi) printf("_adEq2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
+    if (a1.length != a2.length)
+        return 0; // not equal
+    if (!ti.equals(&a1, &a2))
+        return 0;
+    return 1;
+}
+unittest
+{
+    debug(adi) printf("array.Eq unittest\n");
+
+    auto a = "hello"c;
+
+    assert(a != "hel");
+    assert(a != "helloo");
+    assert(a != "betty");
+    assert(a == "hello");
+    assert(a != "hxxxx");
+}
+
+/***************************************
+ * Support for array compare test.
+ * The common prefix of the two arrays is compared first (memcmp when the
+ * element size is 1, ti.compare per element otherwise) and the first
+ * non-zero result is returned. If the prefixes are equal the lengths
+ * decide.
+ * Returns:
+ *      0 when equal; otherwise the non-zero prefix result, or 1 / -1 when
+ *      a1 is longer / shorter than a2.
+ */
+
+extern (C) int _adCmp(Array a1, Array a2, TypeInfo ti)
+{
+    debug(adi) printf("adCmp()\n");
+    auto len = a1.length;       // becomes min(a1.length, a2.length)
+    if (a2.length < len)
+        len = a2.length;
+    auto sz = ti.tsize();       // used below as the element stride in bytes
+    void *p1 = a1.ptr;
+    void *p2 = a2.ptr;
+
+    if (sz == 1)
+    {   // We should really have a ti.isPOD() check for this
+        auto c = memcmp(p1, p2, len);
+        if (c)
+            return c;
+    }
+    else
+    {
+        for (size_t i = 0; i < len; i++)
+        {
+            auto c = ti.compare(p1 + i * sz, p2 + i * sz);
+            if (c)
+                return c;
+        }
+    }
+    if (a1.length == a2.length)
+        return 0;
+    return (a1.length > a2.length) ? 
1 : -1;
+}
+
+// Variant of _adCmp that hands the two array descriptors themselves
+// (&a1, &a2) to ti.compare and returns its result unchanged.
+extern (C) int _adCmp2(Array a1, Array a2, TypeInfo ti)
+{
+    debug(adi) printf("_adCmp2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
+    return ti.compare(&a1, &a2);
+}
+unittest
+{
+    debug(adi) printf("array.Cmp unittest\n");
+
+    auto a = "hello"c;
+
+    assert(a > "hel");
+    assert(a >= "hel");
+    assert(a < "helloo");
+    assert(a <= "helloo");
+    assert(a > "betty");
+    assert(a >= "betty");
+    assert(a == "hello");
+    assert(a <= "hello");
+    assert(a >= "hello");
+}
+
+/***************************************
+ * Support for array compare test.
+ * Version for arrays of one-byte elements: the first
+ * min(a1.length, a2.length) bytes are compared, and if they are all equal
+ * the lengths decide.
+ * Returns:
+ *      A negative value, 0, or a positive value as a1 orders before,
+ *      equal to, or after a2. Only the sign is meaningful: the X86 path
+ *      returns -1/1 for a byte difference and a1.length - a2.length for
+ *      an equal prefix, the portable path returns the memcmp result or
+ *      -1/0/1 for an equal prefix.
+ */
+
+extern (C) int _adCmpChar(Array a1, Array a2)
+{
+  version (X86)
+  {
+    asm
+    {   naked                   ;
+
+        push    EDI             ;
+        push    ESI             ;
+
+        mov    ESI,a1+4[4+ESP]  ;       // ESI, EDI: the two data pointers
+        mov    EDI,a2+4[4+ESP]  ;
+
+        mov    ECX,a1[4+ESP]    ;       // ECX, EDX: the two lengths
+        mov    EDX,a2[4+ESP]    ;
+
+        cmp     ECX,EDX         ;       // ECX = the smaller length
+        jb      GotLength       ;
+
+        mov     ECX,EDX         ;
+
+GotLength:
+        cmp     ECX,4           ;
+        jb      DoBytes         ;
+
+        // Do alignment if neither is dword aligned
+        test    ESI,3           ;
+        jz      Aligned         ;
+
+        test    EDI,3           ;
+        jz      Aligned         ;
+DoAlign:
+        mov     AL,[ESI]        ; //align ESI to dword bounds
+        mov     DL,[EDI]        ;
+
+        cmp     AL,DL           ;
+        jnz     Unequal         ;
+
+        inc     ESI             ;
+        inc     EDI             ;
+
+        test    ESI,3           ;
+
+        lea     ECX,[ECX-1]     ;       // lea leaves the flags of test intact
+        jnz     DoAlign         ;
+Aligned:
+        mov     EAX,ECX         ;       // keep the count for the byte tail
+
+        // do multiple of 4 bytes at a time
+
+        shr     ECX,2           ;
+        jz      TryOdd          ;
+
+        repe                    ;
+        cmpsd                   ;
+
+        jnz     UnequalQuad     ;
+
+TryOdd:
+        mov     ECX,EAX         ;
+DoBytes:
+        // if still equal and not end of string, do up to 3 bytes slightly
+        // slower.
+
+        and     ECX,3           ;
+        jz      Equal           ;
+
+        repe                    ;
+        cmpsb                   ;
+
+        jnz     Unequal         ;
+Equal:
+        mov     EAX,a1[4+ESP]   ;       // equal prefix: a1.length - a2.length
+        mov     EDX,a2[4+ESP]   ;
+
+        sub     EAX,EDX         ;
+        pop     ESI             ;
+
+        pop     EDI             ;
+        ret                     ;
+
+UnequalQuad:
+        // cmpsd has already stepped past the differing dword: reload it
+        // and find the first differing byte, lowest address first.
+        mov     EDX,[EDI-4]     ;
+        mov     EAX,[ESI-4]     ;
+
+        cmp     AL,DL           ;
+        jnz     Unequal         ;
+
+        cmp     AH,DH           ;
+        jnz     Unequal         ;
+
+        shr     EAX,16          ;
+
+        shr     EDX,16          ;
+
+        cmp     AL,DL           ;
+        jnz     Unequal         ;
+
+        cmp     AH,DH           ;
+Unequal:
+        sbb     EAX,EAX         ;       // -1 if the compare set carry, else 0
+        pop     ESI             ;
+
+        or      EAX,1           ;       // result is -1 or 1
+        pop     EDI             ;
+
+        ret                     ;
+    }
+  }
+  else
+  {
+    // Portable fallback. len is a size_t (it used to be an int) so that
+    // lengths above int.max are neither truncated nor turned negative
+    // before they reach memcmp.
+    size_t len;
+    int c;
+
+    debug(adi) printf("adCmpChar()\n");
+    len = a1.length;
+    if (a2.length < len)
+        len = a2.length;
+    c = memcmp(cast(char *)a1.ptr, cast(char *)a2.ptr, len);
+    // Equal prefix: order by length. The lengths are compared directly,
+    // as _adCmp does, because a difference of lengths cast to int can
+    // wrap around and report the wrong sign (or 0).
+    if (!c && a1.length != a2.length)
+        c = (a1.length > a2.length) ? 1 : -1;
+    return c;
+  }
+}
+
+unittest
+{
+    debug(adi) printf("array.CmpChar unittest\n");
+
+    auto a = "hello"c;
+
+    assert(a > "hel");
+    assert(a >= "hel");
+    assert(a < "helloo");
+    assert(a <= "helloo");
+    assert(a > "betty");
+    assert(a >= "betty");
+    assert(a == "hello");
+    assert(a <= "hello");
+    assert(a >= "hello");
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/alloca.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/ldc/alloca.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,135 @@
+/**
+ * Implementation of alloca() standard C routine.
+ *
+ * Copyright: Copyright Digital Mars 1990 - 2009.
+ * License: Boost License 1.0.
+ * Authors: Walter Bright, based on code originally written by Burton Radons
+ *
+ * Copyright Digital Mars 2008 - 2009.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.arraybyte; + +import rt.util.cpuid; + +version (unittest) +{ + private import core.stdc.stdio : printf; + /* This is so unit tests will test every CPU variant + */ + int cpuid; + const int CPUID_MAX = 4; + bool mmx() { return cpuid == 1 && rt.util.cpuid.mmx(); } + bool sse() { return cpuid == 2 && rt.util.cpuid.sse(); } + bool sse2() { return cpuid == 3 && rt.util.cpuid.sse2(); } + bool amd3dnow() { return cpuid == 4 && rt.util.cpuid.amd3dnow(); } +} +else +{ + alias rt.util.cpuid.mmx mmx; + alias rt.util.cpuid.sse sse; + alias rt.util.cpuid.sse2 sse2; + alias rt.util.cpuid.amd3dnow amd3dnow; +} + +//version = log; + +bool disjoint(T)(T[] a, T[] b) +{ + return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); +} + +alias byte T; + +extern (C): + +/* ======================================================================== */ + + +/*********************** + * Computes: + * a[] = b[] + value + */ + +T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b) +{ + return _arraySliceExpAddSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b) +{ + return _arraySliceExpAddSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpAddSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1088% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu 
XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 1000% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startaddmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + paddb MM0, MM4; + paddb MM1, MM4; + paddb MM2, MM4; + paddb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + /* trying to be fair and treat normal 32-bit cpu the same way as we do + * the SIMD units, with unrolled asm. There's not enough registers, + * really. 
+ */ + else + if (a.length >= 4) + { + + auto n = aptr + (a.length & ~3); + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov CL, value; + + align 4; + startadd386: + add ESI, 4; + mov DX, [EAX]; + mov BX, [EAX+2]; + add EAX, 4; + add BL, CL; + add BH, CL; + add DL, CL; + add DH, CL; + mov [ESI -4], DX; + mov [ESI+2 -4], BX; + cmp ESI, EDI; + jb startadd386; + + mov aptr, ESI; + mov bptr, EAX; + } + + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + value); + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + c[] + */ + +T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceAddSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version 
(D_InlineAsm_X86) + { + // SSE2 aligned version is 5739% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + version (log) printf("\tsse2 unaligned\n"); + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startaddlsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddlsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + version (log) printf("\tsse2 aligned\n"); + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startaddlsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + movdqa XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddlsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 4428% faster + if (mmx() && a.length >= 32) + { + version (log) printf("\tmmx\n"); + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startaddlmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + movq MM4, 
[ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + paddb MM0, MM4; + paddb MM1, MM5; + paddb MM2, MM6; + paddb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddlmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + version (log) if (aptr < aend) printf("\tbase\n"); + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value + */ + +T[] _arrayExpSliceAddass_a(T[] a, T value) +{ + return _arrayExpSliceAddass_g(a, value); +} + +T[] _arrayExpSliceAddass_h(T[] a, T value) +{ + return _arrayExpSliceAddass_g(a, value); +} + +T[] _arrayExpSliceAddass_g(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1578% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if 
(((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddasssse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasssse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startaddasssse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM4; + paddb XMM2, XMM4; + paddb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasssse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 1721% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 8; + startaddassmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + paddb MM0, MM4; + paddb MM1, MM4; + paddb MM2, MM4; + paddb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddassmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte 
boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_a(T[] a, T[] b) +{ + return _arraySliceSliceAddass_g(a, b); +} + +T[] _arraySliceSliceAddass_h(T[] a, T[] b) +{ + return _arraySliceSliceAddass_g(a, b); +} + +T[] _arraySliceSliceAddass_g(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4727% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startaddasslsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasslsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startaddasslsse2a: + movdqa XMM0, [ESI]; + movdqa 
XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + movdqa XMM7, [ECX+48]; + add ECX, 64; + paddb XMM0, XMM4; + paddb XMM1, XMM5; + paddb XMM2, XMM6; + paddb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startaddasslsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 3059% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startaddasslmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + movq MM4, [ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + paddb MM0, MM4; + paddb MM1, MM5; + paddb MM2, MM6; + paddb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startaddasslmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_g(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1189% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, 
[EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 1079% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startsubmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + psubb MM0, MM4; + psubb MM1, MM4; + psubb MM2, MM4; + psubb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsubmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm. There's not enough registers, really. + else + if (a.length >= 4) + { + auto n = aptr + (a.length & ~3); + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov CL, value; + + align 4; + startsub386: + add ESI, 4; + mov DX, [EAX]; + mov BX, [EAX+2]; + add EAX, 4; + sub BL, CL; + sub BH, CL; + sub DL, CL; + sub DH, CL; + mov [ESI -4], DX; + mov [ESI+2 -4], BX; + cmp ESI, EDI; + jb startsub386; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - value); + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. 
dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] = b[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_g(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_g(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 8748% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubrsse2u: + add ESI, 64; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + psubb XMM5, XMM0; + psubb XMM6, XMM1; + movdqu [ESI -64], XMM5; + movdqu [ESI+16-64], XMM6; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + psubb XMM5, XMM2; + psubb XMM6, XMM3; + movdqu [ESI+32-64], XMM5; + movdqu [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsubrsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } 
+ else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubrsse2a: + add ESI, 64; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + psubb XMM5, XMM0; + psubb XMM6, XMM1; + movdqa [ESI -64], XMM5; + movdqa [ESI+16-64], XMM6; + movdqa XMM5, XMM4; + movdqa XMM6, XMM4; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + psubb XMM5, XMM2; + psubb XMM6, XMM3; + movdqa [ESI+32-64], XMM5; + movdqa [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsubrsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 7397% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startsubrmmx: + add ESI, 32; + movq MM5, MM4; + movq MM6, MM4; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + psubb MM5, MM0; + psubb MM6, MM1; + movq [ESI -32], MM5; + movq [ESI+8 -32], MM6; + movq MM5, MM4; + movq MM6, MM4; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + psubb MM5, MM2; + psubb MM6, MM3; + movq [ESI+16-32], MM5; + movq [ESI+24-32], MM6; + cmp ESI, EDI; + jb startsubrmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + + } + + while (aptr < aend) + *aptr++ = cast(T)(value - *bptr++); + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] = 6 - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - b[i])) + { + printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_g(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 5756% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startsublsse2u: + add ESI, 64; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + movdqu XMM2, [EAX+32]; + movdqu XMM3, [EAX+48]; + add EAX, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsublsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + 
startsublsse2a: + add ESI, 64; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + movdqa XMM2, [EAX+32]; + movdqa XMM3, [EAX+48]; + add EAX, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + movdqa XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsublsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 4428% faster + if (mmx() && a.length >= 32) + { + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 8; + startsublmmx: + add ESI, 32; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + add EAX, 32; + movq MM4, [ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + psubb MM0, MM4; + psubb MM1, MM5; + psubb MM2, MM6; + psubb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsublmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value + */ + +T[] _arrayExpSliceMinass_a(T[] a, T value) +{ + return _arrayExpSliceMinass_g(a, value); +} + +T[] _arrayExpSliceMinass_h(T[] a, T value) +{ + return _arrayExpSliceMinass_g(a, value); +} + +T[] _arrayExpSliceMinass_g(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1577% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + uint l = cast(ubyte) value; + l |= (l << 8); + l |= (l << 16); + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubasssse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasssse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 8; + startsubasssse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM4; + psubb XMM2, XMM4; + psubb XMM3, XMM4; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa 
[ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasssse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 1577% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + uint l = cast(ubyte) value; + l |= (l << 8); + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 8; + startsubassmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + psubb MM0, MM4; + psubb MM1, MM4; + psubb MM2, MM4; + psubb MM3, MM4; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsubassmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_a(T[] a, T[] b) +{ + return _arraySliceSliceMinass_g(a, b); +} + +T[] _arraySliceSliceMinass_h(T[] a, T[] b) +{ + return _arraySliceSliceMinass_g(a, b); +} + +T[] _arraySliceSliceMinass_g(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_g()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4800% faster + if (sse2() && a.length >= 64) + { + auto n = aptr + (a.length & ~63); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startsubasslsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + movdqu XMM2, [ESI+32]; + movdqu XMM3, [ESI+48]; + add ESI, 64; + movdqu XMM4, [ECX]; + movdqu XMM5, [ECX+16]; + movdqu XMM6, [ECX+32]; + movdqu XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqu [ESI -64], XMM0; + movdqu [ESI+16-64], XMM1; + movdqu [ESI+32-64], XMM2; + movdqu [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasslsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startsubasslsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + movdqa XMM2, [ESI+32]; + movdqa XMM3, [ESI+48]; + add ESI, 64; + movdqa XMM4, [ECX]; + movdqa XMM5, [ECX+16]; + movdqa XMM6, [ECX+32]; + 
movdqa XMM7, [ECX+48]; + add ECX, 64; + psubb XMM0, XMM4; + psubb XMM1, XMM5; + psubb XMM2, XMM6; + psubb XMM3, XMM7; + movdqa [ESI -64], XMM0; + movdqa [ESI+16-64], XMM1; + movdqa [ESI+32-64], XMM2; + movdqa [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsubasslsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 3107% faster + if (mmx() && a.length >= 32) + { + + auto n = aptr + (a.length & ~31); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 8; + startsubasslmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + add ESI, 32; + movq MM4, [ECX]; + movq MM5, [ECX+8]; + movq MM6, [ECX+16]; + movq MM7, [ECX+24]; + add ECX, 32; + psubb MM0, MM4; + psubb MM1, MM5; + psubb MM2, MM6; + psubb MM3, MM7; + movq [ESI -32], MM0; + movq [ESI+8 -32], MM1; + movq [ESI+16-32], MM2; + movq [ESI+24-32], MM3; + cmp ESI, EDI; + jb startsubasslmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinass_g unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/arraycast.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/arraycast.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,94 @@ +/** + * Implementation of array cast support routines. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: = 8) + { + auto n = aptr + (b.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ESI, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM5; + subpd XMM2, XMM6; + subpd XMM3, XMM7; + add ECX, 64; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + while (aptr < aend) + *aptr++ = *bptr++ - *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + value + */ + +T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpAddSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 305% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM4; + addpd XMM2, XMM4; + addpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ + value; + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] + 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] + 6))
+                {
+                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] += value
+ *
+ * Adds value to every element of a in place.
+ *
+ * Params:
+ *      a     = slice that is updated in place
+ *      value = scalar added to each element
+ * Returns:
+ *      a
+ *
+ * When inline x86 asm is available, sse2() reports true and a holds at
+ * least 8 elements, whole groups of 64 bytes are processed with unaligned
+ * packed-double SSE2 instructions; the scalar loop at the end handles the
+ * remaining elements.
+ * NOTE(review): the 8-elements-per-64-bytes grouping assumes T is double,
+ * as the packed-double instructions imply; T is declared outside this view.
+ */
+
+T[] _arrayExpSliceAddass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 114% faster
+        if (sse2() && a.length >= 8)
+        {
+            // End of the last complete 8-element group.  The bound has to
+            // come from the element count: each loop iteration consumes a
+            // full 64 bytes, so a bound derived from the end address would
+            // let the final iteration run past the end of the slice
+            // whenever a.length is not a multiple of 8.
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                addpd XMM0, XMM4;
+                addpd XMM1, XMM4;
+                addpd XMM2, XMM4;
+                addpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;          // scalar loop resumes here
+            }
+        }
+    }
+
+    // Remainder (or everything, when the SSE2 path was not taken)
+    while (aptr < aend)
+        *aptr++ += value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceAddass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j ..
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_d(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 183% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ECX, 64; + addpd XMM0, XMM4; + addpd XMM1, XMM5; + addpd XMM2, XMM6; + addpd XMM3, XMM7; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 305% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + subpd XMM0, XMM4; + subpd XMM1, XMM4; + subpd XMM2, XMM4; + subpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 66% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, value; + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movapd XMM5, XMM4; + movapd XMM6, XMM4; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + subpd XMM5, XMM0; + subpd XMM6, XMM1; + movupd [ESI+ 0-64], XMM5; + movupd [ESI+16-64], XMM6; + movapd XMM5, XMM4; + movapd XMM6, XMM4; + subpd XMM5, XMM2; + subpd XMM6, XMM3; + movupd [ESI+32-64], XMM5; + movupd [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = value - *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = 6 - a[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(6 - a[i]))
+                {
+                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= value
+ *
+ * Subtracts value from every element of a in place.
+ *
+ * Params:
+ *      a     = slice that is updated in place
+ *      value = scalar subtracted from each element
+ * Returns:
+ *      a
+ *
+ * When inline x86 asm is available, sse2() reports true and a holds at
+ * least 8 elements, whole groups of 64 bytes are processed with unaligned
+ * packed-double SSE2 instructions; the scalar loop at the end handles the
+ * remaining elements.
+ * NOTE(review): the 8-elements-per-64-bytes grouping assumes T is double,
+ * as the packed-double instructions imply; T is declared outside this view.
+ */
+
+T[] _arrayExpSliceMinass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 115% faster
+        if (sse2() && a.length >= 8)
+        {
+            // End of the last complete 8-element group.  The bound has to
+            // come from the element count: each loop iteration consumes a
+            // full 64 bytes, so a bound derived from the end address would
+            // let the final iteration run past the end of the slice
+            // whenever a.length is not a multiple of 8.
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM4;
+                subpd XMM2, XMM4;
+                subpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;          // scalar loop resumes here
+            }
+        }
+    }
+
+    // Remainder (or everything, when the SSE2 path was not taken)
+    while (aptr < aend)
+        *aptr++ -= value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMinass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j ..
dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];
+            c[] -= 6;
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - 6))
+                {
+                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] -= b[]
+ *
+ * Subtracts b element-wise from a in place.
+ *
+ * Params:
+ *      a = slice that is updated in place
+ *      b = right operand; must have the same length as a and must not
+ *          overlap it (both checked by the in contract)
+ * Returns:
+ *      a
+ */
+
+T[] _arraySliceSliceMinass_d(T[] a, T[] b)
+in
+{
+    assert (a.length == b.length);
+    assert (disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceSliceMinass_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 183% faster
+        if (sse2() && a.length >= 8)
+        {
+            // End of the last complete 8-element group
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ECX, bptr; // right operand
+                mov ESI, aptr; // destination operand
+                mov EDI, n;    // end comparison
+
+                align 8;
+            startsseloopb:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                movupd XMM4, [ECX];
+                movupd XMM5, [ECX+16];
+                movupd XMM6, [ECX+32];
+                movupd XMM7, [ECX+48];
+                add ECX, 64;
+                subpd XMM0, XMM4;
+                subpd XMM1, XMM5;
+                subpd XMM2, XMM6;
+                subpd XMM3, XMM7;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopb;
+
+                mov aptr, ESI;
+                mov bptr, ECX;
+            }
+        }
+    }
+
+    // Remainder (or everything, when the SSE2 path was not taken)
+    while (aptr < aend)
+        *aptr++ -= *bptr++;
+
+    return a;
+}
+
+unittest
+{
+    // Exercises the slice -= slice form (c[] -= b[]), once with the slices
+    // starting at the allocation (j == 0) and once offset by one element
+    // (j == 1), for every value of the module-level cpuid.
+    printf("_arraySliceSliceMinass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            a[] = c[];                  // keep the original values of c
+            c[] -= b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] - b[i]))
+                {
+                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] = b[] * value
+ */
+
+T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
+in
+{
+    assert(a.length == b.length);
+    assert(disjoint(a, b));
+}
+body
+{
+    //printf("_arraySliceExpMulSliceAssign_d()\n");
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+    auto bptr = b.ptr;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 304% faster
+        if (sse2() && a.length >= 8)
+        {
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov EAX, bptr;
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add ESI, 64;
+                movupd XMM0, [EAX];
+                movupd XMM1, [EAX+16];
+                movupd XMM2, [EAX+32];
+                movupd XMM3, [EAX+48];
+                add EAX, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloop;
+
+                mov aptr, ESI;
+                mov bptr, EAX;
+            }
+        }
+    }
+
+    while (aptr < aend)
+        *aptr++ = *bptr++ * value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arraySliceExpMulSliceAssign_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j ..
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 329% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add EAX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM5; + mulpd XMM2, XMM6; + mulpd XMM3, XMM7; + add ECX, 64; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // 
aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j .. dim + j];
+
+            for (int i = 0; i < dim; i++)
+            {   a[i] = cast(T)i;
+                b[i] = cast(T)(i + 7);
+                c[i] = cast(T)(i * 2);
+            }
+
+            c[] = a[] * b[];
+
+            for (int i = 0; i < dim; i++)
+            {
+                if (c[i] != cast(T)(a[i] * b[i]))
+                {
+                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
+                    assert(0);
+                }
+            }
+        }
+    }
+}
+
+/* ======================================================================== */
+
+/***********************
+ * Computes:
+ *      a[] *= value
+ *
+ * Multiplies every element of a by value in place.
+ *
+ * Params:
+ *      a     = slice that is updated in place
+ *      value = scalar each element is multiplied by
+ * Returns:
+ *      a
+ *
+ * When inline x86 asm is available, sse2() reports true and a holds at
+ * least 8 elements, whole groups of 64 bytes are processed with unaligned
+ * packed-double SSE2 instructions; the scalar loop at the end handles the
+ * remaining elements.
+ * NOTE(review): the 8-elements-per-64-bytes grouping assumes T is double,
+ * as the packed-double instructions imply; T is declared outside this view.
+ */
+
+T[] _arrayExpSliceMulass_d(T[] a, T value)
+{
+    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
+    auto aptr = a.ptr;
+    auto aend = aptr + a.length;
+
+    version (D_InlineAsm_X86)
+    {
+        // SSE2 version is 109% faster
+        if (sse2() && a.length >= 8)
+        {
+            // End of the last complete 8-element group.  The bound has to
+            // come from the element count: each loop iteration consumes a
+            // full 64 bytes, so a bound derived from the end address would
+            // let the final iteration run past the end of the slice
+            // whenever a.length is not a multiple of 8.
+            auto n = aptr + (a.length & ~7);
+
+            // Unaligned case
+            asm
+            {
+                mov ESI, aptr;
+                mov EDI, n;
+                movsd XMM4, value;
+                shufpd XMM4, XMM4, 0;   // broadcast value to both lanes
+
+                align 8;
+            startsseloopa:
+                movupd XMM0, [ESI];
+                movupd XMM1, [ESI+16];
+                movupd XMM2, [ESI+32];
+                movupd XMM3, [ESI+48];
+                add ESI, 64;
+                mulpd XMM0, XMM4;
+                mulpd XMM1, XMM4;
+                mulpd XMM2, XMM4;
+                mulpd XMM3, XMM4;
+                movupd [ESI+ 0-64], XMM0;
+                movupd [ESI+16-64], XMM1;
+                movupd [ESI+32-64], XMM2;
+                movupd [ESI+48-64], XMM3;
+                cmp ESI, EDI;
+                jb startsseloopa;
+
+                mov aptr, ESI;          // scalar loop resumes here
+            }
+        }
+    }
+
+    // Remainder (or everything, when the SSE2 path was not taken)
+    while (aptr < aend)
+        *aptr++ *= value;
+
+    return a;
+}
+
+unittest
+{
+    printf("_arrayExpSliceMulass_d unittest\n");
+    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
+    {
+        version (log) printf(" cpuid %d\n", cpuid);
+
+        for (int j = 0; j < 2; j++)
+        {
+            const int dim = 67;
+            T[] a = new T[dim + j];     // aligned on 16 byte boundary
+            a = a[j .. dim + j];        // misalign for second iteration
+            T[] b = new T[dim + j];
+            b = b[j .. dim + j];
+            T[] c = new T[dim + j];
+            c = c[j ..
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_d(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 version is 205% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + movupd XMM4, [ECX]; + movupd XMM5, [ECX+16]; + movupd XMM6, [ECX+32]; + movupd XMM7, [ECX+48]; + add ECX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM5; + mulpd XMM2, XMM6; + mulpd XMM3, XMM7; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] *= b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] / value + */ + +T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpDivSliceAssign_d()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. + */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE2 version is 299% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movsd XMM4, recip; + //movsd XMM4, value + //rcpsd XMM4, XMM4 + shufpd XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movupd XMM0, [EAX]; + movupd XMM1, [EAX+16]; + movupd XMM2, [EAX+32]; + movupd XMM3, [EAX+48]; + add EAX, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM4; + mulpd XMM2, XMM4; + mulpd XMM3, XMM4; + //divpd XMM0, XMM4; + //divpd XMM1, XMM4; + //divpd XMM2, XMM4; + //divpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + { + *aptr++ = *bptr++ / value; + //*aptr++ = *bptr++ * recip; + } + + return a; +} + +unittest +{ + printf("_arraySliceExpDivSliceAssign_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 
67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] / 8; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]); + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] /= value + */ + +T[] _arrayExpSliceDivass_d(T[] a, T value) +{ + //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. + */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE2 version is 65% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + // Unaligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movsd XMM4, recip; + //movsd XMM4, value + //rcpsd XMM4, XMM4 + shufpd XMM4, XMM4, 0; + + align 8; + startsseloopa: + movupd XMM0, [ESI]; + movupd XMM1, [ESI+16]; + movupd XMM2, [ESI+32]; + movupd XMM3, [ESI+48]; + add ESI, 64; + mulpd XMM0, XMM4; + mulpd XMM1, XMM4; + mulpd XMM2, XMM4; + mulpd XMM3, XMM4; + //divpd XMM0, XMM4; + //divpd XMM1, XMM4; + //divpd XMM2, XMM4; + //divpd XMM3, XMM4; + movupd [ESI+ 0-64], XMM0; + movupd [ESI+16-64], XMM1; + movupd [ESI+32-64], XMM2; + movupd [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= recip; + + return a; +} + + +unittest +{ + printf("_arrayExpSliceDivass_d unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" 
cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] /= 8; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] * value + */ + +T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAddass_d(a, -value, b); +} + +/*********************** + * Computes: + * a[] += b[] * value + */ + +T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + // Handle remainder + while (aptr < aend) + *aptr++ += *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAddass_d unittest\n"); + + cpuid = 1; + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 1; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[] * 6; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + if (c[i] != cast(T)(b[i] + a[i] * 6)) + { + printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/arrayfloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/arrayfloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,2309 @@ +/** + * Contains SSE2 and MMX versions of certain operations for float. + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = 16) + { + version (log) printf("\tsse unaligned\n"); + auto n = aptr + (b.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ESI, 64; + addps XMM0, XMM4; + addps XMM1, XMM5; + addps XMM2, XMM6; + addps XMM3, XMM7; + add ECX, 64; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + // 3DNow! 
version is only 13% faster + if (amd3dnow() && b.length >= 8) + { + version (log) printf("\tamd3dnow\n"); + auto n = aptr + (b.length & ~7); + + asm + { + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + + align 4; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfadd MM0, [ECX]; + pfadd MM1, [ECX+8]; + pfadd MM2, [ECX+16]; + pfadd MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ECX, 32; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + version (log) if (aptr < aend) printf("\tbase\n"); + while (aptr < aend) + *aptr++ = *bptr++ + *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 834% faster + if (sse() && b.length >= 16) + { + auto n = aptr + (b.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ESI, 64; + subps XMM0, XMM4; + subps XMM1, XMM5; + subps XMM2, XMM6; + subps XMM3, XMM7; + add ECX, 64; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + // 3DNow! 
version is only 13% faster + if (amd3dnow() && b.length >= 8) + { + auto n = aptr + (b.length & ~7); + + asm + { + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + + align 4; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfsub MM0, [ECX]; + pfsub MM1, [ECX+8]; + pfsub MM2, [ECX+16]; + pfsub MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ECX, 32; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + // Handle remainder + while (aptr < aend) + *aptr++ = *bptr++ - *cptr++; + + return a; +} + + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + value + */ + +T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpAddSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 665% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + addps XMM0, XMM4; + addps XMM1, XMM4; + addps XMM2, XMM4; + addps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 69% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM4, qword ptr [v]; + + align 8; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfadd MM0, MM4; + pfadd MM1, MM4; + pfadd MM2, MM4; + pfadd MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ + value; + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value (in place; returns a) + */ + +T[] _arrayExpSliceAddass_f(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE version is 302% faster + if (sse() && a.length >= 16) + { + // align pointer: handle leading elements one at a time up to the next 16-byte boundary + auto n = cast(T*)((cast(uint)aptr + 15) & ~15); + while (aptr < n) + *aptr++ += value; + n = aptr + ((aend - aptr) & ~15); + if (aptr < n) + + // Aligned case: n - aptr is a multiple of 16 floats (64 bytes), so the loop ends exactly at n <= aend + asm + { + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloopa: + movaps XMM0, [ESI]; + movaps XMM1, [ESI+16]; + movaps XMM2, [ESI+32]; + movaps XMM3, [ESI+48]; + add ESI, 64; + addps XMM0, XMM4; + addps XMM1, XMM4; + addps XMM2, XMM4; + addps XMM3, XMM4; + movaps [ESI+ 0-64], XMM0; + movaps [ESI+16-64], XMM1; + movaps [ESI+32-64], XMM2; + movaps [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + else + // 3DNow! 
version is 63% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + movq MM4, qword ptr [v]; + + align 8; + start3dnow: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfadd MM0, MM4; + pfadd MM1, MM4; + pfadd MM2, MM4; + pfadd MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov dword ptr [aptr], ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_f(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 468% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [ESI]; + movups XMM1, [ESI+16]; + movups XMM2, [ESI+32]; + movups XMM3, [ESI+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ECX, 64; + addps XMM0, XMM4; + addps XMM1, XMM5; + addps XMM2, XMM6; + addps XMM3, XMM7; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + // 3DNow! 
version is 57% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov ECX, dword ptr [bptr]; // right operand + + align 4; + start3dnow: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfadd MM0, [ECX]; + pfadd MM1, [ECX+8]; + pfadd MM2, [ECX+16]; + pfadd MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add ECX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] += b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 622% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + subps XMM0, XMM4; + subps XMM1, XMM4; + subps XMM2, XMM4; + subps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 67% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + T[2] w; + + w[0] = w[1] = value; + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + mov EAX, dword ptr [bptr]; + movq MM4, qword ptr [w]; + + align 8; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfsub MM0, MM4; + pfsub MM1, MM4; + pfsub MM2, MM4; + pfsub MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 690% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movaps XMM5, XMM4; + movaps XMM6, XMM4; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + subps XMM5, XMM0; + subps XMM6, XMM1; + movups [ESI+ 0-64], XMM5; + movups [ESI+16-64], XMM6; + movaps XMM5, XMM4; + movaps XMM6, XMM4; + subps XMM5, XMM2; + subps XMM6, XMM3; + movups [ESI+32-64], XMM5; + movups [ESI+48-64], XMM6; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 67% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM4, qword ptr [v]; + + align 8; + start3dnow: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfsubr MM0, MM4; + pfsubr MM1, MM4; + pfsubr MM2, MM4; + pfsubr MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start3dnow; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = value - *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = 6 - a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - a[i])) + { + printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value (in place; returns a) + */ + +T[] _arrayExpSliceMinass_f(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE version is 304% faster + if (sse() && a.length >= 16) + { + // align pointer: handle leading elements one at a time up to the next 16-byte boundary + auto n = cast(T*)((cast(uint)aptr + 15) & ~15); + while (aptr < n) + *aptr++ -= value; + n = aptr + ((aend - aptr) & ~15); + if (aptr < n) + + // Aligned case: n - aptr is a multiple of 16 floats (64 bytes), so the loop ends exactly at n <= aend + asm + { + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloopa: + movaps XMM0, [ESI]; + movaps XMM1, [ESI+16]; + movaps XMM2, [ESI+32]; + movaps XMM3, [ESI+48]; + add ESI, 64; + subps XMM0, XMM4; + subps XMM1, XMM4; + subps XMM2, XMM4; + subps XMM3, XMM4; + movaps [ESI+ 0-64], XMM0; + movaps [ESI+16-64], XMM1; + movaps [ESI+32-64], XMM2; + movaps [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + else + // 3DNow! 
version is 63% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + movq MM4, qword ptr [v]; + + align 8; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfsub MM0, MM4; + pfsub MM1, MM4; + pfsub MM2, MM4; + pfsub MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + cmp ESI, EDI; + jb start; + + emms; + mov dword ptr [aptr], ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceminass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_f(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 468% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [ESI]; + movups XMM1, [ESI+16]; + movups XMM2, [ESI+32]; + movups XMM3, [ESI+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ECX, 64; + subps XMM0, XMM4; + subps XMM1, XMM5; + subps XMM2, XMM6; + subps XMM3, XMM7; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + // 3DNow! 
version is 57% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov ECX, dword ptr [bptr]; // right operand + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfsub MM0, [ECX]; + pfsub MM1, [ECX+8]; + pfsub MM2, [ECX+16]; + pfsub MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add ECX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] -= b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * value + */ + +T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMulSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 607% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM4; + mulps XMM2, XMM4; + mulps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! 
version is 69% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + mov EAX, dword ptr [bptr]; + movq MM4, qword ptr [v]; + + align 8; + start: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfmul MM0, MM4; + pfmul MM1, MM4; + pfmul MM2, MM4; + pfmul MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 833% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; // left operand + mov ECX, cptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add EAX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM5; + mulps XMM2, XMM6; + mulps XMM3, XMM7; + add ECX, 64; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + // 3DNow! 
version is only 13% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov EAX, dword ptr [bptr]; // left operand + mov ECX, dword ptr [cptr]; // right operand + + align 4; + start: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfmul MM0, [ECX]; + pfmul MM1, [ECX+8]; + pfmul MM2, [ECX+16]; + pfmul MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ECX, 32; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= value + */ + +T[] _arrayExpSliceMulass_f(T[] a, T value) +{ + //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE version is 303% faster + if (sse() && a.length >= 16) + { + // align pointer + auto n = cast(T*)((cast(uint)aptr + 15) & ~15); + while (aptr < n) + *aptr++ *= value; + n = cast(T*)((cast(uint)aend) & ~15); + if (aptr < n) + + // Aligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movss XMM4, value; + shufps XMM4, XMM4, 0; + + align 8; + startsseloopa: + movaps XMM0, [ESI]; + movaps XMM1, [ESI+16]; + movaps XMM2, [ESI+32]; + movaps XMM3, [ESI+48]; + add ESI, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM4; + mulps XMM2, XMM4; + mulps XMM3, XMM4; + movaps [ESI+ 0-64], XMM0; + movaps [ESI+16-64], XMM1; + movaps [ESI+32-64], XMM2; + movaps [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + else + // 3DNow! 
version is 63% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + ulong w = *cast(uint *) &value; + ulong v = w | (w << 32L); + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + movq MM4, qword ptr [v]; + + align 8; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfmul MM0, MM4; + pfmul MM1, MM4; + pfmul MM2, MM4; + pfmul MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + cmp ESI, EDI; + jb start; + + emms; + mov dword ptr [aptr], ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMulass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_f(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE version is 525% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov ECX, bptr; // right operand + mov ESI, aptr; // destination operand + mov EDI, n; // end comparison + + align 8; + startsseloopb: + movups XMM0, [ESI]; + movups XMM1, [ESI+16]; + movups XMM2, [ESI+32]; + movups XMM3, [ESI+48]; + add ESI, 64; + movups XMM4, [ECX]; + movups XMM5, [ECX+16]; + movups XMM6, [ECX+32]; + movups XMM7, [ECX+48]; + add ECX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM5; + mulps XMM2, XMM6; + mulps XMM3, XMM7; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopb; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + // 3DNow! 
version is 57% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, dword ptr [aptr]; // destination operand + mov EDI, dword ptr [n]; // end comparison + mov ECX, dword ptr [bptr]; // right operand + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfmul MM0, [ECX]; + pfmul MM1, [ECX+8]; + pfmul MM2, [ECX+16]; + pfmul MM3, [ECX+24]; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add ECX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], ECX; + } + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +} + +unittest // checks a[] *= b[] for each cpuid level, on aligned and misaligned slices +{ + printf("_arraySliceSliceMulass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; // keep the original values of c so the product can be verified + c[] *= b[]; // slice *= slice: the operation documented for _arraySliceSliceMulass_f + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] / value + */ + +T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpDivSliceAssign_f()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. 
+ */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE version is 587% faster + if (sse() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + // Unaligned case + asm + { + mov EAX, bptr; + mov ESI, aptr; + mov EDI, n; + movss XMM4, recip; + //movss XMM4, value + //rcpss XMM4, XMM4 + shufps XMM4, XMM4, 0; + + align 8; + startsseloop: + add ESI, 64; + movups XMM0, [EAX]; + movups XMM1, [EAX+16]; + movups XMM2, [EAX+32]; + movups XMM3, [EAX+48]; + add EAX, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM4; + mulps XMM2, XMM4; + mulps XMM3, XMM4; + //divps XMM0, XMM4; + //divps XMM1, XMM4; + //divps XMM2, XMM4; + //divps XMM3, XMM4; + movups [ESI+ 0-64], XMM0; + movups [ESI+16-64], XMM1; + movups [ESI+32-64], XMM2; + movups [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloop; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + // 3DNow! version is 72% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + T[2] w = void; + + w[0] = recip; + w[1] = recip; + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + mov EAX, dword ptr [bptr]; + movq MM4, qword ptr [w]; + + align 8; + start: + movq MM0, [EAX]; + movq MM1, [EAX+8]; + movq MM2, [EAX+16]; + movq MM3, [EAX+24]; + pfmul MM0, MM4; + pfmul MM1, MM4; + pfmul MM2, MM4; + pfmul MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + add EAX, 32; + cmp ESI, EDI; + jb start; + + emms; + mov dword ptr [aptr], ESI; + mov dword ptr [bptr], EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * recip; + + return a; +} + +unittest +{ + printf("_arraySliceExpDivSliceAssign_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. 
dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] / 8; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] /= value + */ + +T[] _arrayExpSliceDivass_f(T[] a, T value) +{ + //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + /* Multiplying by the reciprocal is faster, but does + * not produce as accurate an answer. + */ + T recip = cast(T)1 / value; + + version (D_InlineAsm_X86) + { + // SSE version is 245% faster + if (sse() && a.length >= 16) + { + // align pointer + auto n = cast(T*)((cast(uint)aptr + 15) & ~15); + while (aptr < n) + *aptr++ *= recip; + n = cast(T*)((cast(uint)aend) & ~15); + if (aptr < n) + + // Aligned case + asm + { + mov ESI, aptr; + mov EDI, n; + movss XMM4, recip; + //movss XMM4, value + //rcpss XMM4, XMM4 + shufps XMM4, XMM4, 0; + + align 8; + startsseloopa: + movaps XMM0, [ESI]; + movaps XMM1, [ESI+16]; + movaps XMM2, [ESI+32]; + movaps XMM3, [ESI+48]; + add ESI, 64; + mulps XMM0, XMM4; + mulps XMM1, XMM4; + mulps XMM2, XMM4; + mulps XMM3, XMM4; + //divps XMM0, XMM4; + //divps XMM1, XMM4; + //divps XMM2, XMM4; + //divps XMM3, XMM4; + movaps [ESI+ 0-64], XMM0; + movaps [ESI+16-64], XMM1; + movaps [ESI+32-64], XMM2; + movaps [ESI+48-64], XMM3; + cmp ESI, EDI; + jb startsseloopa; + + mov aptr, ESI; + } + } + else + // 3DNow! 
version is 57% faster + if (amd3dnow() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + T[2] w = void; + + w[0] = w[1] = recip; + + asm + { + mov ESI, dword ptr [aptr]; + mov EDI, dword ptr [n]; + movq MM4, qword ptr [w]; + + align 8; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + movq MM2, [ESI+16]; + movq MM3, [ESI+24]; + pfmul MM0, MM4; + pfmul MM1, MM4; + pfmul MM2, MM4; + pfmul MM3, MM4; + movq [ESI], MM0; + movq [ESI+8], MM1; + movq [ESI+16], MM2; + movq [ESI+24], MM3; + add ESI, 32; + cmp ESI, EDI; + jb start; + + emms; + mov dword ptr [aptr], ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= recip; + + return a; +} + +unittest +{ + printf("_arrayExpSliceDivass_f unittest\n"); + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + c[] /= 8; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] / 8)) + { + printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] * value + */ + +T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAddass_f(a, -value, b); +} + +/*********************** + * Computes: + * a[] += b[] * value + */ + +T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + // Handle remainder + while (aptr < aend) + *aptr++ += *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAddass_f unittest\n"); + + cpuid = 1; + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 1; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[] * 6; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + if (c[i] != cast(T)(b[i] + a[i] * 6)) + { + printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/arrayint.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/arrayint.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,2430 @@ +/** + * Contains MMX versions of certain operations for dchar, int, and uint ('w', + * 'i' and 'k' suffixes). + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 298% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM2, l; + + align 4; + startmmx: + add ESI, 
16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + paddd MM0, MM2; + paddd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov EDX, value; + + align 4; + start386: + add ESI, 8; + mov EBX, [EAX]; + mov ECX, [EAX+4]; + add EAX, 8; + add EBX, EDX; + add ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ + value; + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + c[] + */ + +T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceAddSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1710% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + paddd XMM0, 
XMM2; + paddd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 995% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + paddd MM0, MM2; + paddd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + +normal: + while (aptr < aend) + *aptr++ = *bptr++ + *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value + */ + +T[] _arrayExpSliceAddass_w(T[] a, T value) +{ + return _arrayExpSliceAddass_i(a, value); +} + +T[] _arrayExpSliceAddass_k(T[] a, T value) +{ + return _arrayExpSliceAddass_i(a, value); +} + +T[] _arrayExpSliceAddass_i(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 83% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 81% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + movq MM2, l; + + 
align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + paddd MM0, MM2; + paddd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EDX, value; + + align 4; + start386: + mov EBX, [ESI]; + mov ECX, [ESI+4]; + add ESI, 8; + add EBX, EDX; + add ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] += 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_w(T[] a, T[] b) +{ + return _arraySliceSliceAddass_i(a, b); +} + +T[] _arraySliceSliceAddass_k(T[] a, T[] b) +{ + return _arraySliceSliceAddass_i(a, b); +} + +T[] _arraySliceSliceAddass_i(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 695% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + paddd XMM0, XMM2; + paddd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 471% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); 
+ + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + add ESI, 16; + add ECX, 16; + paddd MM0, MM2; + paddd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + +normal: + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] + a[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 400% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if 
(((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 315% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM2, l; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + psubd MM0, MM2; + psubd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov EDX, value; + + align 4; + start386: + add ESI, 8; + mov EBX, [EAX]; + mov ECX, [EAX+4]; + add EAX, 8; + sub EBX, EDX; + sub ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", 
cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_i(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_i(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1812% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, XMM4, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM2, [EAX]; + movdqu XMM3, [EAX+16]; + movdqa XMM0, XMM4; + movdqa XMM1, XMM4; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM4, l; + pshufd XMM4, 
XMM4, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM2, [EAX]; + movdqa XMM3, [EAX+16]; + movdqa XMM0, XMM4; + movdqa XMM1, XMM4; + add EAX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 1077% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM4, l; + + align 4; + startmmx: + add ESI, 16; + movq MM2, [EAX]; + movq MM3, [EAX+8]; + movq MM0, MM4; + movq MM1, MM4; + add EAX, 16; + psubd MM0, MM2; + psubd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = value - *bptr++; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = 6 - a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - a[i])) + { + printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1721% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqa [ESI -32], XMM0; + 
movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 1002% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + psubd MM0, MM2; + psubd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ - *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value + */ + +T[] _arrayExpSliceMinass_w(T[] a, T value) +{ + return _arrayExpSliceMinass_i(a, value); +} + +T[] _arrayExpSliceMinass_k(T[] a, T value) +{ + return _arrayExpSliceMinass_i(a, value); +} + +T[] _arrayExpSliceMinass_i(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 81% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 81% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + movq MM2, l; + + 
align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + psubd MM0, MM2; + psubd MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + else + if (a.length >= 2) + { + auto n = aptr + (a.length & ~1); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EDX, value; + + align 4; + start386: + mov EBX, [ESI]; + mov ECX, [ESI+4]; + add ESI, 8; + sub EBX, EDX; + sub ECX, EDX; + mov [ESI -8], EBX; + mov [ESI+4-8], ECX; + cmp ESI, EDI; + jb start386; + + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_w(T[] a, T[] b) +{ + return _arraySliceSliceMinass_i(a, b); +} + +T[] _arraySliceSliceMinass_k(T[] a, T[] b) +{ + return _arraySliceSliceMinass_i(a, b); +} + +T[] _arraySliceSliceMinass_i(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 731% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + psubd XMM0, XMM2; + psubd XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 441% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); 
+ + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + add ESI, 16; + add ECX, 16; + psubd MM0, MM2; + psubd MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] -= a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] - a[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * value + */ + +T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_i(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMulSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (none) // multiplying a pair is not supported by MMX + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1380% faster + if (sse2() && a.length >= 8) + { + 
auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + { + // MMX version is 1380% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movq MM2, l; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + pmuludq MM0, MM2; // only multiplies low 32 bits + pmuludq MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * value; + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]); + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_i(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (none) + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 1407% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + 
add EAX, 32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 1029% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + pmuludq MM0, MM2; + pmuludq MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + } + + while (aptr < aend) + *aptr++ = *bptr++ * *cptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= value + */ + +T[] _arrayExpSliceMulass_w(T[] a, T value) +{ + return _arrayExpSliceMulass_i(a, value); +} + +T[] _arrayExpSliceMulass_k(T[] a, T value) +{ + return _arrayExpSliceMulass_i(a, value); +} + +T[] _arrayExpSliceMulass_i(T[] a, T value) +{ + //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (none) + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 400% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = value; + + if (((cast(uint) aptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 402% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); + + asm + { + mov ESI, aptr; + mov EDI, n; + movq MM2, l; + + align 4; + 
startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + pmuludq MM0, MM2; + pmuludq MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + } + + while (aptr < aend) + *aptr++ *= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMulass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_w(T[] a, T[] b) +{ + return _arraySliceSliceMulass_i(a, b); +} + +T[] _arraySliceSliceMulass_k(T[] a, T[] b) +{ + return _arraySliceSliceMulass_i(a, b); +} + +T[] _arraySliceSliceMulass_i(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_i()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (none) + { + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 873% faster + if (sse2() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 
32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + pmuludq XMM0, XMM2; + pmuludq XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } +/+ BUG: comment out this section until we figure out what is going + wrong with the invalid pshufd instructions. + + else + // MMX version is 573% faster + if (mmx() && a.length >= 4) + { + auto n = aptr + (a.length & ~3); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + pxor MM4, MM4; + pxor MM5, MM5; + punpckldq MM4, MM0; + punpckldq MM5, MM2; + add ESI, 16; + add ECX, 16; + pmuludq MM4, MM5; + pshufd MM4, MM4, 8; // ? + movq [ESI -16], MM4; + pxor MM4, MM4; + pxor MM5, MM5; + punpckldq MM4, MM1; + punpckldq MM5, MM3; + pmuludq MM4, MM5; + pshufd MM4, MM4, 8; // ? + movq [ESI+8-16], MM4; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } ++/ + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulass_i unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= c[]; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * c[i])) + { + printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/arrayreal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/arrayreal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,241 @@ +/** + * Contains SSE2 and MMX versions of certain operations for real. + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 3343% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + paddw MM0, MM2; + paddw MM1, 
MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + value); + + return a; +} + +unittest +{ + printf("_arraySliceExpAddSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] + c[] + */ + +T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceAddSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceAddSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 3777% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov 
ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + movdqu XMM2, [ECX]; + movdqu XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 2068% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + paddw MM0, MM2; + paddw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ + *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] + b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] + b[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += value + */ + +T[] _arrayExpSliceAddass_u(T[] a, T value) +{ + return _arrayExpSliceAddass_s(a, value); +} + +T[] _arrayExpSliceAddass_t(T[] a, T value) +{ + return _arrayExpSliceAddass_s(a, value); +} + +T[] _arrayExpSliceAddass_s(T[] a, T value) +{ + //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 832% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 826% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM2, l; + pshufw 
MM2, MM2, 0; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + paddw MM0, MM2; + paddw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ += value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceAddass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] += 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] + 6)) + { + printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] += b[] + */ + +T[] _arraySliceSliceAddass_u(T[] a, T[] b) +{ + return _arraySliceSliceAddass_s(a, b); +} + +T[] _arraySliceSliceAddass_t(T[] a, T[] b) +{ + return _arraySliceSliceAddass_s(a, b); +} + +T[] _arraySliceSliceAddass_s(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceAddass_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2085% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + movdqu XMM2, [ECX]; + 
movdqu XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + paddw XMM0, XMM2; + paddw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 1022% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + paddw MM0, MM2; + paddw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ += *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceAddass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] += a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] + a[i])) + { + printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - value + */ + +T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b) +{ + return _arraySliceExpMinSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMinSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 3695% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov 
aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 3049% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + psubw MM0, MM2; + psubw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - value); + + return a; +} + +unittest +{ + printf("_arraySliceExpMinSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = value - b[] + */ + +T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_s(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value) +{ + return _arrayExpSliceMinSliceAssign_s(a, b, value); +} + +T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arrayExpSliceMinSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4995% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + + align 4; + startaddsse2u: + movd XMM2, l; + pshufd XMM2, XMM2, 0; + movd XMM3, l; + pshufd XMM3, XMM3, 0; + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + psubw XMM2, XMM0; + psubw XMM3, XMM1; + movdqu [ESI -32], XMM2; + movdqu [ESI+16-32], XMM3; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + + align 4; + startaddsse2a: + movd XMM2, l; + pshufd XMM2, XMM2, 0; + movd XMM3, l; + pshufd XMM3, XMM3, 0; + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + psubw XMM2, XMM0; + psubw XMM3, XMM1; + movdqa [ESI -32], XMM2; + movdqa 
[ESI+16-32], XMM3; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else + // MMX version is 4562% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM4, l; + pshufw MM4, MM4, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM2, [EAX]; + movq MM3, [EAX+8]; + movq MM0, MM4; + movq MM1, MM4; + add EAX, 16; + psubw MM0, MM2; + psubw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(value - *bptr++); + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = 6 - a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(6 - a[i])) + { + printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] - c[] + */ + +T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMinSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 4129% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + movdqu XMM2, [ECX]; + movdqu XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqa [ESI -32], XMM0; 
+ movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 2018% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + psubw MM0, MM2; + psubw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ - *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] - b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] - b[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= value + */ + +T[] _arrayExpSliceMinass_u(T[] a, T value) +{ + return _arrayExpSliceMinass_s(a, value); +} + +T[] _arrayExpSliceMinass_t(T[] a, T value) +{ + return _arrayExpSliceMinass_s(a, value); +} + +T[] _arrayExpSliceMinass_s(T[] a, T value) +{ + //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 835% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= (l << 16); + + if (((cast(uint) aptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2u; + + mov aptr, ESI; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startaddsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startaddsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 835% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM2, l; + pshufw 
MM2, MM2, 0; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + psubw MM0, MM2; + psubw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ -= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMinass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + a[] = c[]; + a[] -= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(c[i] - 6)) + { + printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] -= b[] + */ + +T[] _arraySliceSliceMinass_u(T[] a, T[] b) +{ + return _arraySliceSliceMinass_s(a, b); +} + +T[] _arraySliceSliceMinass_t(T[] a, T[] b) +{ + return _arraySliceSliceMinass_s(a, b); +} + +T[] _arraySliceSliceMinass_s(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMinass_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2121% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm // unaligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + movdqu XMM2, [ECX]; + 
movdqu XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm // aligned case + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + movdqa XMM2, [ECX]; + movdqa XMM3, [ECX+16]; + add ECX, 32; + psubw XMM0, XMM2; + psubw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 1116% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + start: + movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + movq MM2, [ECX]; + movq MM3, [ECX+8]; + add ECX, 16; + psubw MM0, MM2; + psubw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb start; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ -= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMinass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = c[]; + c[] -= a[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(b[i] - a[i])) + { + printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * value + */ + +T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b) +{ + return _arraySliceExpMulSliceAssign_s(a, value, b); +} + +T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b) +in +{ + assert(a.length == b.length); + assert(disjoint(a, b)); +} +body +{ + //printf("_arraySliceExpMulSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 3733% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= l << 16; + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM1, [EAX+16]; + add EAX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM1, [EAX+16]; + add EAX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + } + } + } + else 
+ // MMX version is 3733% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM1, [EAX+8]; + add EAX, 16; + pmullw MM0, MM2; + pmullw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ * value); + + return a; +} + +unittest +{ + printf("_arraySliceExpMulSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * 6; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] = b[] * c[] + */ + +T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b) +{ + return _arraySliceSliceMulSliceAssign_s(a, c, b); +} + +T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b) +in +{ + assert(a.length == b.length && b.length == c.length); + assert(disjoint(a, b)); + assert(disjoint(a, c)); + assert(disjoint(b, c)); +} +body +{ + //printf("_arraySliceSliceMulSliceAssign_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + auto cptr = c.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2515% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2u: + add ESI, 32; + movdqu XMM0, [EAX]; + movdqu XMM2, [ECX]; + movdqu XMM1, [EAX+16]; + movdqu XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startsse2a: + add ESI, 32; + movdqa XMM0, [EAX]; + movdqa XMM2, [ECX]; + movdqa XMM1, [EAX+16]; + movdqa XMM3, [ECX+16]; + add EAX, 32; + add ECX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM3; + 
movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + else + // MMX version is 2515% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov EAX, bptr; + mov ECX, cptr; + + align 4; + startmmx: + add ESI, 16; + movq MM0, [EAX]; + movq MM2, [ECX]; + movq MM1, [EAX+8]; + movq MM3, [ECX+8]; + add EAX, 16; + add ECX, 16; + pmullw MM0, MM2; + pmullw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, EAX; + mov cptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ = cast(T)(*bptr++ * *cptr++); + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulSliceAssign_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + c[] = a[] * b[]; + + for (int i = 0; i < dim; i++) + { + if (c[i] != cast(T)(a[i] * b[i])) + { + printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= value + */ + +T[] _arrayExpSliceMulass_u(T[] a, T value) +{ + return _arrayExpSliceMulass_s(a, value); +} + +T[] _arrayExpSliceMulass_t(T[] a, T value) +{ + return _arrayExpSliceMulass_s(a, value); +} + +T[] _arrayExpSliceMulass_s(T[] a, T value) +{ + //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); + auto aptr = a.ptr; + auto aend = aptr + a.length; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2044% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + uint l = cast(ushort) value; + l |= l << 16; + + if (((cast(uint) aptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM1, [ESI+16]; + add ESI, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + movd XMM2, l; + pshufd XMM2, XMM2, 0; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM1, [ESI+16]; + add ESI, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM2; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + } + } + } + else + // MMX version is 2056% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + uint l = cast(ushort) value; + + asm + { + mov ESI, aptr; + mov EDI, n; + movd MM2, l; + pshufw MM2, MM2, 0; + + align 4; + startmmx: + 
movq MM0, [ESI]; + movq MM1, [ESI+8]; + add ESI, 16; + pmullw MM0, MM2; + pmullw MM1, MM2; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + } + } + } + + while (aptr < aend) + *aptr++ *= value; + + return a; +} + +unittest +{ + printf("_arrayExpSliceMulass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= 6; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * 6)) + { + printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); + assert(0); + } + } + } + } +} + + +/* ======================================================================== */ + +/*********************** + * Computes: + * a[] *= b[] + */ + +T[] _arraySliceSliceMulass_u(T[] a, T[] b) +{ + return _arraySliceSliceMulass_s(a, b); +} + +T[] _arraySliceSliceMulass_t(T[] a, T[] b) +{ + return _arraySliceSliceMulass_s(a, b); +} + +T[] _arraySliceSliceMulass_s(T[] a, T[] b) +in +{ + assert (a.length == b.length); + assert (disjoint(a, b)); +} +body +{ + //printf("_arraySliceSliceMulass_s()\n"); + auto aptr = a.ptr; + auto aend = aptr + a.length; + auto bptr = b.ptr; + + version (D_InlineAsm_X86) + { + // SSE2 aligned version is 2519% faster + if (sse2() && a.length >= 16) + { + auto n = aptr + (a.length & ~15); + + if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2u: + movdqu XMM0, [ESI]; + movdqu XMM2, [ECX]; + movdqu XMM1, [ESI+16]; + movdqu XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + pmullw XMM0, 
XMM2; + pmullw XMM1, XMM3; + movdqu [ESI -32], XMM0; + movdqu [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2u; + + mov aptr, ESI; + mov bptr, ECX; + } + } + else + { + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startsse2a: + movdqa XMM0, [ESI]; + movdqa XMM2, [ECX]; + movdqa XMM1, [ESI+16]; + movdqa XMM3, [ECX+16]; + add ESI, 32; + add ECX, 32; + pmullw XMM0, XMM2; + pmullw XMM1, XMM3; + movdqa [ESI -32], XMM0; + movdqa [ESI+16-32], XMM1; + cmp ESI, EDI; + jb startsse2a; + + mov aptr, ESI; + mov bptr, ECX; + } + } + } + else + // MMX version is 1712% faster + if (mmx() && a.length >= 8) + { + auto n = aptr + (a.length & ~7); + + asm + { + mov ESI, aptr; + mov EDI, n; + mov ECX, bptr; + + align 4; + startmmx: + movq MM0, [ESI]; + movq MM2, [ECX]; + movq MM1, [ESI+8]; + movq MM3, [ECX+8]; + add ESI, 16; + add ECX, 16; + pmullw MM0, MM2; + pmullw MM1, MM3; + movq [ESI -16], MM0; + movq [ESI+8-16], MM1; + cmp ESI, EDI; + jb startmmx; + + emms; + mov aptr, ESI; + mov bptr, ECX; + } + } + } + + while (aptr < aend) + *aptr++ *= *bptr++; + + return a; +} + +unittest +{ + printf("_arraySliceSliceMulass_s unittest\n"); + + for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) + { + version (log) printf(" cpuid %d\n", cpuid); + + for (int j = 0; j < 2; j++) + { + const int dim = 67; + T[] a = new T[dim + j]; // aligned on 16 byte boundary + a = a[j .. dim + j]; // misalign for second iteration + T[] b = new T[dim + j]; + b = b[j .. dim + j]; + T[] c = new T[dim + j]; + c = c[j .. 
dim + j]; + + for (int i = 0; i < dim; i++) + { a[i] = cast(T)i; + b[i] = cast(T)(i + 7); + c[i] = cast(T)(i * 2); + } + + b[] = a[]; + a[] *= c[]; + + for (int i = 0; i < dim; i++) + { + if (a[i] != cast(T)(b[i] * c[i])) + { + printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); + assert(0); + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/cast_.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/cast_.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,169 @@ +/** + * Implementation of array assignment support routines. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright, Sean Kelly + * + * Copyright Digital Mars 2000 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.cover; + +private +{ + version( Windows ) + import core.sys.windows.windows; + else version( Posix ) + { + import core.sys.posix.fcntl; + import core.sys.posix.unistd; + } + import core.bitop; + import core.stdc.stdio; + import rt.util.utf; + + struct BitArray + { + size_t len; + uint* ptr; + + bool opIndex( size_t i ) + in + { + assert( i < len ); + } + body + { + return cast(bool) bt( ptr, i ); + } + } + + struct Cover + { + string filename; + BitArray valid; + uint[] data; + } + + __gshared + { + Cover[] gdata; + string srcpath; + string dstpath; + bool merge; + } +} + + +/** + * Set path to where source files are located. + * + * Params: + * pathname = The new path name. + */ +extern (C) void dmd_coverSourcePath( string pathname ) +{ + srcpath = pathname; +} + + +/** + * Set path to where listing files are to be written. + * + * Params: + * pathname = The new path name. + */ +extern (C) void dmd_coverDestPath( string pathname ) +{ + dstpath = pathname; +} + + +/** + * Set merge mode. 
+ *
+ * Params:
+ *  flag = true means new data is summed with existing data in the listing
+ *         file; false means a new listing file is always created.
+ */
+extern (C) void dmd_coverSetMerge( bool flag )
+{
+    merge = flag;
+}
+
+
+/**
+ * The coverage callback: appends one module's coverage tables to gdata so
+ * that the module destructor can write the listing for it.
+ *
+ * Params:
+ *  filename = The name of the coverage file.
+ *  valid    = One bit per source line. A line whose counter is zero is
+ *             listed as "0000000" when its bit is set and is left blank
+ *             otherwise.
+ *  data     = Per-line execution counters, indexed like the source lines.
+ */
+extern (C) void _d_cover_register( string filename, BitArray valid, uint[] data )
+{
+    Cover c;
+
+    c.filename = filename;
+    c.valid = valid;
+    c.data = data;
+    gdata ~= c;
+}
+
+
+/**
+ * Writes one ".lst" listing per registered module.
+ *
+ * For each entry of gdata the source file is read from srcpath, the counters
+ * are optionally merged with the counts found in an existing listing, and
+ * every source line is written prefixed by its count, followed by a summary
+ * line with the coverage percentage.
+ *
+ * NOTE(review): dstpath is not consulted here; the listing is opened by base
+ * name, i.e. relative to the current directory.
+ */
+static ~this()
+{
+    const NUMLINES = 16384 - 1;
+    const NUMCHARS = 16384 * 16 - 1;
+
+    char[] srcbuf = new char[NUMCHARS];
+    char[][] srclines = new char[][NUMLINES];
+    char[] lstbuf = new char[NUMCHARS];
+    char[][] lstlines = new char[][NUMLINES];
+
+    foreach( Cover c; gdata )
+    {
+        if( !readFile( appendFN( srcpath, c.filename ), srcbuf ) )
+            continue;
+        splitLines( srcbuf, srclines );
+
+        // Merge only when an earlier listing can actually be read. A missing
+        // listing (e.g. the first run in merge mode) used to `break` out of
+        // the module loop, so no listing was written for this module or for
+        // any module after it.
+        if( merge && readFile( addExt( baseName( c.filename ), "lst" ), lstbuf ) )
+        {
+            splitLines( lstbuf, lstlines );
+
+            for( size_t i = 0; i < lstlines.length; ++i )
+            {
+                if( i >= c.data.length )
+                    break;
+
+                int count = 0;
+
+                // The count is the leading run of blanks and digits. Parsing
+                // must stop at the first other character (normally '|');
+                // a plain `break` would only leave the switch and digits in
+                // the source text would be folded into the count.
+              Lcount:
+                foreach( char c2; lstlines[i] )
+                {
+                    switch( c2 )
+                    {
+                    case ' ':
+                        continue;
+                    case '0': case '1': case '2': case '3': case '4':
+                    case '5': case '6': case '7': case '8': case '9':
+                        count = count * 10 + c2 - '0';
+                        continue;
+                    default:
+                        break Lcount;
+                    }
+                }
+                c.data[i] += count;
+            }
+        }
+
+        // addExt() appends the extension verbatim, so the embedded "\0"
+        // makes the result usable as a C string.
+        FILE* flst = fopen( (addExt( baseName( c.filename ), "lst\0" )).ptr, "wb" );
+
+        if( !flst )
+            continue; //throw new Exception( "Error opening file for write: " ~ lstfn );
+
+        uint nno;
+        uint nyes;
+
+        for( size_t i = 0; i < c.data.length; i++ )
+        {
+            if( i < srclines.length )
+            {
+                uint n = c.data[i];
+                char[] line = srclines[i];
+
+                line = expandTabs( line );
+
+                if( n == 0 )
+                {
+                    if( c.valid[i] )
+                    {
+                        nno++;
+                        fprintf( flst, "0000000|%.*s\n", line );
+                    }
+                    else
+                    {
+                        // Blank count column, padded to the same 7 characters
+                        // as "0000000|" and "%7u|" so the '|' stays aligned.
+                        fprintf( flst, "       |%.*s\n", line );
+                    }
+                }
+                else
+                {
+                    nyes++;
+                    fprintf( flst, "%7u|%.*s\n", n, line );
+                }
+            }
+        }
+        if( nyes + nno ) // no divide by 0 bugs
+        {
+            fprintf( flst, "%.*s is %d%% covered\n", c.filename, ( nyes * 100 ) / ( nyes + nno ) );
+        }
+        fclose( flst );
+    }
+}
+
+
+/**
+ * Joins a directory and a file name.
+ *
+ * Params:
+ *  path = Directory part; may be null or empty.
+ *  name = File name to append.
+ *
+ * Returns:
+ *  path, a separator if path is non-empty and does not already end in one,
+ *  then name.
+ */
+string appendFN( string path, string name )
+{
+    version( Windows )
+        const char sep = '\\';
+    else
+        const char sep = '/';
+
+    auto dest = path;
+
+    // Test the length, not the pointer: an empty but non-null path would
+    // otherwise be indexed at dest[$ - 1].
+    if( dest.length && dest[$ - 1] != sep )
+        dest ~= sep;
+    dest ~= name;
+    return dest;
+}
+
+
+/**
+ * Returns name without its leading directory part and, if ext is given and
+ * name ends with it, without that suffix.
+ */
+string baseName( string name, string ext = null )
+{
+    auto i = name.length;
+    for( ; i > 0; --i )
+    {
+        version( Windows )
+        {
+            if( name[i - 1] == ':' || name[i - 1] == '\\' )
+                break;
+        }
+        else version( Posix )
+        {
+            if( name[i - 1] == '/' )
+                break;
+        }
+    }
+    return chomp( name[i .. $], ext ? ext : "" );
+}
+
+
+/**
+ * Returns the text after the last '.' of the file-name part of name, or
+ * null if a path separator is reached before any '.' is found.
+ */
+string getExt( string name )
+{
+    auto i = name.length;
+
+    while( i > 0 )
+    {
+        if( name[i - 1] == '.' )
+            return name[i .. $];
+        --i;
+        version( Windows )
+        {
+            if( name[i] == ':' || name[i] == '\\' )
+                break;
+        }
+        else version( Posix )
+        {
+            if( name[i] == '/' )
+                break;
+        }
+    }
+    return null;
+}
+
+
+/**
+ * Returns name with its extension replaced by ext, or with "." ~ ext
+ * appended when it has none (a trailing '.' is reused).
+ */
+string addExt( string name, string ext )
+{
+    auto existing = getExt( name );
+
+    if( existing.length == 0 )
+    {
+        if( name.length && name[$ - 1] == '.' )
+            name ~= ext;
+        else
+            name = name ~ "." ~ ext;
+    }
+    else
+    {
+        name = name[0 .. $ - existing.length] ~ ext;
+    }
+    return name;
+}
+
+
+/**
+ * Removes a trailing delimiter from str.
+ *
+ * Params:
+ *  str   = The string to trim.
+ *  delim = Suffix to remove; when null, one trailing "\r", "\n" or "\r\n"
+ *          is removed instead.
+ *
+ * Returns:
+ *  A slice of str without the trailing delimiter, or str itself when it
+ *  does not end with one.
+ */
+string chomp( string str, string delim = null )
+{
+    if( delim is null )
+    {
+        auto len = str.length;
+
+        if( len )
+        {
+            auto c = str[len - 1];
+
+            if( c == '\r' )
+                --len;
+            else if( c == '\n' )
+            {
+                --len;
+                // Guard the look-behind: for the one-character string "\n"
+                // there is nothing in front of the newline to inspect.
+                if( len && str[len - 1] == '\r' )
+                    --len;
+            }
+        }
+        return str[0 .. len];
+    }
+    else if( str.length >= delim.length )
+    {
+        if( str[$ - delim.length .. $] == delim )
+            return str[0 .. $ - delim.length];
+    }
+    return str;
+}
+
+
+/**
+ * Reads the whole file name into buf.
+ *
+ * Params:
+ *  name = Path of the file to read.
+ *  buf  = Receives the contents; it is resized as needed and trimmed to the
+ *         number of bytes read on success.
+ *
+ * Returns:
+ *  true on success, false if the file cannot be opened or a read fails.
+ */
+bool readFile( string name, inout char[] buf )
+{
+    version( Windows )
+    {
+        auto wnamez = toUTF16z( name );
+        HANDLE file = CreateFileW( wnamez,
+                                   GENERIC_READ,
+                                   FILE_SHARE_READ,
+                                   null,
+                                   OPEN_EXISTING,
+                                   FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN,
+                                   cast(HANDLE) null );
+
+        delete wnamez;
+        if( file == INVALID_HANDLE_VALUE )
+            return false;
+        scope( exit ) CloseHandle( file );
+
+        DWORD num = 0;
+        DWORD pos = 0;
+
+        buf.length = 4096;
+        while( true )
+        {
+            if( !ReadFile( file, &buf[pos], cast(DWORD)( buf.length - pos ), &num, null ) )
+                return false;
+            if( !num )
+                break;
+            pos += num;
+            // Keep as much free room as there is data already read.
+            buf.length = pos * 2;
+        }
+        buf.length = pos;
+        return true;
+    }
+    else version( Posix )
+    {
+        char[] namez = new char[name.length + 1];
+        namez[0 .. name.length] = name;
+        namez[$ - 1] = 0;
+        int file = open( namez.ptr, O_RDONLY );
+
+        delete namez;
+        if( file == -1 )
+            return false;
+        scope( exit ) close( file );
+
+        int num = 0;
+        uint pos = 0;
+
+        buf.length = 4096;
+        while( true )
+        {
+            num = read( file, &buf[pos], cast(uint)( buf.length - pos ) );
+            if( num == -1 )
+                return false;
+            if( !num )
+                break;
+            pos += num;
+            // Keep as much free room as there is data already read.
+            buf.length = pos * 2;
+        }
+        buf.length = pos;
+        return true;
+    }
+}
+
+
+/**
+ * Splits buf into lines at "\r", "\n" or "\r\n".
+ *
+ * Params:
+ *  buf   = The text to split.
+ *  lines = Receives slices of buf, one per line, without the terminators.
+ *          A final line without a terminator is included.
+ */
+void splitLines( char[] buf, inout char[][] lines )
+{
+    size_t beg = 0,
+           pos = 0;
+
+    lines.length = 0;
+    for( ; pos < buf.length; ++pos )
+    {
+        char c = buf[pos];
+
+        if( c == '\r' || c == '\n' )
+        {
+            lines ~= buf[beg .. pos];
+            beg = pos + 1;
+            // Treat "\r\n" as a single terminator.
+            if( c == '\r' && pos < buf.length - 1 && buf[pos + 1] == '\n' )
+            {
+                ++pos;
+                ++beg;
+            }
+        }
+    }
+    if( beg != pos )
+    {
+        lines ~= buf[beg .. pos];
+    }
+}
+
+
+/**
+ * Replaces each tab in str by spaces up to the next multiple of tabsize.
+ *
+ * Returns:
+ *  str itself when it contains no tab, otherwise a newly built array.
+ */
+char[] expandTabs( char[] str, int tabsize = 8 )
+{
+    const dchar LS = '\u2028'; // UTF line separator
+    const dchar PS = '\u2029'; // UTF paragraph separator
+
+    bool changes = false;
+    char[] result = str;
+    int column;
+    int nspaces;
+
+    foreach( size_t i, dchar c; str )
+    {
+        switch( c )
+        {
+            case '\t':
+                nspaces = tabsize - (column % tabsize);
+                if( !changes )
+                {
+                    // First tab: start a private copy holding the text seen
+                    // so far followed by the padding.
+                    changes = true;
+                    result = null;
+                    result.length = str.length + nspaces - 1;
+                    result.length = i + nspaces;
+                    result[0 .. i] = str[0 .. i];
+                    result[i .. i + nspaces] = ' ';
+                }
+                else
+                {   auto j = result.length;
+                    result.length = j + nspaces;
+                    result[j .. j + nspaces] = ' ';
+                }
+                column += nspaces;
+                break;
+
+            case '\r':
+            case '\n':
+            case PS:
+            case LS:
+                column = 0;
+                goto L1;
+
+            default:
+                column++;
+            L1:
+                if (changes)
+                {
+                    if (c <= 0x7F)
+                        result ~= c;
+                    else
+                        encode(result, c);
+                }
+                break;
+        }
+    }
+    return result;
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/critical.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/ldc/critical.c Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,173 @@
+/**
+ * Implementation of support routines for synchronized blocks.
+ *
+ * Copyright: Copyright Digital Mars 2000 - 2009.
+ * License: next) + { + pthread_mutex_lock(&critical_section.cs); + if (!dcs->next) // if, in the meantime, another thread didn't set it + { + dcs->next = dcs_list; + dcs_list = dcs; + pthread_mutex_init(&dcs->cs, &_criticals_attr); + } + pthread_mutex_unlock(&critical_section.cs); + } + pthread_mutex_lock(&dcs->cs); +} + +void _d_criticalexit(D_CRITICAL_SECTION *dcs) +{ + //printf("_d_criticalexit(dcs = x%x)\n", dcs); + pthread_mutex_unlock(&dcs->cs); +} + +void _STI_critical_init() +{ + if (!dcs_list) + { //printf("_STI_critical_init()\n"); + pthread_mutexattr_init(&_criticals_attr); + pthread_mutexattr_settype(&_criticals_attr, PTHREAD_MUTEX_RECURSIVE); + + // The global critical section doesn't need to be recursive + pthread_mutex_init(&critical_section.cs, 0); + dcs_list = &critical_section; + } +} + +void _STD_critical_term() +{ + if (dcs_list) + { //printf("_STI_critical_term()\n"); + while (dcs_list) + { + //printf("\tlooping... %x\n", dcs_list); + pthread_mutex_destroy(&dcs_list->cs); + dcs_list = dcs_list->next; + } + } +} + +#endif + diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/deh.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/deh.c Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,724 @@ +/** + * Implementation of exception handling support routines for Windows. + * + * Copyright: Copyright Digital Mars 1999 - 2009. 
+ * License: ExceptionFlags & EXCEPTION_UNWIND) + { + // Call all the finally blocks in this frame + _d_local_unwind(handler_table, frame, -1); + } + else + { + // Jump to catch block if matching one is found + + int ndx,prev_ndx,i; + struct DHandlerInfo *phi; + struct DCatchInfo *pci; + struct DCatchBlock *pcb; + unsigned ncatches; // number of catches in the current handler + Object *pti; + ClassInfo *ci; + + ci = NULL; // only compute it if we need it + + // walk through handler table, checking each handler + // with an index smaller than the current table_index + for (ndx = frame->table_index; ndx != -1; ndx = prev_ndx) + { + phi = &handler_table->handler_info[ndx]; + prev_ndx = phi->prev_index; + if (phi->cioffset) + { + // this is a catch handler (no finally) + pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); + ncatches = pci->ncatches; + for (i = 0; i < ncatches; i++) + { + pcb = &pci->catch_block[i]; + + if (!ci) + { + // This code must match the translation code + if (exception_record->ExceptionCode == STATUS_DIGITAL_MARS_D_EXCEPTION) + { + //printf("ei[0] = %p\n", exception_record->ExceptionInformation[0]); + ci = **(ClassInfo ***)(exception_record->ExceptionInformation[0]); + } + else + ci = &_Class_9Throwable; + } + + if (_d_isbaseof(ci, pcb->type)) + { + // Matched the catch type, so we've found the handler. + int regebp; + + pti = _d_translate_se_to_d_exception(exception_record); + + // Initialize catch variable + regebp = (int)&frame->ebp; // EBP for this frame + *(void **)(regebp + (pcb->bpoffset)) = pti; + + // Have system call all finally blocks in intervening frames + _global_unwind(frame, exception_record); + + // Call all the finally blocks skipped in this frame + _d_local_unwind(handler_table, frame, ndx); + + frame->table_index = prev_ndx; // we are out of this handler + + // Jump to catch block. Does not return. 
+ { + unsigned catch_esp; + fp_t catch_addr; + + catch_addr = (fp_t)(pcb->code); + catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); + _asm + { + mov EAX,catch_esp + mov ECX,catch_addr + mov [EAX],ECX + mov EBP,regebp + mov ESP,EAX // reset stack + ret // jump to catch block + } + } + } + } + } + } + } + return ExceptionContinueSearch; +} + +/*********************************** + * Exception filter for use in __try..__except block + * surrounding call to Dmain() + */ + +int _d_exception_filter(struct _EXCEPTION_POINTERS *eptrs, + int retval, + Object **exception_object) +{ + *exception_object = _d_translate_se_to_d_exception(eptrs->ExceptionRecord); + return retval; +} + +/*********************************** + * Throw a D object. + */ + +void __stdcall _d_throw(Object *h) +{ + //printf("_d_throw(h = %p, &h = %p)\n", h, &h); + //printf("\tvptr = %p\n", *(void **)h); + RaiseException(STATUS_DIGITAL_MARS_D_EXCEPTION, + EXCEPTION_NONCONTINUABLE, + 1, (DWORD *)&h); +} + +/*********************************** + * Create an exception object + */ + +Object *_d_create_exception_object(ClassInfo *ci, char *msg) +{ + Throwable *exc; + + exc = (Throwable *)_d_newclass(ci); + // BUG: what if _d_newclass() throws an out of memory exception? + + if (msg) + { + exc->msglen = strlen(msg); + exc->msg = msg; + } + return (Object *)exc; +} + +/*********************************** + * Converts a Windows Structured Exception code to a D Exception Object. 
+ */ + +Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record) +{ + Object *pti; + + switch (exception_record->ExceptionCode) { + case STATUS_DIGITAL_MARS_D_EXCEPTION: + // Generated D exception + pti = (Object *)(exception_record->ExceptionInformation[0]); + break; + + case STATUS_INTEGER_DIVIDE_BY_ZERO: + pti = _d_create_exception_object(&_Class_5Error, "Integer Divide by Zero"); + break; + + case STATUS_FLOAT_DIVIDE_BY_ZERO: + pti = _d_create_exception_object(&_Class_5Error, "Float Divide by Zero"); + break; + + case STATUS_ACCESS_VIOLATION: + pti = _d_create_exception_object(&_Class_5Error, "Access Violation"); + break; + + case STATUS_STACK_OVERFLOW: + pti = _d_create_exception_object(&_Class_5Error, "Stack Overflow"); + break; + + case STATUS_DATATYPE_MISALIGNMENT: + pti = _d_create_exception_object(&_Class_5Error, "Datatype Misalignment"); + break; + + case STATUS_ARRAY_BOUNDS_EXCEEDED: + pti = _d_create_exception_object(&_Class_5Error, "Array Bounds Exceeded"); + break; + + case STATUS_FLOAT_INVALID_OPERATION: + pti = _d_create_exception_object(&_Class_5Error, "Invalid Floating Point Operation"); + break; + + case STATUS_FLOAT_DENORMAL_OPERAND: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Denormal Operand"); + break; + + case STATUS_FLOAT_INEXACT_RESULT: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Inexact Result"); + break; + + case STATUS_FLOAT_OVERFLOW: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Overflow"); + break; + + case STATUS_FLOAT_UNDERFLOW: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Underflow"); + break; + + case STATUS_FLOAT_STACK_CHECK: + pti = _d_create_exception_object(&_Class_5Error, "Floating Point Stack Check"); + break; + + case STATUS_PRIVILEGED_INSTRUCTION: + pti = _d_create_exception_object(&_Class_5Error, "Privileged Instruction"); + break; + + case STATUS_ILLEGAL_INSTRUCTION: + pti = 
_d_create_exception_object(&_Class_5Error, "Illegal Instruction"); + break; + + case STATUS_BREAKPOINT: + pti = _d_create_exception_object(&_Class_5Error, "Breakpoint"); + break; + + case STATUS_IN_PAGE_ERROR: + pti = _d_create_exception_object(&_Class_5Error, "Win32 In Page Exception"); + break; +/* + case STATUS_INTEGER_OVERFLOW: // not supported on any x86 processor + case STATUS_INVALID_DISPOSITION: + case STATUS_NONCONTINUABLE_EXCEPTION: + case STATUS_SINGLE_STEP: + case DBG_CONTROL_C: // only when a debugger is attached + // In DMC, but not in Microsoft docs + case STATUS_GUARD_PAGE_VIOLATION: + case STATUS_INVALID_HANDLE: +*/ + // convert all other exception codes into a Win32Exception + default: + pti = _d_create_exception_object(&_Class_5Error, "Win32 Exception"); + break; + } + + return pti; +} + +/************************************** + * Call finally blocks in the current stack frame until stop_index. + * This is roughly equivalent to _local_unwind() for C in \src\win32\ehsup.c + */ + +void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, + struct DEstablisherFrame *frame, int stop_index) +{ + struct DHandlerInfo *phi; + struct DCatchInfo *pci; + int i; + + // Set up a special exception handler to catch double-fault exceptions. + __asm + { + push dword ptr -1 + push dword ptr 0 + push offset _local_except_handler // defined in src\win32\ehsup.c + push dword ptr fs:_except_list + mov FS:_except_list,ESP + } + + for (i = frame->table_index; i != -1 && i != stop_index; i = phi->prev_index) + { + phi = &handler_table->handler_info[i]; + if (phi->finally_code) + { + // Note that it is unnecessary to adjust the ESP, as the finally block + // accesses all items on the stack as relative to EBP. 
+ + DWORD *catch_ebp = &frame->ebp; + void *blockaddr = phi->finally_code; + + _asm + { + push EBX + mov EBX,blockaddr + push EBP + mov EBP,catch_ebp + call EBX + pop EBP + pop EBX + } + } + } + + _asm + { + pop FS:_except_list + add ESP,12 + } +} + +/*********************************** + * external version of the unwinder + */ + +__declspec(naked) void __cdecl _d_local_unwind2() +{ + __asm + { + jmp _d_local_unwind + } +} + +/*********************************** + * The frame handler, this is called for each frame that has been registered + * in the OS except_list. + * Input: + * EAX the handler table for the frame + */ + +EXCEPTION_DISPOSITION _d_monitor_handler( + EXCEPTION_RECORD *exception_record, + struct DEstablisherFrame *frame, + CONTEXT context, + void *dispatcher_context) +{ + if (exception_record->ExceptionFlags & EXCEPTION_UNWIND) + { + _d_monitorexit((Object *)frame->table_index); + } + else + { + } + return ExceptionContinueSearch; +} + +/*********************************** + */ + +void _d_monitor_prolog(void *x, void *y, Object *h) +{ + __asm + { + push EAX + } + //printf("_d_monitor_prolog(x=%p, y=%p, h=%p)\n", x, y, h); + _d_monitorenter(h); + __asm + { + pop EAX + } +} + +/*********************************** + */ + +void _d_monitor_epilog(void *x, void *y, Object *h) +{ + //printf("_d_monitor_epilog(x=%p, y=%p, h=%p)\n", x, y, h); + __asm + { + push EAX + push EDX + } + _d_monitorexit(h); + __asm + { + pop EDX + pop EAX + } +} + +#endif + +/* ======================== linux =============================== */ + +#if linux + +#include "mars.h" + +extern ClassInfo D6object9Throwable7__ClassZ; +#define _Class_9Throwable D6object9Throwable7__ClassZ; + +extern ClassInfo D6object5Error7__ClassZ; +#define _Class_5Error D6object5Error7__ClassZ + +typedef int (*fp_t)(); // function pointer in ambient memory model + +struct DHandlerInfo +{ + unsigned offset; // offset from function address to start of guarded section + int prev_index; // previous table index 
+ unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) + void *finally_code; // pointer to finally code to execute + // (!=0 if try-finally) +}; + +// Address of DHandlerTable, searched for by eh_finddata() + +struct DHandlerTable +{ + void *fptr; // pointer to start of function + unsigned espoffset; // offset of ESP from EBP + unsigned retoffset; // offset from start of function to return code + unsigned nhandlers; // dimension of handler_info[] + struct DHandlerInfo handler_info[1]; +}; + +struct DCatchBlock +{ + ClassInfo *type; // catch type + unsigned bpoffset; // EBP offset of catch var + void *code; // catch handler code +}; + +// Create one of these for each try-catch +struct DCatchInfo +{ + unsigned ncatches; // number of catch blocks + struct DCatchBlock catch_block[1]; // data for each catch block +}; + +// One of these is generated for each function with try-catch or try-finally + +struct FuncTable +{ + void *fptr; // pointer to start of function + struct DHandlerTable *handlertable; // eh data for this function + unsigned size; // size of function in bytes +}; + +extern struct FuncTable *table_start; +extern struct FuncTable *table_end; + +void terminate() +{ +// _asm +// { +// hlt +// } +} + +/******************************************* + * Given address that is inside a function, + * figure out which function it is in. + * Return DHandlerTable if there is one, NULL if not. + */ + +struct DHandlerTable *__eh_finddata(void *address) +{ + struct FuncTable *ft; + + for (ft = (struct FuncTable *)table_start; + ft < (struct FuncTable *)table_end; + ft++) + { + if (ft->fptr <= address && + address < (void *)((char *)ft->fptr + ft->size)) + { + return ft->handlertable; + } + } + return NULL; +} + + +/****************************** + * Given EBP, find return address to caller, and caller's EBP. 
+ * Input: + * regbp Value of EBP for current function + * *pretaddr Return address + * Output: + * *pretaddr return address to caller + * Returns: + * caller's EBP + */ + +unsigned __eh_find_caller(unsigned regbp, unsigned *pretaddr) +{ + unsigned bp = *(unsigned *)regbp; + + if (bp) // if not end of call chain + { + // Perform sanity checks on new EBP. + // If it is screwed up, terminate() hopefully before we do more damage. + if (bp <= regbp) + // stack should grow to smaller values + terminate(); + + *pretaddr = *(unsigned *)(regbp + sizeof(int)); + } + return bp; +} + +/*********************************** + * Throw a D object. + */ + +void __stdcall _d_throw(Object *h) +{ + unsigned regebp; + + //printf("_d_throw(h = %p, &h = %p)\n", h, &h); + //printf("\tvptr = %p\n", *(void **)h); + + regebp = _EBP; + + while (1) // for each function on the stack + { + struct DHandlerTable *handler_table; + struct FuncTable *pfunc; + struct DHandlerInfo *phi; + unsigned retaddr; + unsigned funcoffset; + unsigned spoff; + unsigned retoffset; + int index; + int dim; + int ndx; + int prev_ndx; + + regebp = __eh_find_caller(regebp,&retaddr); + if (!regebp) + // if end of call chain + break; + + handler_table = __eh_finddata((void *)retaddr); // find static data associated with function + if (!handler_table) // if no static data + { + continue; + } + funcoffset = (unsigned)handler_table->fptr; + spoff = handler_table->espoffset; + retoffset = handler_table->retoffset; + +#ifdef DEBUG + printf("retaddr = x%x\n",(unsigned)retaddr); + printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n", + regebp,funcoffset,spoff,retoffset); +#endif + + // Find start index for retaddr in static data + dim = handler_table->nhandlers; + index = -1; + for (int i = 0; i < dim; i++) + { + phi = &handler_table->handler_info[i]; + + if ((unsigned)retaddr >= funcoffset + phi->offset) + index = i; + } + + // walk through handler table, checking each handler + // with an index smaller than the 
current table_index + for (ndx = index; ndx != -1; ndx = prev_ndx) + { + phi = &handler_table->handler_info[ndx]; + prev_ndx = phi->prev_index; + if (phi->cioffset) + { + // this is a catch handler (no finally) + struct DCatchInfo *pci; + int ncatches; + int i; + + pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); + ncatches = pci->ncatches; + for (i = 0; i < ncatches; i++) + { + struct DCatchBlock *pcb; + ClassInfo *ci = **(ClassInfo ***)h; + + pcb = &pci->catch_block[i]; + + if (_d_isbaseof(ci, pcb->type)) + { // Matched the catch type, so we've found the handler. + + // Initialize catch variable + *(void **)(regebp + (pcb->bpoffset)) = h; + + // Jump to catch block. Does not return. + { + unsigned catch_esp; + fp_t catch_addr; + + catch_addr = (fp_t)(pcb->code); + catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); + _asm + { + mov EAX,catch_esp + mov ECX,catch_addr + mov [EAX],ECX + mov EBP,regebp + mov ESP,EAX // reset stack + ret // jump to catch block + } + } + } + } + } + else if (phi->finally_code) + { // Call finally block + // Note that it is unnecessary to adjust the ESP, as the finally block + // accesses all items on the stack as relative to EBP. + + void *blockaddr = phi->finally_code; + + _asm + { + push EBX + mov EBX,blockaddr + push EBP + mov EBP,regebp + call EBX + pop EBP + pop EBX + } + } + } + } +} + + +#endif diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/deh2.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/deh2.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,316 @@ +/** + * Implementation of exception handling support routines for Posix. + * + * Copyright: Copyright Digital Mars 2000 - 2009. 
+ * License: funcoffset + phi.offset && + cast(uint)retaddr <= funcoffset + phi.endoffset) + index = i; + } + debug printf("index = %d\n", index); + + // walk through handler table, checking each handler + // with an index smaller than the current table_index + for (ndx = index; ndx != -1; ndx = prev_ndx) + { + phi = &handler_table.handler_info[ndx]; + prev_ndx = phi.prev_index; + if (phi.cioffset) + { + // this is a catch handler (no finally) + DCatchInfo *pci; + int ncatches; + int i; + + pci = cast(DCatchInfo *)(cast(char *)handler_table + phi.cioffset); + ncatches = pci.ncatches; + for (i = 0; i < ncatches; i++) + { + DCatchBlock *pcb; + ClassInfo ci = **cast(ClassInfo **)h; + + pcb = &pci.catch_block[i]; + + if (_d_isbaseof(ci, pcb.type)) + { // Matched the catch type, so we've found the handler. + + // Initialize catch variable + *cast(void **)(regebp + (pcb.bpoffset)) = h; + + // Jump to catch block. Does not return. + { + uint catch_esp; + fp_t catch_addr; + + catch_addr = cast(fp_t)(pcb.code); + catch_esp = regebp - handler_table.espoffset - fp_t.sizeof; + asm + { + mov EAX,catch_esp ; + mov ECX,catch_addr ; + mov [EAX],ECX ; + mov EBP,regebp ; + mov ESP,EAX ; // reset stack + ret ; // jump to catch block + } + } + } + } + } + else if (phi.finally_code) + { // Call finally block + // Note that it is unnecessary to adjust the ESP, as the finally block + // accesses all items on the stack as relative to EBP. 
+ + void *blockaddr = phi.finally_code; + + version (OSX) + { + asm + { + sub ESP,4 ; // align stack to 16 + push EBX ; + mov EBX,blockaddr ; + push EBP ; + mov EBP,regebp ; + call EBX ; + pop EBP ; + pop EBX ; + add ESP,4 ; + } + } + else + { + asm + { + push EBX ; + mov EBX,blockaddr ; + push EBP ; + mov EBP,regebp ; + call EBX ; + pop EBP ; + pop EBX ; + } + } + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/dmain2.BAK --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/dmain2.BAK Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,374 @@ +/* + * Placed into the Public Domain. + * written by Walter Bright + * www.digitalmars.com + */ + +/* + * Modified by Sean Kelly for use with the D Runtime Project + */ + +module rt.dmain2; + +private +{ + import memory; + import util.console; + import core.stdc.stddef; + import core.stdc.stdlib; + import core.stdc.string; +} + +version (Windows) +{ + extern (Windows) alias int function() FARPROC; + extern (Windows) FARPROC GetProcAddress(void*, in char*); + extern (Windows) void* LoadLibraryA(in char*); + extern (Windows) int FreeLibrary(void*); + extern (Windows) void* LocalFree(void*); + extern (Windows) wchar_t* GetCommandLineW(); + extern (Windows) wchar_t** CommandLineToArgvW(wchar_t*, int*); + extern (Windows) export int WideCharToMultiByte(uint, uint, wchar_t*, int, char*, int, char*, int); + pragma(lib, "shell32.lib"); // needed for CommandLineToArgvW +} + +extern (C) void _STI_monitor_staticctor(); +extern (C) void _STD_monitor_staticdtor(); +extern (C) void _STI_critical_init(); +extern (C) void _STD_critical_term(); +extern (C) void gc_init(); +extern (C) void gc_term(); +extern (C) void _minit(); +extern (C) void _moduleCtor(); +extern (C) void _moduleDtor(); +extern (C) void thread_joinAll(); + +/*********************************** + * These are a temporary means of providing a GC hook for DLL use. They may be + * replaced with some other similar functionality later. 
+ */
+extern (C)
+{
+    void* gc_getProxy();
+    void gc_setProxy(void* p);
+    void gc_clrProxy();
+
+    alias void* function() gcGetFn;
+    alias void function(void*) gcSetFn;
+    alias void function() gcClrFn;
+}
+
+/**
+ * Loads a library and, if it exports gc_setProxy, hands it this runtime's
+ * GC proxy.
+ *
+ * Params:
+ *  name = Name of the library, passed to LoadLibraryA.
+ *
+ * Returns:
+ *  The handle returned by LoadLibraryA, or null if loading failed or name
+ *  (plus its terminator) does not fit the 260-byte name buffer.
+ *
+ * Throws:
+ *  Exception on linux, where loading is not implemented.
+ */
+extern (C) void* rt_loadLibrary(in char[] name)
+{
+    version (Windows)
+    {
+        char[260] temp = void;
+
+        // One byte is reserved for the terminator. Without this check a
+        // longer name was copied past the end of the fixed buffer.
+        if (name.length >= temp.length)
+            return null;
+        temp[0 .. name.length] = name[];
+        temp[name.length] = cast(char) 0;
+        void* ptr = LoadLibraryA(temp.ptr);
+        if (ptr is null)
+            return ptr;
+        gcSetFn gcSet = cast(gcSetFn) GetProcAddress(ptr, "gc_setProxy");
+        if (gcSet !is null)
+            gcSet(gc_getProxy());
+        return ptr;
+
+    }
+    else version (linux)
+    {
+        throw new Exception("rt_loadLibrary not yet implemented on linux.");
+    }
+}
+
+/**
+ * Unloads a library obtained from rt_loadLibrary. If the library exports
+ * gc_clrProxy it is called before the library is freed.
+ *
+ * Returns:
+ *  true if FreeLibrary reported success.
+ *
+ * Throws:
+ *  Exception on linux, where unloading is not implemented.
+ */
+extern (C) bool rt_unloadLibrary(void* ptr)
+{
+    version (Windows)
+    {
+        gcClrFn gcClr = cast(gcClrFn) GetProcAddress(ptr, "gc_clrProxy");
+        if (gcClr !is null)
+            gcClr();
+        return FreeLibrary(ptr) != 0;
+    }
+    else version (linux)
+    {
+        throw new Exception("rt_unloadLibrary not yet implemented on linux.");
+    }
+}
+
+/***********************************
+ * These functions must be defined for any D program linked
+ * against this library.
+ */
+extern (C) void onAssertError(string file, size_t line);
+extern (C) void onAssertErrorMsg(string file, size_t line, string msg);
+extern (C) void onRangeError(string file, size_t line);
+extern (C) void onHiddenFuncError(Object o);
+extern (C) void onSwitchError(string file, size_t line);
+extern (C) bool runModuleUnitTests();
+
+// this function is called from the utf module
+//extern (C) void onUnicodeError(string msg, size_t idx);
+
+/***********************************
+ * These are internal callbacks for various language errors.
+ */ +extern (C) void _d_assert(string file, uint line) +{ + onAssertError(file, line); +} + +extern (C) static void _d_assert_msg(string msg, string file, uint line) +{ + onAssertErrorMsg(file, line, msg); +} + +extern (C) void _d_array_bounds(string file, uint line) +{ + onRangeError(file, line); +} + +extern (C) void _d_switch_error(string file, uint line) +{ + onSwitchError(file, line); +} +/** + * Picks up the object reference left in EAX and reports it through + * onHiddenFuncError(). + */ +extern (C) void _d_hidden_func() +{ + Object o; + asm + { + mov o, EAX; + } + onHiddenFuncError(o); +} + +bool _d_isHalting = false; +/** + * Returns _d_isHalting, which rt_term() sets to true before it runs the + * module destructors. + */ +extern (C) bool rt_isHalting() +{ + return _d_isHalting; +} + +extern (C) bool rt_trapExceptions = true; +/** + * On linux, runs _STI_monitor_staticctor() and _STI_critical_init(); + * the body is empty for every other version. + */ +void _d_criticalInit() +{ + version (linux) + { + _STI_monitor_staticctor(); + _STI_critical_init(); + } +} + +alias void delegate(Throwable) ExceptionHandler; +/** + * Initializes the runtime: _d_criticalInit(), then gc_init(), + * initStaticDataGC(), _minit() on Windows, and _moduleCtor(). + * + * Params: + * dg = If non-null, called with a Throwable thrown during initialization. + * + * Returns: + * true on success. If anything is thrown, _d_criticalTerm() is called + * and false is returned. + */ +extern (C) bool rt_init(ExceptionHandler dg = null) +{ + _d_criticalInit(); + + try + { + gc_init(); + initStaticDataGC(); + version (Windows) + _minit(); + _moduleCtor(); + return true; + } + catch (Throwable e) + { + if (dg) + dg(e); + } + catch + { + + } + _d_criticalTerm(); + return false; +} +/** + * On linux, runs _STD_critical_term() and _STD_monitor_staticdtor(); + * the body is empty for every other version. + */ +void _d_criticalTerm() +{ + version (linux) + { + _STD_critical_term(); + _STD_monitor_staticdtor(); + } +} +/** + * Shuts the runtime down: thread_joinAll(), sets _d_isHalting, then + * _moduleDtor() and gc_term(). _d_criticalTerm() runs afterwards in every + * case (finally block). + * + * Params: + * dg = If non-null, called with a Throwable thrown during shutdown. + * + * Returns: + * true on success, false if anything was thrown. + */ +extern (C) bool rt_term(ExceptionHandler dg = null) +{ + try + { + thread_joinAll(); + _d_isHalting = true; + _moduleDtor(); + gc_term(); + return true; + } + catch (Throwable e) + { + if (dg) + dg(e); + } + catch + { + + } + finally + { + _d_criticalTerm(); + } + return false; +} + +/*********************************** + * The D main() function supplied by the user's program + */ +int main(char[][] args); + +/*********************************** + * Substitutes for the C main() function. + * Its purpose is to wrap the call to the D main() + * function and catch any unhandled exceptions. 
+ */ + +extern (C) int main(int argc, char **argv) +{ + char[][] args; + int result; + + version (linux) + { + _STI_monitor_staticctor(); + _STI_critical_init(); + } + + version (Windows) + { + wchar_t* wcbuf = GetCommandLineW(); + size_t wclen = wcslen(wcbuf); + int wargc = 0; + wchar_t** wargs = CommandLineToArgvW(wcbuf, &wargc); + assert(wargc == argc); + + char* cargp = null; + size_t cargl = WideCharToMultiByte(65001, 0, wcbuf, wclen, null, 0, null, 0); + + cargp = cast(char*) alloca(cargl); + args = ((cast(char[]*) alloca(wargc * (char[]).sizeof)))[0 .. wargc]; + + for (size_t i = 0, p = 0; i < wargc; i++) + { + int wlen = wcslen(wargs[i]); + int clen = WideCharToMultiByte(65001, 0, &wargs[i][0], wlen, null, 0, null, 0); + args[i] = cargp[p .. p+clen]; + p += clen; assert(p <= cargl); + WideCharToMultiByte(65001, 0, &wargs[i][0], wlen, &args[i][0], clen, null, 0); + } + LocalFree(wargs); + wargs = null; + wargc = 0; + } + else version (linux) + { + char[]* am = cast(char[]*) malloc(argc * (char[]).sizeof); + scope(exit) free(am); + + for (size_t i = 0; i < argc; i++) + { + auto len = strlen(argv[i]); + am[i] = argv[i][0 .. len]; + } + args = am[0 .. 
argc]; + } + + bool trapExceptions = rt_trapExceptions; + + void tryExec(void delegate() dg) + { + + if (trapExceptions) + { + try + { + dg(); + } + catch (Throwable e) + { + while (e) + { + if (e.file) + { + // fprintf(stderr, "%.*s(%u): %.*s\n", e.file, e.line, e.msg); + console (e.classinfo.name)("@")(e.file)("(")(e.line)("): ")(e.msg)("\n"); + } + else + { + // fprintf(stderr, "%.*s\n", e.toString()); + console (e.toString)("\n"); + } + if (e.info) + { + console ("----------------\n"); + foreach (t; e.info) + console (t)("\n"); + } + if (e.next) + console ("\n"); + e = e.next; + } + result = EXIT_FAILURE; + } + catch (Object o) + { + // fprintf(stderr, "%.*s\n", o.toString()); + console (o.toString)("\n"); + result = EXIT_FAILURE; + } + } + else + { + dg(); + } + } + + // NOTE: The lifetime of a process is much like the lifetime of an object: + // it is initialized, then used, then destroyed. If initialization + // fails, the successive two steps are never reached. However, if + // initialization succeeds, then cleanup will occur even if the use + // step fails in some way. Here, the use phase consists of running + // the user's main function. If main terminates with an exception, + // the exception is handled and then cleanup begins. An exception + // thrown during cleanup, however, will abort the cleanup process. 
+ + void runMain() + { + result = main(args); + } + + void runAll() + { + gc_init(); + initStaticDataGC(); + version (Windows) + _minit(); + _moduleCtor(); + if (runModuleUnitTests()) + tryExec(&runMain); + thread_joinAll(); + _d_isHalting = true; + _moduleDtor(); + gc_term(); + } + + tryExec(&runAll); + + version (linux) + { + _STD_critical_term(); + _STD_monitor_staticdtor(); + } + return result; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/dmain2.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/dmain2.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,394 @@ +/** + * Contains main program entry point and support routines. + * + * Copyright: Copyright Digital Mars 2000 - 2009. + * License: TypeidExpression. + */ +class TypeInfo +{ + override hash_t toHash() + { + hash_t hash; + + foreach (char c; this.toString()) + hash = hash * 9 + c; + return hash; + } + + override int opCmp(Object o) + { + if (this is o) + return 0; + TypeInfo ti = cast(TypeInfo)o; + if (ti is null) + return 1; + return dstrcmp(this.toString(), ti.toString()); + } + + override equals_t opEquals(Object o) + { + /* TypeInfo instances are singletons, but duplicates can exist + * across DLL's. Therefore, comparing for a name match is + * sufficient. + */ + if (this is o) + return true; + TypeInfo ti = cast(TypeInfo)o; + return ti && this.toString() == ti.toString(); + } + + /// Returns a hash of the instance of a type. + hash_t getHash(in void* p) { return cast(hash_t)p; } + + /// Compares two instances for equality. + equals_t equals(in void* p1, in void* p2) { return p1 == p2; } + + /// Compares two instances for <, ==, or >. + int compare(in void* p1, in void* p2) { return 0; } + + /// Returns size of the type. + size_t tsize() { return 0; } + + /// Swaps two instances of the type. 
+ void swap(void* p1, void* p2) + { + size_t n = tsize(); + for (size_t i = 0; i < n; i++) + { + byte t = (cast(byte *)p1)[i]; + (cast(byte*)p1)[i] = (cast(byte*)p2)[i]; + (cast(byte*)p2)[i] = t; + } + } + + /// Get TypeInfo for 'next' type, as defined by what kind of type this is, + /// null if none. + TypeInfo next() { return null; } + + /// Return default initializer, null if default initialize to 0 + void[] init() { return null; } + + /// Get flags for type: 1 means GC should scan for pointers + uint flags() { return 0; } + + /// Get type information on the contents of the type; null if not available + OffsetTypeInfo[] offTi() { return null; } + /// Run the destructor on the object and all its sub-objects + void destroy(void* p) {} + /// Run the postblit on the object and all its sub-objects + void postblit(void* p) {} +} + +class TypeInfo_Typedef : TypeInfo +{ + override string toString() { return name; } + + override equals_t opEquals(Object o) + { + TypeInfo_Typedef c; + return this is o || + ((c = cast(TypeInfo_Typedef)o) !is null && + this.name == c.name && + this.base == c.base); + } + + override hash_t getHash(in void* p) { return base.getHash(p); } + override equals_t equals(in void* p1, in void* p2) { return base.equals(p1, p2); } + override int compare(in void* p1, in void* p2) { return base.compare(p1, p2); } + override size_t tsize() { return base.tsize(); } + override void swap(void* p1, void* p2) { return base.swap(p1, p2); } + + override TypeInfo next() { return base.next(); } + override uint flags() { return base.flags(); } + override void[] init() { return m_init.length ? 
m_init : base.init(); } + + TypeInfo base; + string name; + void[] m_init; +} + +class TypeInfo_Enum : TypeInfo_Typedef +{ + +} + +class TypeInfo_Pointer : TypeInfo +{ + override string toString() { return m_next.toString() ~ "*"; } + + override equals_t opEquals(Object o) + { + TypeInfo_Pointer c; + return this is o || + ((c = cast(TypeInfo_Pointer)o) !is null && + this.m_next == c.m_next); + } + + override hash_t getHash(in void* p) + { + return cast(hash_t)*cast(void**)p; + } + + override equals_t equals(in void* p1, in void* p2) + { + return *cast(void**)p1 == *cast(void**)p2; + } + + override int compare(in void* p1, in void* p2) + { + if (*cast(void**)p1 < *cast(void**)p2) + return -1; + else if (*cast(void**)p1 > *cast(void**)p2) + return 1; + else + return 0; + } + + override size_t tsize() + { + return (void*).sizeof; + } + + override void swap(void* p1, void* p2) + { + void* tmp = *cast(void**)p1; + *cast(void**)p1 = *cast(void**)p2; + *cast(void**)p2 = tmp; + } + + override TypeInfo next() { return m_next; } + override uint flags() { return 1; } + + TypeInfo m_next; +} + +class TypeInfo_Array : TypeInfo +{ + override string toString() { return value.toString() ~ "[]"; } + + override equals_t opEquals(Object o) + { + TypeInfo_Array c; + return this is o || + ((c = cast(TypeInfo_Array)o) !is null && + this.value == c.value); + } + + override hash_t getHash(in void* p) + { + size_t sz = value.tsize(); + hash_t hash = 0; + void[] a = *cast(void[]*)p; + for (size_t i = 0; i < a.length; i++) + hash += value.getHash(a.ptr + i * sz) * 11; + return hash; + } + + override equals_t equals(in void* p1, in void* p2) + { + void[] a1 = *cast(void[]*)p1; + void[] a2 = *cast(void[]*)p2; + if (a1.length != a2.length) + return false; + size_t sz = value.tsize(); + for (size_t i = 0; i < a1.length; i++) + { + if (!value.equals(a1.ptr + i * sz, a2.ptr + i * sz)) + return false; + } + return true; + } + + override int compare(in void* p1, in void* p2) + { + void[] a1 = 
*cast(void[]*)p1;
+        void[] a2 = *cast(void[]*)p2;
+        size_t sz = value.tsize();
+        size_t len = a1.length;
+
+        // Only the common prefix of the two arrays is compared element-wise.
+        if (a2.length < len)
+            len = a2.length;
+        for (size_t u = 0; u < len; u++)
+        {
+            int result = value.compare(a1.ptr + u * sz, a2.ptr + u * sz);
+            if (result)
+                return result;
+        }
+        // Equal prefixes: the difference in length decides the ordering.
+        return cast(int)a1.length - cast(int)a2.length;
+    }
+
+    /// Returns the size of a dynamic array reference, (void[]).sizeof.
+    override size_t tsize()
+    {
+        return (void[]).sizeof;
+    }
+
+    /// Swaps the two array references stored at p1 and p2; the element
+    /// data they refer to is not touched.
+    override void swap(void* p1, void* p2)
+    {
+        void[] tmp = *cast(void[]*)p1;
+        *cast(void[]*)p1 = *cast(void[]*)p2;
+        *cast(void[]*)p2 = tmp;
+    }
+
+    /// TypeInfo of the array's element type.
+    TypeInfo value;
+
+    /// Returns the element type's TypeInfo.
+    override TypeInfo next()
+    {
+        return value;
+    }
+
+    /// Always 1 (per TypeInfo.flags: the GC should scan for pointers).
+    override uint flags() { return 1; }
+}
+
+/**
+ * Run-time type information for a static (fixed-length) array made of
+ * `len` elements whose type is described by `value`.
+ */
+class TypeInfo_StaticArray : TypeInfo
+{
+    /// Returns the element type's name followed by "[len]".
+    override string toString()
+    {
+        char[10] tmp = void;
+        return cast(string)(value.toString() ~ "[" ~ tmp.intToString(len) ~ "]");
+    }
+
+    /// Equal when o is this object, or another TypeInfo_StaticArray with
+    /// the same length and an equal element TypeInfo.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_StaticArray c;
+        return this is o ||
+               ((c = cast(TypeInfo_StaticArray)o) !is null &&
+                this.len == c.len &&
+                this.value == c.value);
+    }
+
+    /// Returns the sum of the element hashes of the array stored at p.
+    override hash_t getHash(in void* p)
+    {
+        size_t sz = value.tsize();
+        hash_t hash = 0;
+        for (size_t i = 0; i < len; i++)
+            hash += value.getHash(p + i * sz);
+        return hash;
+    }
+
+    /// Returns true when every pair of corresponding elements of the
+    /// arrays at p1 and p2 compares equal.
+    override equals_t equals(in void* p1, in void* p2)
+    {
+        size_t sz = value.tsize();
+
+        for (size_t u = 0; u < len; u++)
+        {
+            if (!value.equals(p1 + u * sz, p2 + u * sz))
+                return false;
+        }
+        return true;
+    }
+
+    /// Returns the result of the first element comparison that is
+    /// non-zero, or 0 when all elements compare equal.
+    override int compare(in void* p1, in void* p2)
+    {
+        size_t sz = value.tsize();
+
+        for (size_t u = 0; u < len; u++)
+        {
+            int result = value.compare(p1 + u * sz, p2 + u * sz);
+            if (result)
+                return result;
+        }
+        return 0;
+    }
+
+    /// Returns the total size in bytes: len * element size.
+    override size_t tsize()
+    {
+        return len * value.tsize();
+    }
+
+    /**
+     * Swaps the static arrays at p1 and p2 one element at a time through a
+     * scratch area of one element's size.
+     *
+     * The 16-byte local buffer is used when an element is smaller than
+     * it; otherwise a temporary of sz bytes is allocated with `new`.
+     */
+    override void swap(void* p1, void* p2)
+    {
+        void* tmp;
+        size_t sz = value.tsize();
+        ubyte[16] buffer;
+        void* pbuffer;
+
+        if (sz < buffer.sizeof)
+            tmp = buffer.ptr;
+        else
+            tmp = pbuffer = (new void[sz]).ptr;
+
+        // u is an element index: the loop body derives the byte offset
+        // as u * sz, so u must advance by one element per iteration.
+        // Stepping by sz skipped elements (and never terminated for
+        // sz == 0).
+        for (size_t u = 0; u < len; u++)
+ { size_t o = u * sz; + memcpy(tmp, p1 + o, sz); + memcpy(p1 + o, p2 + o, sz); + memcpy(p2 + o, tmp, sz); + } + if (pbuffer) + delete pbuffer; + } + + override void[] init() { return value.init(); } + override TypeInfo next() { return value; } + override uint flags() { return value.flags(); } + + override void destroy(void* p) + { + auto sz = value.tsize(); + p += sz * len; + foreach (i; 0 .. len) + { + p -= sz; + value.destroy(p); + } + } + + override void postblit(void* p) + { + auto sz = value.tsize(); + foreach (i; 0 .. len) + { + value.postblit(p); + p += sz; + } + } + + TypeInfo value; + size_t len; +} + +class TypeInfo_AssociativeArray : TypeInfo +{ + override string toString() + { + return cast(string)(next.toString() ~ "[" ~ key.toString() ~ "]"); + } + + override equals_t opEquals(Object o) + { + TypeInfo_AssociativeArray c; + return this is o || + ((c = cast(TypeInfo_AssociativeArray)o) !is null && + this.key == c.key && + this.value == c.value); + } + + // BUG: need to add the rest of the functions + + override size_t tsize() + { + return (char[int]).sizeof; + } + + override TypeInfo next() { return value; } + override uint flags() { return 1; } + + TypeInfo value; + TypeInfo key; +} + +class TypeInfo_Function : TypeInfo +{ + override string toString() + { + return cast(string)(next.toString() ~ "()"); + } + + override equals_t opEquals(Object o) + { + TypeInfo_Function c; + return this is o || + ((c = cast(TypeInfo_Function)o) !is null && + this.next == c.next); + } + + // BUG: need to add the rest of the functions + + override size_t tsize() + { + return 0; // no size for functions + } + + TypeInfo next; +} + +class TypeInfo_Delegate : TypeInfo +{ + override string toString() + { + return cast(string)(next.toString() ~ " delegate()"); + } + + override equals_t opEquals(Object o) + { + TypeInfo_Delegate c; + return this is o || + ((c = cast(TypeInfo_Delegate)o) !is null && + this.next == c.next); + } + + // BUG: need to add the rest of the functions + 
+    /// Returns the size of a delegate value (the size of `int delegate()`).
+    override size_t tsize()
+    {
+        alias int delegate() dg;
+        return dg.sizeof;
+    }
+
+    /// Always 1 (per TypeInfo.flags: the GC should scan for pointers).
+    override uint flags() { return 1; }
+
+    TypeInfo next;
+}
+
+/**
+ * Run-time type information for a class reference. `info` holds the
+ * ClassInfo of the class being described.
+ */
+class TypeInfo_Class : TypeInfo
+{
+    /// Returns the name of the described class.
+    override string toString() { return info.name; }
+
+    /**
+     * Equal when o is this object, or another TypeInfo_Class describing a
+     * class with the same name.
+     *
+     * The comparison must use c.info: c.classinfo is the ClassInfo of the
+     * TypeInfo_Class object itself, so its name is the same for every
+     * instance and says nothing about the class being described.
+     */
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Class c;
+        return this is o ||
+               ((c = cast(TypeInfo_Class)o) !is null &&
+                this.info.name == c.info.name);
+    }
+
+    /// Returns toHash() of the object referenced at p, or 0 for a null
+    /// reference.
+    override hash_t getHash(in void* p)
+    {
+        Object o = *cast(Object*)p;
+        return o ? o.toHash() : 0;
+    }
+
+    /// Two references are equal when they are identical, or when the first
+    /// is non-null and its opEquals accepts the second.
+    override equals_t equals(in void* p1, in void* p2)
+    {
+        Object o1 = *cast(Object*)p1;
+        Object o2 = *cast(Object*)p2;
+
+        return (o1 is o2) || (o1 && o1.opEquals(o2));
+    }
+
+    /// Orders the references at p1 and p2: identical references give 0, a
+    /// null reference sorts before any non-null one, otherwise the result
+    /// of o1.opCmp(o2) is returned.
+    override int compare(in void* p1, in void* p2)
+    {
+        Object o1 = *cast(Object*)p1;
+        Object o2 = *cast(Object*)p2;
+        int c = 0;
+
+        // Regard null references as always being "less than"
+        if (o1 !is o2)
+        {
+            if (o1)
+            {
+                if (!o2)
+                    c = 1;
+                else
+                    c = o1.opCmp(o2);
+            }
+            else
+                c = -1;
+        }
+        return c;
+    }
+
+    /// Returns the size of a class reference (Object.sizeof).
+    override size_t tsize()
+    {
+        return Object.sizeof;
+    }
+
+    /// Always 1 (per TypeInfo.flags: the GC should scan for pointers).
+    override uint flags() { return 1; }
+
+    /// Returns info.offTi when bit 4 of info.flags is set, null otherwise.
+    /// NOTE(review): bit 4 presumably marks offTi as populated -- confirm
+    /// against the ClassInfo flag definitions.
+    override OffsetTypeInfo[] offTi()
+    {
+        return (info.flags & 4) ?
info.offTi : null;
+    }
+
+    ClassInfo info;
+}
+
+/**
+ * Run-time type information for an interface reference. `info` holds the
+ * ClassInfo describing the interface.
+ */
+class TypeInfo_Interface : TypeInfo
+{
+    /// Returns the name of the described interface.
+    override string toString() { return info.name; }
+
+    /**
+     * Equal when o is this object, or another TypeInfo_Interface
+     * describing an interface with the same name.
+     *
+     * The comparison must use c.info: c.classinfo is the ClassInfo of the
+     * TypeInfo_Interface object itself, so its name is the same for every
+     * instance and says nothing about the interface being described.
+     */
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Interface c;
+        return this is o ||
+               ((c = cast(TypeInfo_Interface)o) !is null &&
+                this.info.name == c.info.name);
+    }
+
+    /// Returns toHash() of the object behind the interface reference at p.
+    /// The object is recovered by subtracting the Interface descriptor's
+    /// offset from the reference; it is asserted to be non-null.
+    override hash_t getHash(in void* p)
+    {
+        Interface* pi = **cast(Interface ***)*cast(void**)p;
+        Object o = cast(Object)(*cast(void**)p - pi.offset);
+        assert(o);
+        return o.toHash();
+    }
+
+    /// Recovers the objects behind both interface references and reports
+    /// them equal when `o1 == o2` holds or o1 is non-null and
+    /// o1.opCmp(o2) is 0.
+    override equals_t equals(in void* p1, in void* p2)
+    {
+        Interface* pi = **cast(Interface ***)*cast(void**)p1;
+        Object o1 = cast(Object)(*cast(void**)p1 - pi.offset);
+        pi = **cast(Interface ***)*cast(void**)p2;
+        Object o2 = cast(Object)(*cast(void**)p2 - pi.offset);
+
+        return o1 == o2 || (o1 && o1.opCmp(o2) == 0);
+    }
+
+    /// Recovers the objects behind both interface references and orders
+    /// them: 0 when `o1 != o2` is false, a null object sorts first,
+    /// otherwise the result of o1.opCmp(o2) is returned.
+    override int compare(in void* p1, in void* p2)
+    {
+        Interface* pi = **cast(Interface ***)*cast(void**)p1;
+        Object o1 = cast(Object)(*cast(void**)p1 - pi.offset);
+        pi = **cast(Interface ***)*cast(void**)p2;
+        Object o2 = cast(Object)(*cast(void**)p2 - pi.offset);
+        int c = 0;
+
+        // Regard null references as always being "less than"
+        if (o1 != o2)
+        {
+            if (o1)
+            {
+                if (!o2)
+                    c = 1;
+                else
+                    c = o1.opCmp(o2);
+            }
+            else
+                c = -1;
+        }
+        return c;
+    }
+
+    /// Returns Object.sizeof.
+    override size_t tsize()
+    {
+        return Object.sizeof;
+    }
+
+    /// Always 1 (per TypeInfo.flags: the GC should scan for pointers).
+    override uint flags() { return 1; }
+
+    ClassInfo info;
+}
+
+/**
+ * Run-time type information for a struct type, including optional
+ * function pointers to the struct's own hash, equality and comparison
+ * routines.
+ */
+class TypeInfo_Struct : TypeInfo
+{
+    /// Returns the struct's name.
+    override string toString() { return name; }
+
+    /// Equal when o is this object, or another TypeInfo_Struct with the
+    /// same name and the same initializer length.
+    override equals_t opEquals(Object o)
+    {
+        TypeInfo_Struct s;
+        return this is o ||
+               ((s = cast(TypeInfo_Struct)o) !is null &&
+                this.name == s.name &&
+                this.init.length == s.init.length);
+    }
+
+    /// Hashes the struct instance at p (must be non-null): through xtoHash
+    /// when one is set, otherwise with a byte-wise fallback over
+    /// init.length bytes.
+    override hash_t getHash(in void* p)
+    {
+        assert(p);
+        if (xtoHash)
+        {
+            debug(PRINTF) printf("getHash() using xtoHash\n");
+            return (*xtoHash)(p);
+        }
+        else
+        {
+            hash_t h;
+            debug(PRINTF) printf("getHash() using default hash\n");
+            // A sorry hash algorithm.
+ // Should use the one for strings. + // BUG: relies on the GC not moving objects + auto q = cast(const(ubyte)*)p; + for (size_t i = 0; i < init.length; i++) + { + h = h * 9 + *q; + q++; + } + return h; + } + } + + override equals_t equals(in void* p1, in void* p2) + { + if (p1 == p2) + return true; + else if (!p1 || !p2) + return false; + else if (xopEquals) + return (*xopEquals)(p1, p2); + else + // BUG: relies on the GC not moving objects + return memcmp(p1, p2, init.length) == 0; + } + + override int compare(in void* p1, in void* p2) + { + // Regard null references as always being "less than" + if (p1 != p2) + { + if (p1) + { + if (!p2) + return true; + else if (xopCmp) + return (*xopCmp)(p2, p1); + else + // BUG: relies on the GC not moving objects + return memcmp(p1, p2, init.length); + } + else + return -1; + } + return 0; + } + + override size_t tsize() + { + return init.length; + } + + override void[] init() { return m_init; } + + override uint flags() { return m_flags; } + + override void destroy(void* p) + { + if (xdtor) + (*xdtor)(p); + } + + override void postblit(void* p) + { + if (xpostblit) + (*xpostblit)(p); + } + + string name; + void[] m_init; // initializer; init.ptr == null if 0 initialize + + hash_t function(in void*) xtoHash; + equals_t function(in void*, in void*) xopEquals; + int function(in void*, in void*) xopCmp; + char[] function(in void*) xtoString; + + uint m_flags; + + const(MemberInfo[]) function(in char[]) xgetMembers; + void function(void*) xdtor; + void function(void*) xpostblit; +} + +class TypeInfo_Tuple : TypeInfo +{ + TypeInfo[] elements; + + override string toString() + { + string s = "("; + foreach (i, element; elements) + { + if (i) + s ~= ','; + s ~= element.toString(); + } + s ~= ")"; + return s; + } + + override equals_t opEquals(Object o) + { + if (this is o) + return true; + + auto t = cast(TypeInfo_Tuple)o; + if (t && elements.length == t.elements.length) + { + for (size_t i = 0; i < elements.length; i++) + { + if 
(elements[i] != t.elements[i]) + return false; + } + return true; + } + return false; + } + + override hash_t getHash(in void* p) + { + assert(0); + } + + override equals_t equals(in void* p1, in void* p2) + { + assert(0); + } + + override int compare(in void* p1, in void* p2) + { + assert(0); + } + + override size_t tsize() + { + assert(0); + } + + override void swap(void* p1, void* p2) + { + assert(0); + } + + override void destroy(void* p) + { + assert(0); + } + + override void postblit(void* p) + { + assert(0); + } +} + +class TypeInfo_Const : TypeInfo +{ + override string toString() + { + return cast(string) ("const(" ~ base.toString() ~ ")"); + } + + override equals_t opEquals(Object o) { return base.opEquals(o); } + override hash_t getHash(in void *p) { return base.getHash(p); } + override equals_t equals(in void *p1, in void *p2) { return base.equals(p1, p2); } + override int compare(in void *p1, in void *p2) { return base.compare(p1, p2); } + override size_t tsize() { return base.tsize(); } + override void swap(void *p1, void *p2) { return base.swap(p1, p2); } + + override TypeInfo next() { return base.next(); } + override uint flags() { return base.flags(); } + override void[] init() { return base.init(); } + + TypeInfo base; +} + +class TypeInfo_Invariant : TypeInfo_Const +{ + override string toString() + { + return cast(string) ("immutable(" ~ base.toString() ~ ")"); + } +} + +class TypeInfo_Shared : TypeInfo_Const +{ + override string toString() + { + return cast(string) ("shared(" ~ base.toString() ~ ")"); + } +} + +abstract class MemberInfo +{ + string name(); +} + +class MemberInfo_field : MemberInfo +{ + this(string name, TypeInfo ti, size_t offset) + { + m_name = name; + m_typeinfo = ti; + m_offset = offset; + } + + override string name() { return m_name; } + TypeInfo typeInfo() { return m_typeinfo; } + size_t offset() { return m_offset; } + + string m_name; + TypeInfo m_typeinfo; + size_t m_offset; +} + +class MemberInfo_function : MemberInfo +{ 
+    /// Stores the member's name, type, function pointer and flags.
+    this(string name, TypeInfo ti, void* fp, uint flags)
+    {
+        m_name = name;
+        m_typeinfo = ti;
+        m_fp = fp;
+        m_flags = flags;
+    }
+
+    override string name() { return m_name; }
+    TypeInfo typeInfo() { return m_typeinfo; }
+    void* fp() { return m_fp; }
+    uint flags() { return m_flags; }
+
+    string m_name;
+    TypeInfo m_typeinfo;
+    void* m_fp;
+    uint m_flags;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Throwable
+///////////////////////////////////////////////////////////////////////////////
+
+
+/**
+ * Base class of everything that can be thrown. Carries a message, an
+ * optional source location, an optional trace and a link to the next
+ * Throwable in a chain.
+ */
+class Throwable : Object
+{
+    /// Trace information; iterating it yields one char[] per entry.
+    interface TraceInfo
+    {
+        int opApply(int delegate(inout char[]));
+    }
+
+    string      msg;
+    string      file;
+    size_t      line;
+    TraceInfo   info;
+    Throwable   next;
+
+    /// Creates a Throwable with a message and an optional chained
+    /// Throwable, and records the trace returned by traceContext().
+    this(string msg, Throwable next = null)
+    {
+        this.msg = msg;
+        this.next = next;
+        this.info = traceContext();
+    }
+
+    /// As above, additionally recording the source file and line.
+    this(string msg, string file, size_t line, Throwable next = null)
+    {
+        // The delegated constructor already stores the trace in `info`;
+        // asking for a second trace here only repeated that work and
+        // discarded the first result.
+        this(msg, next);
+        this.file = file;
+        this.line = line;
+    }
+
+    /**
+     * Formats this Throwable and every Throwable chained through `next`.
+     *
+     * Each entry is "Class@file(line): msg" when a file is set, otherwise
+     * "Class: msg", followed by a separator line and the trace entries
+     * when `info` is set. Entries are separated by a newline.
+     */
+    override string toString()
+    {
+        char[10] tmp = void;
+        char[]   buf;
+
+        for (Throwable e = this; e !is null; e = e.next)
+        {
+            if (e.file)
+            {
+                buf ~= e.classinfo.name ~ "@" ~ e.file ~ "(" ~ tmp.intToString(e.line) ~ "): " ~ e.msg;
+            }
+            else
+            {
+                buf ~= e.classinfo.name ~ ": " ~ e.msg;
+            }
+            if (e.info)
+            {
+                buf ~= "\n----------------";
+                foreach (t; e.info)
+                    buf ~= "\n" ~ t;
+            }
+            if (e.next)
+                buf ~= "\n";
+        }
+        return cast(string) buf;
+    }
+}
+
+
+alias Throwable.TraceInfo function(void* ptr = null) TraceHandler;
+private __gshared TraceHandler traceHandler = null;
+
+
+/**
+ * Overrides the default trace handler with a user-supplied version.
+ *
+ * Params:
+ *  h = The new trace handler.  Set to null to use the default handler.
+ */
+extern (C) void rt_setTraceHandler(TraceHandler h)
+{
+    traceHandler = h;
+}
+
+
+/**
+ * This function will be called when an exception is constructed.
The + * user-supplied trace handler will be called if one has been supplied, + * otherwise no trace will be generated. + * + * Params: + * ptr = A pointer to the location from which to generate the trace, or null + * if the trace should be generated from within the trace handler + * itself. + * + * Returns: + * An object describing the current calling context or null if no handler is + * supplied. + */ +Throwable.TraceInfo traceContext(void* ptr = null) +{ + if (traceHandler is null) + return null; + return traceHandler(ptr); +} + + +class Exception : Throwable +{ + this(string msg, Throwable next = null) + { + super(msg, next); + } + + this(string msg, string file, size_t line, Throwable next = null) + { + super(msg, file, line, next); + } +} + + +class Error : Throwable +{ + this(string msg, Throwable next = null) + { + super(msg, next); + } + + this(string msg, string file, size_t line, Throwable next = null) + { + super(msg, file, line, next); + } +} + + +/////////////////////////////////////////////////////////////////////////////// +// ModuleInfo +/////////////////////////////////////////////////////////////////////////////// + + +enum +{ + MIctorstart = 1, // we've started constructing it + MIctordone = 2, // finished construction + MIstandalone = 4, // module ctor does not depend on other module + // ctors being done first + MIhasictor = 8, // has ictor member +} + + +class ModuleInfo +{ + string name; + ModuleInfo[] importedModules; + ClassInfo[] localClasses; + uint flags; + + void function() ctor; // module static constructor (order dependent) + void function() dtor; // module static destructor + void function() unitTest; // module unit tests + + void* xgetMembers; // module getMembers() function + + void function() ictor; // module static constructor (order independent) + + void*[4] reserved; // for future expansion + + static int opApply(int delegate(inout ModuleInfo) dg) + { + int ret = 0; + + foreach (m; _moduleinfo_array) + { + ret = dg(m); + if 
(ret) + break; + } + return ret; + } +} + + +// Windows: this gets initialized by minit.asm +// Posix: this gets initialized in _moduleCtor() +extern (C) __gshared ModuleInfo[] _moduleinfo_array; + + +version (linux) +{ + // This linked list is created by a compiler generated function inserted + // into the .ctor list by the compiler. + struct ModuleReference + { + ModuleReference* next; + ModuleInfo mod; + } + + extern (C) __gshared ModuleReference* _Dmodule_ref; // start of linked list +} + +version (FreeBSD) +{ + // This linked list is created by a compiler generated function inserted + // into the .ctor list by the compiler. + struct ModuleReference + { + ModuleReference* next; + ModuleInfo mod; + } + + extern (C) __gshared ModuleReference* _Dmodule_ref; // start of linked list +} + +version (Solaris) +{ + // This linked list is created by a compiler generated function inserted + // into the .ctor list by the compiler. + struct ModuleReference + { + ModuleReference* next; + ModuleInfo mod; + } + + extern (C) __gshared ModuleReference* _Dmodule_ref; // start of linked list +} + +version (OSX) +{ + extern (C) + { + extern __gshared void* _minfo_beg; + extern __gshared void* _minfo_end; + } +} + +__gshared ModuleInfo[] _moduleinfo_dtors; +__gshared uint _moduleinfo_dtors_i; + +// Register termination function pointers +extern (C) int _fatexit(void*); + +/** + * Initialize the modules. 
+ */ + +extern (C) void _moduleCtor() +{ + debug(PRINTF) printf("_moduleCtor()\n"); + version (linux) + { + int len = 0; + ModuleReference *mr; + + for (mr = _Dmodule_ref; mr; mr = mr.next) + len++; + _moduleinfo_array = new ModuleInfo[len]; + len = 0; + for (mr = _Dmodule_ref; mr; mr = mr.next) + { _moduleinfo_array[len] = mr.mod; + len++; + } + } + + version (FreeBSD) + { + int len = 0; + ModuleReference *mr; + + for (mr = _Dmodule_ref; mr; mr = mr.next) + len++; + _moduleinfo_array = new ModuleInfo[len]; + len = 0; + for (mr = _Dmodule_ref; mr; mr = mr.next) + { _moduleinfo_array[len] = mr.mod; + len++; + } + } + + version (Solaris) + { + int len = 0; + ModuleReference *mr; + + for (mr = _Dmodule_ref; mr; mr = mr.next) + len++; + _moduleinfo_array = new ModuleInfo[len]; + len = 0; + for (mr = _Dmodule_ref; mr; mr = mr.next) + { _moduleinfo_array[len] = mr.mod; + len++; + } + } + + version (OSX) + { + /* The ModuleInfo references are stored in the special segment + * __minfodata, which is bracketed by the segments __minfo_beg + * and __minfo_end. The variables _minfo_beg and _minfo_end + * are of zero size and are in the two bracketing segments, + * respectively. + */ + size_t length = cast(ModuleInfo*)&_minfo_end - cast(ModuleInfo*)&_minfo_beg; + _moduleinfo_array = (cast(ModuleInfo*)&_minfo_beg)[0 .. 
length]; + debug printf("moduleinfo: ptr = %p, length = %d\n", _moduleinfo_array.ptr, _moduleinfo_array.length); + + debug foreach (m; _moduleinfo_array) + { + //printf("\t%p\n", m); + printf("\t%.*s\n", m.name); + } + } + + version (Windows) + { + // Ensure module destructors also get called on program termination + //_fatexit(&_STD_moduleDtor); + } + + _moduleinfo_dtors = new ModuleInfo[_moduleinfo_array.length]; + debug(PRINTF) printf("_moduleinfo_dtors = x%x\n", cast(void*)_moduleinfo_dtors); + _moduleIndependentCtors(); + _moduleCtor2(_moduleinfo_array, 0); +} + +extern (C) void _moduleIndependentCtors() +{ + debug(PRINTF) printf("_moduleIndependentCtors()\n"); + foreach (m; _moduleinfo_array) + { + if (m && m.flags & MIhasictor && m.ictor) + { + (*m.ictor)(); + } + } +} + +void _moduleCtor2(ModuleInfo[] mi, int skip) +{ + debug(PRINTF) printf("_moduleCtor2(): %d modules\n", mi.length); + for (uint i = 0; i < mi.length; i++) + { + ModuleInfo m = mi[i]; + + debug(PRINTF) printf("\tmodule[%d] = '%p'\n", i, m); + if (!m) + continue; + debug(PRINTF) printf("\tmodule[%d] = '%.*s'\n", i, m.name); + if (m.flags & MIctordone) + continue; + debug(PRINTF) printf("\tmodule[%d] = '%.*s', m = x%x\n", i, m.name, m); + + if (m.ctor || m.dtor) + { + if (m.flags & MIctorstart) + { if (skip || m.flags & MIstandalone) + continue; + throw new Exception("Cyclic dependency in module " ~ m.name); + } + + m.flags |= MIctorstart; + _moduleCtor2(m.importedModules, 0); + if (m.ctor) + (*m.ctor)(); + m.flags &= ~MIctorstart; + m.flags |= MIctordone; + + // Now that construction is done, register the destructor + //printf("\tadding module dtor x%x\n", m); + assert(_moduleinfo_dtors_i < _moduleinfo_dtors.length); + _moduleinfo_dtors[_moduleinfo_dtors_i++] = m; + } + else + { + m.flags |= MIctordone; + _moduleCtor2(m.importedModules, 1); + } + } +} + +/** + * Destruct the modules. 
+ */
+
+// Starting the name with "_STD" means under Posix a pointer to the
+// function gets put in the .dtors segment.
+
+extern (C) void _moduleDtor()
+{
+    debug(PRINTF) printf("_moduleDtor(): %d modules\n", _moduleinfo_dtors_i);
+
+    // Visit the recorded modules from the last registered entry down to
+    // the first, i.e. in the reverse of the order they were added.
+    uint remaining = _moduleinfo_dtors_i;
+    while (remaining != 0)
+    {
+        --remaining;
+        ModuleInfo mod = _moduleinfo_dtors[remaining];
+
+        debug(PRINTF) printf("\tmodule[%d] = '%.*s', x%x\n", remaining, mod.name, mod);
+        // Modules without a static destructor have nothing to run.
+        if (!mod.dtor)
+            continue;
+        (*mod.dtor)();
+    }
+    debug(PRINTF) printf("_moduleDtor() done\n");
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Monitor
+///////////////////////////////////////////////////////////////////////////////
+
+alias Object.Monitor        IMonitor;
+alias void delegate(Object) DEvent;
+
+// NOTE: The dtor callback feature is only supported for monitors that are not
+//       supplied by the user.  The assumption is that any object with a user-
+//       supplied monitor may have special storage or lifetime requirements and
+//       that as a result, storing references to local objects within Monitor
+//       may not be safe or desirable.  Thus, devt is only valid if impl is
+//       null.
+/// Per-object monitor record: an optional user-supplied monitor (impl)
+/// and the list of dispose-event delegates (devt).
+struct Monitor
+{
+    IMonitor impl;
+    /* internal */
+    DEvent[] devt;
+    /* stuff */
+}
+
+/// Returns the Monitor pointer stored in the second pointer-sized slot of
+/// object h.
+Monitor* getMonitor(Object h)
+{
+    return cast(Monitor*) (cast(void**) h)[1];
+}
+
+/// Stores m in the second pointer-sized slot of object h.
+void setMonitor(Object h, Monitor* m)
+{
+    (cast(void**) h)[1] = m;
+}
+
+// Monitor primitives implemented elsewhere in the runtime.
+extern (C) void _d_monitor_create(Object);
+extern (C) void _d_monitor_destroy(Object);
+extern (C) void _d_monitor_lock(Object);
+extern (C) int  _d_monitor_unlock(Object);
+
+/**
+ * Releases the monitor attached to h, if any, and clears h's monitor slot.
+ *
+ * With no user-supplied monitor (impl is null) the dispose events are
+ * fired through _d_monitor_devt and the monitor is destroyed with
+ * _d_monitor_destroy. With a user-supplied monitor, it is deleted only
+ * when det is true and the monitor is not the object h itself.
+ */
+extern (C) void _d_monitordelete(Object h, bool det)
+{
+    Monitor* m = getMonitor(h);
+
+    if (m !is null)
+    {
+        IMonitor i = m.impl;
+        if (i is null)
+        {
+            _d_monitor_devt(m, h);
+            _d_monitor_destroy(h);
+            setMonitor(h, null);
+            return;
+        }
+        // Never delete h through its own monitor reference.
+        if (det && (cast(void*) i) !is (cast(void*) h))
+            delete i;
+        setMonitor(h, null);
+    }
+}
+
+/**
+ * Locks the monitor of h, creating it first via _d_monitor_create when h
+ * has none. A user-supplied monitor is locked through its lock() method;
+ * otherwise _d_monitor_lock is used.
+ */
+extern (C) void _d_monitorenter(Object h)
+{
+    Monitor* m = getMonitor(h);
+
+    if (m is null)
+    {
+        _d_monitor_create(h);
+        m = getMonitor(h);
+    }
+
+    IMonitor i = m.impl;
+
+    if (i is null)
+    {
+        _d_monitor_lock(h);
+        return;
+    }
+    i.lock();
+}
+
+/**
+ * Unlocks the monitor of h: through unlock() for a user-supplied monitor,
+ * otherwise through _d_monitor_unlock.
+ *
+ * NOTE(review): m is dereferenced without a null check, so h is assumed
+ * to already have a monitor (presumably from a prior _d_monitorenter) --
+ * confirm with callers.
+ */
+extern (C) void _d_monitorexit(Object h)
+{
+    Monitor* m = getMonitor(h);
+    IMonitor i = m.impl;
+
+    if (i is null)
+    {
+        _d_monitor_unlock(h);
+        return;
+    }
+    i.unlock();
+}
+
+/**
+ * Fires the dispose events registered on m for object h.
+ *
+ * The delegate list is detached from the monitor inside synchronized (h);
+ * the non-null delegates are then invoked with h outside that block, and
+ * the list's storage is released with free().
+ */
+extern (C) void _d_monitor_devt(Monitor* m, Object h)
+{
+    if (m.devt.length)
+    {
+        DEvent[] devt;
+
+        synchronized (h)
+        {
+            devt = m.devt;
+            m.devt = null;
+        }
+        foreach (v; devt)
+        {
+            if (v)
+                v(h);
+        }
+        free(devt.ptr);
+    }
+}
+
+/**
+ * Registers delegate e as a dispose event on h, inside synchronized (h).
+ *
+ * Asserts that h has no user-supplied monitor. The delegate goes into the
+ * first slot that is empty or already holds e; when there is none, the
+ * list is grown by four slots with realloc (onOutOfMemoryError on
+ * failure) and e is stored at the old end.
+ */
+extern (C) void rt_attachDisposeEvent(Object h, DEvent e)
+{
+    synchronized (h)
+    {
+        Monitor* m = getMonitor(h);
+        assert(m.impl is null);
+
+        foreach (inout v; m.devt)
+        {
+            if (v is null || v == e)
+            {
+                v = e;
+                return;
+            }
+        }
+
+        auto len = m.devt.length + 4; // grow by 4 elements
+        auto pos = m.devt.length;     // insert position
+        auto p = realloc(m.devt.ptr, DEvent.sizeof * len);
+        if (!p)
+            onOutOfMemoryError();
+        m.devt = (cast(DEvent*)p)[0 .. len];
+        m.devt[pos+1 ..
len] = null; + m.devt[pos] = e; + } +} + +extern (C) void rt_detachDisposeEvent(Object h, DEvent e) +{ + synchronized (h) + { + Monitor* m = getMonitor(h); + assert(m.impl is null); + + foreach (p, v; m.devt) + { + if (v == e) + { + memmove(&m.devt[p], + &m.devt[p+1], + (m.devt.length - p - 1) * DEvent.sizeof); + m.devt[$ - 1] = null; + return; + } + } + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/invariant.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/invariant.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,35 @@ +/** + * Implementation of invariant support routines. + * + * Copyright: Copyright Digital Mars 2007 - 2009. + * License: p.length) + { + size_t size = p.length * sizeelem; + auto info = gc_query(p.data); + + if (info.size <= newsize || info.base != p.data) + { + if (info.size >= PAGESIZE && info.base == p.data) + { // Try to extend in-place + auto u = gc_extend(p.data, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + newdata = cast(byte *)gc_malloc(newsize + 1, info.attr); + newdata[0 .. size] = p.data[0 .. size]; + } + L1: + newdata[size .. newsize] = 0; + } + } + else + { + newdata = cast(byte *)gc_calloc(newsize + 1, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + } + } + else + { + newdata = p.data; + } + + p.data = newdata; + p.length = newlength; + return newdata[0 .. newlength]; + +Loverflow: + onOutOfMemoryError(); +} + + +/** + * Resize arrays for non-zero initializers. + * p pointer to array lvalue to be updated + * newlength new .length property of array + * sizeelem size of each element of array + * initsize size of initializer + * ... 
initializer + */ +extern (C) byte[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p) +in +{ + assert(!p.length || p.data); +} +body +{ + byte* newdata; + size_t sizeelem = ti.next.tsize(); + void[] initializer = ti.next.init(); + size_t initsize = initializer.length; + + assert(sizeelem); + assert(initsize); + assert(initsize <= sizeelem); + assert((sizeelem / initsize) * initsize == sizeelem); + + debug(PRINTF) + { + printf("_d_arraysetlengthiT(p = %p, sizeelem = %d, newlength = %d, initsize = %d)\n", p, sizeelem, newlength, initsize); + if (p) + printf("\tp.data = %p, p.length = %d\n", p.data, p.length); + } + + if (newlength) + { + version (D_InlineAsm_X86) + { + size_t newsize = void; + + asm + { + mov EAX,newlength ; + mul EAX,sizeelem ; + mov newsize,EAX ; + jc Loverflow ; + } + } + else + { + size_t newsize = sizeelem * newlength; + + if (newsize / newlength != sizeelem) + goto Loverflow; + } + debug(PRINTF) printf("newsize = %x, newlength = %x\n", newsize, newlength); + + size_t size = p.length * sizeelem; + + if (p.data) + { + newdata = p.data; + if (newlength > p.length) + { + auto info = gc_query(p.data); + + if (info.size <= newsize || info.base != p.data) + { + if (info.size >= PAGESIZE && info.base == p.data) + { // Try to extend in-place + auto u = gc_extend(p.data, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + newdata = cast(byte *)gc_malloc(newsize + 1, info.attr); + newdata[0 .. size] = p.data[0 .. size]; + L1: ; + } + } + } + else + { + newdata = cast(byte *)gc_malloc(newsize + 1, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + } + + auto q = initializer.ptr; // pointer to initializer + + if (newsize > size) + { + if (initsize == 1) + { + debug(PRINTF) printf("newdata = %p, size = %d, newsize = %d, *q = %d\n", newdata, size, newsize, *cast(byte*)q); + newdata[size .. 
newsize] = *(cast(byte*)q); + } + else + { + for (size_t u = size; u < newsize; u += initsize) + { + memcpy(newdata + u, q, initsize); + } + } + } + } + else + { + newdata = p.data; + } + + p.data = newdata; + p.length = newlength; + return newdata[0 .. newlength]; + +Loverflow: + onOutOfMemoryError(); +} + + +/** + * Append y[] to array x[]. + * size is size of each array element. + */ +extern (C) long _d_arrayappendT(TypeInfo ti, Array *px, byte[] y) +{ + auto sizeelem = ti.next.tsize(); // array element size + auto info = gc_query(px.data); + auto length = px.length; + auto newlength = length + y.length; + auto newsize = newlength * sizeelem; + + if (info.size < newsize || info.base != px.data) + { byte* newdata; + + if (info.size >= PAGESIZE && info.base == px.data) + { // Try to extend in-place + auto u = gc_extend(px.data, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + newdata = cast(byte *)gc_malloc(newCapacity(newlength, sizeelem) + 1, info.attr); + memcpy(newdata, px.data, length * sizeelem); + px.data = newdata; + } + L1: + px.length = newlength; + memcpy(px.data + length * sizeelem, y.ptr, y.length * sizeelem); + return *cast(long*)px; +} + + +/** + * + */ +size_t newCapacity(size_t newlength, size_t size) +{ + version(none) + { + size_t newcap = newlength * size; + } + else + { + /* + * Better version by Dave Fladebo: + * This uses an inverse logorithmic algorithm to pre-allocate a bit more + * space for larger arrays. + * - Arrays smaller than PAGESIZE bytes are left as-is, so for the most + * common cases, memory allocation is 1 to 1. The small overhead added + * doesn't affect small array perf. (it's virtually the same as + * current). + * - Larger arrays have some space pre-allocated. + * - As the arrays grow, the relative pre-allocated space shrinks. 
+ * - The logorithmic algorithm allocates relatively more space for + * mid-size arrays, making it very fast for medium arrays (for + * mid-to-large arrays, this turns out to be quite a bit faster than the + * equivalent realloc() code in C, on Linux at least. Small arrays are + * just as fast as GCC). + * - Perhaps most importantly, overall memory usage and stress on the GC + * is decreased significantly for demanding environments. + */ + size_t newcap = newlength * size; + size_t newext = 0; + + if (newcap > PAGESIZE) + { + //double mult2 = 1.0 + (size / log10(pow(newcap * 2.0,2.0))); + + // redo above line using only integer math + + static int log2plus1(size_t c) + { int i; + + if (c == 0) + i = -1; + else + for (i = 1; c >>= 1; i++) + { + } + return i; + } + + /* The following setting for mult sets how much bigger + * the new size will be over what is actually needed. + * 100 means the same size, more means proportionally more. + * More means faster but more memory consumption. + */ + //long mult = 100 + (1000L * size) / (6 * log2plus1(newcap)); + long mult = 100 + (1000L * size) / log2plus1(newcap); + + // testing shows 1.02 for large arrays is about the point of diminishing return + if (mult < 102) + mult = 102; + newext = cast(size_t)((newcap * mult) / 100); + newext -= newext % size; + debug(PRINTF) printf("mult: %2.2f, alloc: %2.2f\n",mult/100.0,newext / cast(double)size); + } + newcap = newext > newcap ? newext : newcap; + debug(PRINTF) printf("newcap = %d, newlength = %d, size = %d\n", newcap, newlength, size); + } + return newcap; +} + + +/** + * + */ +extern (C) byte[] _d_arrayappendcT(TypeInfo ti, inout byte[] x, ...) 
+{ + auto sizeelem = ti.next.tsize(); // array element size + auto info = gc_query(x.ptr); + auto length = x.length; + auto newlength = length + 1; + auto newsize = newlength * sizeelem; + + assert(info.size == 0 || length * sizeelem <= info.size); + + debug(PRINTF) printf("_d_arrayappendcT(sizeelem = %d, ptr = %p, length = %d, cap = %d)\n", sizeelem, x.ptr, x.length, info.size); + + if (info.size <= newsize || info.base != x.ptr) + { byte* newdata; + + if (info.size >= PAGESIZE && info.base == x.ptr) + { // Try to extend in-place + auto u = gc_extend(x.ptr, (newsize + 1) - info.size, (newsize + 1) - info.size); + if (u) + { + goto L1; + } + } + debug(PRINTF) printf("_d_arrayappendcT(length = %d, newlength = %d, cap = %d)\n", length, newlength, info.size); + auto newcap = newCapacity(newlength, sizeelem); + assert(newcap >= newlength * sizeelem); + newdata = cast(byte *)gc_malloc(newcap + 1, info.attr); + memcpy(newdata, x.ptr, length * sizeelem); + (cast(void**)(&x))[1] = newdata; + } + L1: + byte *argp = cast(byte *)(&ti + 2); + + *cast(size_t *)&x = newlength; + x.ptr[length * sizeelem .. newsize] = argp[0 .. 
sizeelem]; + assert((cast(size_t)x.ptr & 15) == 0); + assert(gc_sizeOf(x.ptr) > x.length * sizeelem); + return x; +} + + +/** + * + */ +extern (C) byte[] _d_arraycatT(TypeInfo ti, byte[] x, byte[] y) +out (result) +{ + auto sizeelem = ti.next.tsize(); // array element size + debug(PRINTF) printf("_d_arraycatT(%d,%p ~ %d,%p sizeelem = %d => %d,%p)\n", x.length, x.ptr, y.length, y.ptr, sizeelem, result.length, result.ptr); + assert(result.length == x.length + y.length); + for (size_t i = 0; i < x.length * sizeelem; i++) + assert((cast(byte*)result)[i] == (cast(byte*)x)[i]); + for (size_t i = 0; i < y.length * sizeelem; i++) + assert((cast(byte*)result)[x.length * sizeelem + i] == (cast(byte*)y)[i]); + + size_t cap = gc_sizeOf(result.ptr); + assert(!cap || cap > result.length * sizeelem); +} +body +{ + version (none) + { + /* Cannot use this optimization because: + * char[] a, b; + * char c = 'a'; + * b = a ~ c; + * c = 'b'; + * will change the contents of b. + */ + if (!y.length) + return x; + if (!x.length) + return y; + } + + debug(PRINTF) printf("_d_arraycatT(%d,%p ~ %d,%p)\n", x.length, x.ptr, y.length, y.ptr); + auto sizeelem = ti.next.tsize(); // array element size + debug(PRINTF) printf("_d_arraycatT(%d,%p ~ %d,%p sizeelem = %d)\n", x.length, x.ptr, y.length, y.ptr, sizeelem); + size_t xlen = x.length * sizeelem; + size_t ylen = y.length * sizeelem; + size_t len = xlen + ylen; + + if (!len) + return null; + + byte* p = cast(byte*)gc_malloc(len + 1, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + memcpy(p, x.ptr, xlen); + memcpy(p + xlen, y.ptr, ylen); + p[len] = 0; + return p[0 .. x.length + y.length]; +} + + +/** + * + */ +extern (C) byte[] _d_arraycatnT(TypeInfo ti, uint n, ...) 
+{ void* a; + size_t length; + byte[]* p; + uint i; + byte[] b; + auto size = ti.next.tsize(); // array element size + + p = cast(byte[]*)(&n + 1); + + for (i = 0; i < n; i++) + { + b = *p++; + length += b.length; + } + if (!length) + return null; + + a = gc_malloc(length * size, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + p = cast(byte[]*)(&n + 1); + + uint j = 0; + for (i = 0; i < n; i++) + { + b = *p++; + if (b.length) + { + memcpy(a + j, b.ptr, b.length * size); + j += b.length * size; + } + } + + byte[] result; + *cast(int *)&result = length; // jam length + (cast(void **)&result)[1] = a; // jam ptr + return result; +} + + +/** + * + */ +extern (C) void* _d_arrayliteralT(TypeInfo ti, size_t length, ...) +{ + auto sizeelem = ti.next.tsize(); // array element size + void* result; + + debug(PRINTF) printf("_d_arrayliteralT(sizeelem = %d, length = %d)\n", sizeelem, length); + if (length == 0 || sizeelem == 0) + result = null; + else + { + result = gc_malloc(length * sizeelem, !(ti.next.flags() & 1) ? BlkAttr.NO_SCAN : 0); + + va_list q; + va_start!(size_t)(q, length); + + size_t stacksize = (sizeelem + int.sizeof - 1) & ~(int.sizeof - 1); + + if (stacksize == sizeelem) + { + memcpy(result, q, length * sizeelem); + } + else + { + for (size_t i = 0; i < length; i++) + { + memcpy(result + i * sizeelem, q, sizeelem); + q += stacksize; + } + } + + va_end(q); + } + return result; +} + + +/** + * Support for array.dup property. + */ +struct Array2 +{ + size_t length; + void* ptr; +} + + +/** + * + */ +extern (C) long _adDupT(TypeInfo ti, Array2 a) +out (result) +{ + auto sizeelem = ti.next.tsize(); // array element size + assert(memcmp((*cast(Array2*)&result).ptr, a.ptr, a.length * sizeelem) == 0); +} +body +{ + Array2 r; + + if (a.length) + { + auto sizeelem = ti.next.tsize(); // array element size + auto size = a.length * sizeelem; + r.ptr = gc_malloc(size, !(ti.next.flags() & 1) ? 
BlkAttr.NO_SCAN : 0); + r.length = a.length; + memcpy(r.ptr, a.ptr, size); + } + return *cast(long*)(&r); +} + + +unittest +{ + int[] a; + int[] b; + int i; + + a = new int[3]; + a[0] = 1; a[1] = 2; a[2] = 3; + b = a.dup; + assert(b.length == 3); + for (i = 0; i < 3; i++) + assert(b[i] == i + 1); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/llmath.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/llmath.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,531 @@ +/** + * Support for 64-bit longs. + * + * Copyright: Copyright Digital Mars 1993 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright, Sean Kelly + * + * Copyright Digital Mars 2000 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#include + +#if __cplusplus +extern "C" { +#endif + +struct ClassInfo; +struct Vtbl; + +typedef struct Vtbl +{ + size_t len; + void **vptr; +} Vtbl; + +typedef struct Interface +{ + struct ClassInfo *classinfo; + struct Vtbl vtbl; + int offset; +} Interface; + +typedef struct Object +{ + void **vptr; + void *monitor; +} Object; + +typedef struct ClassInfo +{ + Object object; + + size_t initlen; + void *init; + + size_t namelen; + char *name; + + Vtbl vtbl; + + size_t interfacelen; + Interface *interfaces; + + struct ClassInfo *baseClass; + + void *destructor; + void *invariant; + + int flags; +} ClassInfo; + +typedef struct Throwable +{ + Object object; + + size_t msglen; + char* msg; + + size_t filelen; + char* file; + + size_t line; + + struct Interface *info; + struct Throwable *next; +} Throwable; + +typedef struct Array +{ + size_t length; + void *ptr; +} Array; + +typedef struct Delegate +{ + void *thisptr; + void (*funcptr)(); +} Delegate; + +void _d_monitorenter(Object *h); +void _d_monitorexit(Object *h); + +int _d_isbaseof(ClassInfo *b, ClassInfo *c); +Object *_d_dynamic_cast(Object *o, 
ClassInfo *ci); + +Object * _d_newclass(ClassInfo *ci); +void _d_delclass(Object **p); + +void _d_OutOfMemory(); + +#if __cplusplus +} +#endif diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/memory.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/memory.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,213 @@ +/** + * This module exposes functionality for inspecting and manipulating memory. + * + * Copyright: Copyright Digital Mars 2000 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright + * + * Copyright Digital Mars 2004 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.memset; + +extern (C) +{ + // Functions from the C library. + void *memcpy(void *, void *, size_t); +} + +extern (C): + +short *_memset16(short *p, short value, size_t count) +{ + short *pstart = p; + short *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +int *_memset32(int *p, int value, size_t count) +{ +version (X86) +{ + asm + { + mov EDI,p ; + mov EAX,value ; + mov ECX,count ; + mov EDX,EDI ; + rep ; + stosd ; + mov EAX,EDX ; + } +} +else +{ + int *pstart = p; + int *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} +} + +long *_memset64(long *p, long value, size_t count) +{ + long *pstart = p; + long *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +cdouble *_memset128(cdouble *p, cdouble value, size_t count) +{ + cdouble *pstart = p; + cdouble *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +real *_memset80(real *p, real value, size_t count) +{ + real *pstart = p; + real *ptop; + + for (ptop = &p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +creal *_memset160(creal *p, creal value, size_t count) +{ + creal *pstart = p; + creal *ptop; + + for (ptop = 
&p[count]; p < ptop; p++) + *p = value; + return pstart; +} + +void *_memsetn(void *p, void *value, int count, size_t sizelem) +{ void *pstart = p; + int i; + + for (i = 0; i < count; i++) + { + memcpy(p, value, sizelem); + p = cast(void *)(cast(char *)p + sizelem); + } + return pstart; +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/minit.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/minit.asm Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,85 @@ +;_ minit.asm +; Module initialization support. +; +; Copyright: Copyright Digital Mars 2000 - 2009. +; License: Boost License 1.0. + * Authors: Walter Bright, Sean Kelly + * + * Copyright Digital Mars 2000 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#include +#include +#include + +#if _WIN32 +#elif linux || __APPLE__ +#define USE_PTHREADS 1 +#else +#endif + +#if _WIN32 +#include +#endif + +#if USE_PTHREADS +#include +#endif + +#include "mars.h" + +// This is what the monitor reference in Object points to +typedef struct Monitor +{ + void* impl; // for user-level monitors + Array devt; // for internal monitors + +#if _WIN32 + CRITICAL_SECTION mon; +#endif + +#if USE_PTHREADS + pthread_mutex_t mon; +#endif +} Monitor; + +#define MONPTR(h) (&((Monitor *)(h)->monitor)->mon) + +static volatile int inited; + +/* =============================== Win32 ============================ */ + +#if _WIN32 + +static CRITICAL_SECTION _monitor_critsec; + +void _STI_monitor_staticctor() +{ + if (!inited) + { InitializeCriticalSection(&_monitor_critsec); + inited = 1; + } +} + +void _STD_monitor_staticdtor() +{ + if (inited) + { inited = 0; + DeleteCriticalSection(&_monitor_critsec); + } +} + +void _d_monitor_create(Object *h) +{ + /* + * NOTE: Assume this is only called when h->monitor is null prior to the + * call. 
However, please note that another thread may call this function + * at the same time, so we can not assert this here. Instead, try and + * create a lock, and if one already exists then forget about it. + */ + + //printf("+_d_monitor_create(%p)\n", h); + assert(h); + Monitor *cs = NULL; + EnterCriticalSection(&_monitor_critsec); + if (!h->monitor) + { + cs = (Monitor *)calloc(sizeof(Monitor), 1); + assert(cs); + InitializeCriticalSection(&cs->mon); + h->monitor = (void *)cs; + cs = NULL; + } + LeaveCriticalSection(&_monitor_critsec); + if (cs) + free(cs); + //printf("-_d_monitor_create(%p)\n", h); +} + +void _d_monitor_destroy(Object *h) +{ + //printf("+_d_monitor_destroy(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + DeleteCriticalSection(MONPTR(h)); + free((void *)h->monitor); + h->monitor = NULL; + //printf("-_d_monitor_destroy(%p)\n", h); +} + +int _d_monitor_lock(Object *h) +{ + //printf("+_d_monitor_acquire(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + EnterCriticalSection(MONPTR(h)); + //printf("-_d_monitor_acquire(%p)\n", h); +} + +void _d_monitor_unlock(Object *h) +{ + //printf("+_d_monitor_release(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + LeaveCriticalSection(MONPTR(h)); + //printf("-_d_monitor_release(%p)\n", h); +} + +#endif + +/* =============================== linux ============================ */ + +#if USE_PTHREADS + +#ifndef PTHREAD_MUTEX_RECURSIVE +# define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP +#endif + +// Includes attribute fixes from David Friedman's GDC port + +static pthread_mutex_t _monitor_critsec; +static pthread_mutexattr_t _monitors_attr; + +void _STI_monitor_staticctor() +{ + if (!inited) + { + pthread_mutexattr_init(&_monitors_attr); + pthread_mutexattr_settype(&_monitors_attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&_monitor_critsec, 0); + inited = 1; + } +} + +void _STD_monitor_staticdtor() +{ + if (inited) + { inited = 
0; + pthread_mutex_destroy(&_monitor_critsec); + pthread_mutexattr_destroy(&_monitors_attr); + } +} + +void _d_monitor_create(Object *h) +{ + /* + * NOTE: Assume this is only called when h->monitor is null prior to the + * call. However, please note that another thread may call this function + * at the same time, so we can not assert this here. Instead, try and + * create a lock, and if one already exists then forget about it. + */ + + //printf("+_d_monitor_create(%p)\n", h); + assert(h); + Monitor *cs = NULL; + pthread_mutex_lock(&_monitor_critsec); + if (!h->monitor) + { + cs = (Monitor *)calloc(sizeof(Monitor), 1); + assert(cs); + pthread_mutex_init(&cs->mon, & _monitors_attr); + h->monitor = (void *)cs; + cs = NULL; + } + pthread_mutex_unlock(&_monitor_critsec); + if (cs) + free(cs); + //printf("-_d_monitor_create(%p)\n", h); +} + +void _d_monitor_destroy(Object *h) +{ + //printf("+_d_monitor_destroy(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + pthread_mutex_destroy(MONPTR(h)); + free((void *)h->monitor); + h->monitor = NULL; + //printf("-_d_monitor_destroy(%p)\n", h); +} + +int _d_monitor_lock(Object *h) +{ + //printf("+_d_monitor_acquire(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + pthread_mutex_lock(MONPTR(h)); + //printf("-_d_monitor_acquire(%p)\n", h); +} + +void _d_monitor_unlock(Object *h) +{ + //printf("+_d_monitor_release(%p)\n", h); + assert(h && h->monitor && !(((Monitor*)h->monitor)->impl)); + pthread_mutex_unlock(MONPTR(h)); + //printf("-_d_monitor_release(%p)\n", h); +} + +#endif diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/obj.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/obj.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,34 @@ +/** + * Containts object comparator functions called by generated code. + * + * Copyright: Copyright Digital Mars 2002 - 2009. 
+ * License: thresh) // if more than _maxspan elements + { + //swap middle, base + ti.swap((cast(uint)(limit - base) >> 1) - + (((cast(uint)(limit - base) >> 1)) % width) + base, base); + + i = base + width; // i scans from left to right + j = limit - width; // j scans from right to left + + if (ti.compare(i, j) > 0) // Sedgewick's + ti.swap(i, j); // three-element sort + if (ti.compare(base, j) > 0) // sets things up + ti.swap(base, j); // so that + if (ti.compare(i, base) > 0) // *i <= *base <= *j + ti.swap(i, base); // *base is the pivot element + + while (1) + { + do // move i right until *i >= pivot + i += width; + while (ti.compare(i, base) < 0); + do // move j left until *j <= pivot + j -= width; + while (ti.compare(j, base) > 0); + if (i > j) // break loop if pointers crossed + break; + ti.swap(i, j); // else swap elements, keep scanning + } + ti.swap(base, j); // move pivot into correct place + if (j - base > limit - i) // if left subarray is larger... + { + sp[0] = base; // stack left subarray base + sp[1] = j; // and limit + base = i; // sort the right subarray + } + else // else right subarray is larger + { + sp[0] = i; // stack right subarray base + sp[1] = limit; // and limit + limit = j; // sort the left subarray + } + sp += 2; // increment stack pointer + assert(sp < cast(byte**)stack + stack.length); + } + + // Insertion sort on remaining subarray + i = base + width; + while (i < limit) + { + j = i; + while (j > base && ti.compare(j - width, j) > 0) + { + ti.swap(j - width, j); + j -= width; + } + i += width; + } + + if (sp > stack.ptr) // if any entries on stack... 
+ { + sp -= 2; // pop the base and limit + base = sp[0]; + limit = sp[1]; + } + else // else stack empty, all done + return *cast(long*)(&a); + } + assert(0); +} + + +unittest +{ + debug(qsort) printf("array.sort.unittest()\n"); + + int a[] = new int[10]; + + a[0] = 23; + a[1] = 1; + a[2] = 64; + a[3] = 5; + a[4] = 6; + a[5] = 5; + a[6] = 17; + a[7] = 3; + a[8] = 0; + a[9] = -1; + + a.sort; + + for (int i = 0; i < a.length - 1; i++) + { + //printf("i = %d", i); + //printf(" %d %d\n", a[i], a[i + 1]); + assert(a[i] <= a[i + 1]); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/qsort2.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/qsort2.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,71 @@ +/** + * This is a public domain version of qsort.d. All it does is call C's + * qsort(), but runs a little slower since it needs to synchronize a global + * variable. + * + * Copyright: Copyright Digital Mars 2000 - 2009. + * License: = 0); + assert(ca.length >= 0); + + // Make sure table[] is sorted correctly + int j; + + for (j = 1; j < table.length; j++) + { + int len1 = table[j - 1].length; + int len2 = table[j].length; + + assert(len1 <= len2); + if (len1 == len2) + { + int ci; + + ci = memcmp(table[j - 1].ptr, table[j].ptr, len1); + assert(ci < 0); // ci==0 means a duplicate + } + } +} +out (result) +{ + int i; + int cj; + + //printf("out _d_switch_string()\n"); + if (result == -1) + { + // Not found + for (i = 0; i < table.length; i++) + { + if (table[i].length == ca.length) + { cj = memcmp(table[i].ptr, ca.ptr, ca.length); + assert(cj != 0); + } + } + } + else + { + assert(0 <= result && result < table.length); + for (i = 0; 1; i++) + { + assert(i < table.length); + if (table[i].length == ca.length) + { + cj = memcmp(table[i].ptr, ca.ptr, ca.length); + if (cj == 0) + { + assert(i == result); + break; + } + } + } + } +} +body +{ + //printf("body _d_switch_string(%.*s)\n", ca); + int low; + int high; + int mid; + int c; + 
char[] pca; + + low = 0; + high = table.length; + + version (none) + { + // Print table + printf("ca[] = '%s'\n", cast(char *)ca); + for (mid = 0; mid < high; mid++) + { + pca = table[mid]; + printf("table[%d] = %d, '%.*s'\n", mid, pca.length, pca); + } + } + if (high && + ca.length >= table[0].length && + ca.length <= table[high - 1].length) + { + // Looking for 0 length string, which would only be at the beginning + if (ca.length == 0) + return 0; + + char c1 = ca[0]; + + // Do binary search + while (low < high) + { + mid = (low + high) >> 1; + pca = table[mid]; + c = ca.length - pca.length; + if (c == 0) + { + c = cast(ubyte)c1 - cast(ubyte)pca[0]; + if (c == 0) + { + c = memcmp(ca.ptr, pca.ptr, ca.length); + if (c == 0) + { //printf("found %d\n", mid); + return mid; + } + } + } + if (c < 0) + { + high = mid; + } + else + { + low = mid + 1; + } + } + } + + //printf("not found\n"); + return -1; // not found +} + +unittest +{ + switch (cast(char []) "c") + { + case "coo": + default: + break; + } +} + +/********************************** + * Same thing, but for wide chars. 
+ */
+
+/**
+ * Finds ca among the wchar[] case labels in table.
+ *
+ * The in contract requires table to be ordered by length first and, among
+ * entries of equal length, by memcmp order, with no duplicates.
+ *
+ * Returns:
+ *      the index of the entry equal to ca, or -1 if there is none
+ */
+int _d_switch_ustring(wchar[][] table, wchar[] ca)
+in
+{
+    assert(table.length >= 0);
+    assert(ca.length >= 0);
+
+    // Neighbouring entries must be strictly increasing: shorter first,
+    // then by raw memory comparison when the lengths tie.
+    for (int idx = 1; idx < table.length; idx++)
+    {
+        int prevLen = table[idx - 1].length;
+        int curLen = table[idx].length;
+
+        assert(prevLen <= curLen);
+        if (prevLen == curLen)
+        {
+            int order = memcmp(table[idx - 1].ptr, table[idx].ptr, prevLen * wchar.sizeof);
+            assert(order < 0);      // 0 would mean a duplicate label
+        }
+    }
+}
+out (result)
+{
+    if (result != -1)
+    {
+        // A hit must be the first entry whose contents equal ca.
+        assert(0 <= result && result < table.length);
+
+        int idx = 0;
+        while (true)
+        {
+            assert(idx < table.length);
+            if (table[idx].length == ca.length &&
+                memcmp(table[idx].ptr, ca.ptr, ca.length * wchar.sizeof) == 0)
+            {
+                assert(idx == result);
+                break;
+            }
+            idx++;
+        }
+    }
+    else
+    {
+        // A miss means no entry of matching length has the same contents.
+        for (int idx = 0; idx < table.length; idx++)
+        {
+            if (table[idx].length == ca.length)
+                assert(memcmp(table[idx].ptr, ca.ptr, ca.length * wchar.sizeof) != 0);
+        }
+    }
+}
+body
+{
+    int lo = 0;
+    int hi = table.length;
+
+    // Binary search over the half-open range [lo, hi)
+    while (lo < hi)
+    {
+        int probe = (lo + hi) >> 1;
+        wchar[] entry = table[probe];
+        int order = ca.length - entry.length;   // length decides first
+
+        if (order == 0)
+        {
+            // Same length: fall back to comparing the raw contents
+            order = memcmp(ca.ptr, entry.ptr, ca.length * wchar.sizeof);
+            if (order == 0)
+                return probe;
+        }
+
+        if (order < 0)
+            hi = probe;
+        else
+            lo = probe + 1;
+    }
+    return -1;              // not found
+}
+
+
+unittest
+{
+    // Exercises a wchar[] switch whose only label is longer than the selector.
+    switch (cast(wchar []) "c")
+    {
+         case "coo":
+         default:
+             break;
+    }
+}
+
+
+/**********************************
+ * Same thing, but for dchar (UTF-32) strings.
+ */
+
+/**
+ * Finds ca among the dchar[] case labels in table.
+ *
+ * The in contract requires table to be ordered by length first and, among
+ * entries of equal length, by memcmp order, with no duplicates.
+ *
+ * Returns:
+ *      the index of the entry equal to ca, or -1 if there is none
+ */
+int _d_switch_dstring(dchar[][] table, dchar[] ca)
+in
+{
+    assert(table.length >= 0);
+    assert(ca.length >= 0);
+
+    // Neighbouring entries must be strictly increasing: shorter first,
+    // then by raw memory comparison when the lengths tie.
+    for (int idx = 1; idx < table.length; idx++)
+    {
+        int prevLen = table[idx - 1].length;
+        int curLen = table[idx].length;
+
+        assert(prevLen <= curLen);
+        if (prevLen == curLen)
+        {
+            int order = memcmp(table[idx - 1].ptr, table[idx].ptr, prevLen * dchar.sizeof);
+            assert(order < 0);      // 0 would mean a duplicate label
+        }
+    }
+}
+out (result)
+{
+    if (result != -1)
+    {
+        // A hit must be the first entry whose contents equal ca.
+        assert(0 <= result && result < table.length);
+
+        int idx = 0;
+        while (true)
+        {
+            assert(idx < table.length);
+            if (table[idx].length == ca.length &&
+                memcmp(table[idx].ptr, ca.ptr, ca.length * dchar.sizeof) == 0)
+            {
+                assert(idx == result);
+                break;
+            }
+            idx++;
+        }
+    }
+    else
+    {
+        // A miss means no entry of matching length has the same contents.
+        for (int idx = 0; idx < table.length; idx++)
+        {
+            if (table[idx].length == ca.length)
+                assert(memcmp(table[idx].ptr, ca.ptr, ca.length * dchar.sizeof) != 0);
+        }
+    }
+}
+body
+{
+    int lo = 0;
+    int hi = table.length;
+
+    // Binary search over the half-open range [lo, hi)
+    while (lo < hi)
+    {
+        int probe = (lo + hi) >> 1;
+        dchar[] entry = table[probe];
+        int order = ca.length - entry.length;   // length decides first
+
+        if (order == 0)
+        {
+            // Same length: fall back to comparing the raw contents
+            order = memcmp(ca.ptr, entry.ptr, ca.length * dchar.sizeof);
+            if (order == 0)
+                return probe;
+        }
+
+        if (order < 0)
+            hi = probe;
+        else
+            lo = probe + 1;
+    }
+    return -1;              // not found
+}
+
+
+unittest
+{
+    // Exercises a dchar[] switch whose only label is longer than the selector.
+    switch (cast(dchar []) "c")
+    {
+         case "coo":
+         default:
+             break;
+    }
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/tls.S --- /dev/null Thu Jan 01 00:00:00 
1970 +0000 +++ b/druntime/src/compiler/ldc/tls.S Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,44 @@ +/** + * Contains support code for thread-local storage. + * + * Copyright: Copyright Digital Mars 2008 - 2009. + * License: = count) + { uint u2; + uint c2; + + u2 = (u + 1 < num) ? u + 1 : u; + c2 = base[u2].count; + if (c2 < count) + c2 = count; + trace_place(base[u].sym,c2); + } + else + break; + } + + // Clean up + trace_free(base); + } +} + +///////////////////////////////////// +// Initialize and terminate. + +static this() +{ + trace_init(); +} + +static ~this() +{ + trace_term(); +} + +/////////////////////////////////// +// Report results. +// Also compute nsymbols. + +static void trace_report(Symbol* s) +{ SymPair* sp; + uint count; + + //printf("trace_report()\n"); + while (s) + { nsymbols++; + if (s.Sl) + trace_report(s.Sl); + fprintf(fplog,"------------------\n"); + count = 0; + for (sp = s.Sfanin; sp; sp = sp.next) + { + fprintf(fplog,"\t%5d\t%.*s\n", sp.count, sp.sym.Sident); + count += sp.count; + } + fprintf(fplog,"%.*s\t%u\t%lld\t%lld\n",s.Sident,count,s.totaltime,s.functime); + for (sp = s.Sfanout; sp; sp = sp.next) + { + fprintf(fplog,"\t%5d\t%.*s\n",sp.count,sp.sym.Sident); + } + s = s.Sr; + } +} + +//////////////////////////////////// +// Allocate and fill array of symbols. + +static void trace_array(Symbol *s) +{ + __gshared uint u; + + if (!psymbols) + { u = 0; + psymbols = cast(Symbol **)trace_malloc((Symbol *).sizeof * nsymbols); + } + while (s) + { + psymbols[u++] = s; + trace_array(s.Sl); + s = s.Sr; + } +} + + +////////////////////////////////////// +// Qsort() comparison routine for array of pointers to Symbol's. + +static int symbol_cmp(in void* e1, in void* e2) +{ Symbol** ps1; + Symbol** ps2; + timer_t diff; + + ps1 = cast(Symbol **)e1; + ps2 = cast(Symbol **)e2; + + diff = (*ps2).functime - (*ps1).functime; + return (diff == 0) ? 0 : ((diff > 0) ? 
1 : -1); +} + + +/////////////////////////////////// +// Report function timings + +static void trace_times(Symbol* root) +{ uint u; + timer_t freq; + + // Sort array + qsort(psymbols, nsymbols, (Symbol *).sizeof, &symbol_cmp); + + // Print array + QueryPerformanceFrequency(&freq); + fprintf(fplog,"\n======== Timer Is %lld Ticks/Sec, Times are in Microsecs ========\n\n",freq); + fprintf(fplog," Num Tree Func Per\n"); + fprintf(fplog," Calls Time Time Call\n\n"); + for (u = 0; u < nsymbols; u++) + { Symbol* s = psymbols[u]; + timer_t tl,tr; + timer_t fl,fr; + timer_t pl,pr; + timer_t percall; + SymPair* sp; + uint calls; + char[] id; + + version (Windows) + { + char* p = (s.Sident ~ '\0').ptr; + p = unmangle_ident(p); + if (p) + id = p[0 .. strlen(p)]; + } + if (!id) + id = s.Sident; + calls = 0; + for (sp = s.Sfanin; sp; sp = sp.next) + calls += sp.count; + if (calls == 0) + calls = 1; + +version (all) +{ + tl = (s.totaltime * 1000000) / freq; + fl = (s.functime * 1000000) / freq; + percall = s.functime / calls; + pl = (s.functime * 1000000) / calls / freq; + + fprintf(fplog,"%7d%12lld%12lld%12lld %.*s\n", + calls,tl,fl,pl,id); +} +else +{ + tl = s.totaltime / freq; + tr = ((s.totaltime - tl * freq) * 10000000) / freq; + + fl = s.functime / freq; + fr = ((s.functime - fl * freq) * 10000000) / freq; + + percall = s.functime / calls; + pl = percall / freq; + pr = ((percall - pl * freq) * 10000000) / freq; + + fprintf(fplog,"%7d\t%3lld.%07lld\t%3lld.%07lld\t%3lld.%07lld\t%.*s\n", + calls,tl,tr,fl,fr,pl,pr,id); +} + if (id !is s.Sident) + free(id.ptr); + } +} + + +/////////////////////////////////// +// Initialize. + +static void trace_init() +{ + if (!trace_inited) + { + trace_inited = 1; + + { // See if we can determine the overhead. 
+ uint u; + timer_t starttime; + timer_t endtime; + Stack *st; + + st = trace_tos; + trace_tos = null; + QueryPerformanceCounter(&starttime); + for (u = 0; u < 100; u++) + { + asm + { + call _trace_pro_n ; + db 0 ; + call _trace_epi_n ; + } + } + QueryPerformanceCounter(&endtime); + trace_ohd = (endtime - starttime) / u; + //printf("trace_ohd = %lld\n",trace_ohd); + if (trace_ohd > 0) + trace_ohd--; // round down + trace_tos = st; + } + } +} + +///////////////////////////////// +// Terminate. + +void trace_term() +{ + //printf("trace_term()\n"); + if (trace_inited == 1) + { Stack *n; + + trace_inited = 2; + + // Free remainder of the stack + while (trace_tos) + { + n = trace_tos.prev; + stack_free(trace_tos); + trace_tos = n; + } + + while (stack_freelist) + { + n = stack_freelist.prev; + stack_free(stack_freelist); + stack_freelist = n; + } + + // Merge in data from any existing file + trace_merge(); + + // Report results + fplog = fopen(trace_logfilename.ptr, "w"); + if (fplog) + { nsymbols = 0; + trace_report(root); + trace_array(root); + trace_times(root); + fclose(fplog); + } + + // Output function link order + fpdef = fopen(trace_deffilename.ptr,"w"); + if (fpdef) + { fprintf(fpdef,"\nFUNCTIONS\n"); + trace_order(root); + fclose(fpdef); + } + + trace_free(psymbols); + psymbols = null; + } +} + +///////////////////////////////// +// Our storage allocator. + +static void *trace_malloc(size_t nbytes) +{ void *p; + + p = malloc(nbytes); + if (!p) + exit(EXIT_FAILURE); + return p; +} + +static void trace_free(void *p) +{ + free(p); +} + +////////////////////////////////////////////// +// + +static Symbol* trace_addsym(char[] id) +{ + Symbol** parent; + Symbol* rover; + Symbol* s; + int cmp; + char c; + + //printf("trace_addsym('%s',%d)\n",p,len); + parent = &root; + rover = *parent; + while (rover !is null) // while we haven't run out of tree + { + cmp = dstrcmp(id, rover.Sident); + if (cmp == 0) + { + return rover; + } + parent = (cmp < 0) ? 
/* if we go down left side */ + &(rover.Sl) : /* then get left child */ + &(rover.Sr); /* else get right child */ + rover = *parent; /* get child */ + } + /* not in table, so insert into table */ + s = cast(Symbol *)trace_malloc(Symbol.sizeof); + memset(s,0,Symbol.sizeof); + s.Sident = id; + *parent = s; // link new symbol into tree + return s; +} + +/*********************************** + * Add symbol s with count to SymPair list. + */ + +static void trace_sympair_add(SymPair** psp, Symbol* s, uint count) +{ SymPair* sp; + + for (; 1; psp = &sp.next) + { + sp = *psp; + if (!sp) + { + sp = cast(SymPair *)trace_malloc(SymPair.sizeof); + sp.sym = s; + sp.count = 0; + sp.next = null; + *psp = sp; + break; + } + else if (sp.sym == s) + { + break; + } + } + sp.count += count; +} + +////////////////////////////////////////////// +// + +static void trace_pro(char[] id) +{ + Stack* n; + Symbol* s; + timer_t starttime; + timer_t t; + + QueryPerformanceCounter(&starttime); + if (id.length == 0) + return; + if (!trace_inited) + trace_init(); // initialize package + n = stack_malloc(); + n.prev = trace_tos; + trace_tos = n; + s = trace_addsym(id); + trace_tos.sym = s; + if (trace_tos.prev) + { + Symbol* prev; + int i; + + // Accumulate Sfanout and Sfanin + prev = trace_tos.prev.sym; + trace_sympair_add(&prev.Sfanout,s,1); + trace_sympair_add(&s.Sfanin,prev,1); + } + QueryPerformanceCounter(&t); + trace_tos.starttime = starttime; + trace_tos.ohd = trace_ohd + t - starttime; + trace_tos.subtime = 0; + //printf("trace_tos.ohd=%lld, trace_ohd=%lld + t=%lld - starttime=%lld\n", + // trace_tos.ohd,trace_ohd,t,starttime); +} + +///////////////////////////////////////// +// + +static void trace_epi() +{ Stack* n; + timer_t endtime; + timer_t t; + timer_t ohd; + + //printf("trace_epi()\n"); + if (trace_tos) + { + timer_t starttime; + timer_t totaltime; + + QueryPerformanceCounter(&endtime); + starttime = trace_tos.starttime; + totaltime = endtime - starttime - trace_tos.ohd; + if 
(totaltime < 0) + { //printf("endtime=%lld - starttime=%lld - trace_tos.ohd=%lld < 0\n", + // endtime,starttime,trace_tos.ohd); + totaltime = 0; // round off error, just make it 0 + } + + // totaltime is time spent in this function + all time spent in + // subfunctions - bookkeeping overhead. + trace_tos.sym.totaltime += totaltime; + + //if (totaltime < trace_tos.subtime) + //printf("totaltime=%lld < trace_tos.subtime=%lld\n",totaltime,trace_tos.subtime); + trace_tos.sym.functime += totaltime - trace_tos.subtime; + ohd = trace_tos.ohd; + n = trace_tos.prev; + stack_free(trace_tos); + trace_tos = n; + if (n) + { QueryPerformanceCounter(&t); + n.ohd += ohd + t - endtime; + n.subtime += totaltime; + //printf("n.ohd = %lld\n",n.ohd); + } + } +} + + +////////////////////////// FILE INTERFACE ///////////////////////// + +///////////////////////////////////// +// Read line from file fp. +// Returns: +// trace_malloc'd line buffer +// null if end of file + +static char* trace_readline(FILE* fp) +{ int c; + int dim; + int i; + char *buf; + + //printf("trace_readline(%p)\n", fp); + i = 0; + dim = 0; + buf = null; + while (1) + { + if (i == dim) + { char *p; + + dim += 80; + p = cast(char *)trace_malloc(dim); + memcpy(p,buf,i); + trace_free(buf); + buf = p; + } + c = fgetc(fp); + switch (c) + { + case EOF: + if (i == 0) + { trace_free(buf); + return null; + } + case '\n': + goto L1; + default: + break; + } + buf[i] = cast(char)c; + i++; + } +L1: + buf[i] = 0; + //printf("line '%s'\n",buf); + return buf; +} + +////////////////////////////////////// +// Skip space + +static char *skipspace(char *p) +{ + while (isspace(*p)) + p++; + return p; +} + +//////////////////////////////////////////////////////// +// Merge in profiling data from existing file. 
+ +static void trace_merge() +{ FILE *fp; + char *buf; + char *p; + uint count; + Symbol *s; + SymPair *sfanin; + SymPair **psp; + + if (trace_logfilename && (fp = fopen(trace_logfilename.ptr,"r")) !is null) + { + buf = null; + sfanin = null; + psp = &sfanin; + while (1) + { + trace_free(buf); + buf = trace_readline(fp); + if (!buf) + break; + switch (*buf) + { + case '=': // ignore rest of file + trace_free(buf); + goto L1; + case ' ': + case '\t': // fan in or fan out line + count = strtoul(buf,&p,10); + if (p == buf) // if invalid conversion + continue; + p = skipspace(p); + if (!*p) + continue; + s = trace_addsym(p[0 .. strlen(p)]); + trace_sympair_add(psp,s,count); + break; + default: + if (!isalpha(*buf)) + { + if (!sfanin) + psp = &sfanin; + continue; // regard unrecognized line as separator + } + case '?': + case '_': + case '$': + case '@': + p = buf; + while (isgraph(*p)) + p++; + *p = 0; + //printf("trace_addsym('%s')\n",buf); + s = trace_addsym(buf[0 .. strlen(buf)]); + if (s.Sfanin) + { SymPair *sp; + + for (; sfanin; sfanin = sp) + { + trace_sympair_add(&s.Sfanin,sfanin.sym,sfanin.count); + sp = sfanin.next; + trace_free(sfanin); + } + } + else + { s.Sfanin = sfanin; + } + sfanin = null; + psp = &s.Sfanout; + + { timer_t t; + + p++; + count = strtoul(p,&p,10); + t = cast(long)strtoull(p,&p,10); + s.totaltime += t; + t = cast(long)strtoull(p,&p,10); + s.functime += t; + } + break; + } + } + L1: + fclose(fp); + } +} + +////////////////////////// COMPILER INTERFACE ///////////////////// + +///////////////////////////////////////////// +// Function called by trace code in function prolog. 
+ +void _trace_pro_n() +{ + /* Length of string is either: + * db length + * ascii string + * or: + * db 0x0FF + * db 0 + * dw length + * ascii string + */ + + version (OSX) + { // 16 byte align stack + asm + { naked ; + pushad ; + mov ECX,8*4[ESP] ; + xor EAX,EAX ; + mov AL,[ECX] ; + cmp AL,0xFF ; + jne L1 ; + cmp byte ptr 1[ECX],0 ; + jne L1 ; + mov AX,2[ECX] ; + add 8*4[ESP],3 ; + add ECX,3 ; + L1: inc EAX ; + inc ECX ; + add 8*4[ESP],EAX ; + dec EAX ; + sub ESP,4 ; + push ECX ; + push EAX ; + call trace_pro ; + add ESP,12 ; + popad ; + ret ; + } + } + else + { + asm + { naked ; + pushad ; + mov ECX,8*4[ESP] ; + xor EAX,EAX ; + mov AL,[ECX] ; + cmp AL,0xFF ; + jne L1 ; + cmp byte ptr 1[ECX],0 ; + jne L1 ; + mov AX,2[ECX] ; + add 8*4[ESP],3 ; + add ECX,3 ; + L1: inc EAX ; + inc ECX ; + add 8*4[ESP],EAX ; + dec EAX ; + push ECX ; + push EAX ; + call trace_pro ; + add ESP,8 ; + popad ; + ret ; + } + } +} + +///////////////////////////////////////////// +// Function called by trace code in function epilog. 
+ + +void _trace_epi_n() +{ + version (OSX) + { // 16 byte align stack + asm + { naked ; + pushad ; + sub ESP,12 ; + } + trace_epi(); + asm + { + add ESP,12 ; + popad ; + ret ; + } + } + else + { + asm + { naked ; + pushad ; + } + trace_epi(); + asm + { + popad ; + ret ; + } + } +} + + +version (Windows) +{ + extern (Windows) + { + export int QueryPerformanceCounter(timer_t *); + export int QueryPerformanceFrequency(timer_t *); + } +} +else version (X86) +{ + extern (D) + { + void QueryPerformanceCounter(timer_t* ctr) + { + asm + { naked ; + mov ECX,EAX ; + rdtsc ; + mov [ECX],EAX ; + mov 4[ECX],EDX ; + ret ; + } + } + + void QueryPerformanceFrequency(timer_t* freq) + { + *freq = 3579545; + } + } +} +else +{ + static assert(0); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_AC.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_AC.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,107 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: 0) + c = 1; + return c; + } + + override size_t tsize() + { + return (Object[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(Object); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Acdouble.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Acdouble.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,94 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (cdouble[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(cdouble); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Acfloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Acfloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,92 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (cfloat[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(cfloat); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Acreal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Acreal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,95 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (creal[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(creal); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Adouble.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Adouble.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,104 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (double[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(double); + } +} + +// idouble[] + +class TypeInfo_Ap : TypeInfo_Ad +{ + override string toString() { return "idouble[]"; } + + override TypeInfo next() + { + return typeid(idouble); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Afloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Afloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,103 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (float[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(float); + } +} + +// ifloat[] + +class TypeInfo_Ao : TypeInfo_Af +{ + override string toString() { return "ifloat[]"; } + + override TypeInfo next() + { + return typeid(ifloat); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Ag.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Ag.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,228 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (byte[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(byte); + } +} + + +// ubyte[] + +class TypeInfo_Ah : TypeInfo_Ag +{ + override string toString() { return "ubyte[]"; } + + override int compare(in void* p1, in void* p2) + { + char[] s1 = *cast(char[]*)p1; + char[] s2 = *cast(char[]*)p2; + + return dstrcmp(s1, s2); + } + + override TypeInfo next() + { + return typeid(ubyte); + } +} + +// void[] + +class TypeInfo_Av : TypeInfo_Ah +{ + override string toString() { return "void[]"; } + + override TypeInfo next() + { + return typeid(void); + } +} + +// bool[] + +class TypeInfo_Ab : TypeInfo_Ah +{ + override string toString() { return "bool[]"; } + + override TypeInfo next() + { + return typeid(bool); + } +} + +// char[] + +class TypeInfo_Aa : TypeInfo_Ag +{ + override string toString() { return "char[]"; } + + override hash_t getHash(in void* p) + { char[] s = *cast(char[]*)p; + hash_t hash = 0; + +version (all) +{ + foreach (char c; s) + hash = hash * 11 + c; +} +else +{ + size_t len = s.length; + char *str = s; + + while (1) + { + switch (len) + { + case 0: + return hash; + + case 1: + hash *= 9; + hash += *cast(ubyte *)str; + return hash; + + case 2: + hash *= 9; + hash += *cast(ushort *)str; + return hash; + + case 3: + hash *= 9; + hash += (*cast(ushort *)str << 8) + + (cast(ubyte *)str)[2]; + return hash; + + default: + hash *= 9; + hash += *cast(uint *)str; + str += 4; + len -= 4; + break; + } + } +} + return hash; + } + + override TypeInfo next() + { + return typeid(char); + } +} + +// string + +class TypeInfo_Aya : TypeInfo_Aa +{ + override string toString() { return "immutable(char)[]"; } + + override TypeInfo next() + { + return typeid(immutable(char)); + } +} + diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Aint.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/druntime/src/compiler/ldc/typeinfo/ti_Aint.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,140 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (int[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(int); + } +} + +unittest +{ + int[][] a = [[5,3,8,7], [2,5,3,8,7]]; + a.sort; + assert(a == [[2,5,3,8,7], [5,3,8,7]]); + + a = [[5,3,8,7], [5,3,8]]; + a.sort; + assert(a == [[5,3,8], [5,3,8,7]]); +} + +// uint[] + +class TypeInfo_Ak : TypeInfo_Ai +{ + override string toString() { return "uint[]"; } + + override int compare(in void* p1, in void* p2) // lexicographic three-way compare of two uint[]; on a common prefix the shorter array sorts first + { + uint[] s1 = *cast(uint[]*)p1; + uint[] s2 = *cast(uint[]*)p2; + size_t len = s1.length; + + if (s2.length < len) + len = s2.length; + for (size_t u = 0; u < len; u++) + { + // Order the elements as unsigned values; s1[u] - s2[u] can wrap and flip the sign. + if (s1[u] != s2[u]) + return s1[u] < s2[u] ? -1 : 1; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override TypeInfo next() + { + return typeid(uint); + } +} + +// dchar[] + +class TypeInfo_Aw : TypeInfo_Ak +{ + override string toString() { return "dchar[]"; } + + override TypeInfo next() + { + return typeid(dchar); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Along.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Along.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,120 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2[u]) + return 1; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (long[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(long); + } +} + + +// ulong[] + +class TypeInfo_Am : TypeInfo_Al +{ + override string toString() { return "ulong[]"; } + + override int compare(in void* p1, in void* p2) + { + ulong[] s1 = *cast(ulong[]*)p1; + ulong[] s2 = *cast(ulong[]*)p2; + size_t len = s1.length; + + if (s2.length < len) + len = s2.length; + for (size_t u = 0; u < len; u++) + { + if (s1[u] < s2[u]) + return -1; + else if (s1[u] > s2[u]) + return 1; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override TypeInfo next() + { + return typeid(ulong); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Areal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Areal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,105 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (real[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(real); + } +} + +// ireal[] + +class TypeInfo_Aj : TypeInfo_Ae +{ + override string toString() { return "ireal[]"; } + + override TypeInfo next() + { + return typeid(ireal); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_Ashort.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_Ashort.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,143 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: s2.length) + return 1; + return 0; + } + + override size_t tsize() + { + return (short[]).sizeof; + } + + override uint flags() + { + return 1; + } + + override TypeInfo next() + { + return typeid(short); + } +} + + +// ushort[] + +class TypeInfo_At : TypeInfo_As +{ + override string toString() { return "ushort[]"; } + + override int compare(in void* p1, in void* p2) + { + ushort[] s1 = *cast(ushort[]*)p1; + ushort[] s2 = *cast(ushort[]*)p2; + size_t len = s1.length; + + if (s2.length < len) + len = s2.length; + for (size_t u = 0; u < len; u++) + { + int result = s1[u] - s2[u]; + if (result) + return result; + } + if (s1.length < s2.length) + return -1; + else if (s1.length > s2.length) + return 1; + return 0; + } + + override TypeInfo next() + { + return typeid(ushort); + } +} + +// wchar[] + +class TypeInfo_Au : TypeInfo_At +{ + override string toString() { return "wchar[]"; } + + override TypeInfo next() + { + return typeid(wchar); + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_C.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_C.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,63 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: f2.re) + result = 1; + else if (f1.im < f2.im) + result = -1; + else if (f1.im > f2.im) + result = 1; + else + result = 0; + return result; + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(cdouble *)p1, *cast(cdouble *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(cdouble *)p1, *cast(cdouble *)p2); + } + + override size_t tsize() + { + return cdouble.sizeof; + } + + override void swap(void *p1, void *p2) + { + cdouble t; + + t = *cast(cdouble *)p1; + *cast(cdouble *)p1 = *cast(cdouble *)p2; + *cast(cdouble *)p2 = t; + } + + override void[] init() + { static immutable cdouble r; + + return (cast(cdouble *)&r)[0 .. 
1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_cfloat.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_cfloat.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,76 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: f2.re) + result = 1; + else if (f1.im < f2.im) + result = -1; + else if (f1.im > f2.im) + result = 1; + else + result = 0; + return result; + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(cfloat *)p1, *cast(cfloat *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(cfloat *)p1, *cast(cfloat *)p2); + } + + override size_t tsize() + { + return cfloat.sizeof; + } + + override void swap(void *p1, void *p2) + { + cfloat t; + + t = *cast(cfloat *)p1; + *cast(cfloat *)p1 = *cast(cfloat *)p2; + *cast(cfloat *)p2 = t; + } + + override void[] init() + { static immutable cfloat r; + + return (cast(cfloat *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_char.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_char.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,55 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: f2.re) + result = 1; + else if (f1.im < f2.im) + result = -1; + else if (f1.im > f2.im) + result = 1; + else + result = 0; + return result; + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(creal *)p1, *cast(creal *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(creal *)p1, *cast(creal *)p2); + } + + override size_t tsize() + { + return creal.sizeof; + } + + override void swap(void *p1, void *p2) + { + creal t; + + t = *cast(creal *)p1; + *cast(creal *)p1 = *cast(creal *)p2; + *cast(creal *)p2 = t; + } + + override void[] init() + { static immutable creal r; + + return (cast(creal *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_dchar.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_dchar.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,55 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright + * + * Copyright Digital Mars 2004 - 2009. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.typeinfo.ti_double; + +// double + +class TypeInfo_d : TypeInfo +{ + override string toString() { return "double"; } + + override hash_t getHash(in void* p) + { + return (cast(uint *)p)[0] + (cast(uint *)p)[1]; + } + + static equals_t _equals(double f1, double f2) + { + return f1 == f2 || + (f1 !<>= f1 && f2 !<>= f2); + } + + static int _compare(double d1, double d2) + { + if (d1 !<>= d2) // if either are NaN + { + if (d1 !<>= d1) + { if (d2 !<>= d2) + return 0; + return -1; + } + return 1; + } + return (d1 == d2) ? 0 : ((d1 < d2) ? 
-1 : 1); + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(double *)p1, *cast(double *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(double *)p1, *cast(double *)p2); + } + + override size_t tsize() + { + return double.sizeof; + } + + override void swap(void *p1, void *p2) + { + double t; + + t = *cast(double *)p1; + *cast(double *)p1 = *cast(double *)p2; + *cast(double *)p2 = t; + } + + override void[] init() + { static immutable double r; + + return (cast(double *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_float.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_float.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,75 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: = f1 && f2 !<>= f2); + } + + static int _compare(float d1, float d2) + { + if (d1 !<>= d2) // if either are NaN + { + if (d1 !<>= d1) + { if (d2 !<>= d2) + return 0; + return -1; + } + return 1; + } + return (d1 == d2) ? 0 : ((d1 < d2) ? -1 : 1); + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(float *)p1, *cast(float *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(float *)p1, *cast(float *)p2); + } + + override size_t tsize() + { + return float.sizeof; + } + + override void swap(void *p1, void *p2) + { + float t; + + t = *cast(float *)p1; + *cast(float *)p1 = *cast(float *)p2; + *cast(float *)p2 = t; + } + + override void[] init() + { static immutable float r; + + return (cast(float *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_idouble.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_idouble.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,22 @@ +/** + * TypeInfo support code. 
+ * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: *cast(int*) p2) + return 1; + return 0; + } + + override size_t tsize() + { + return int.sizeof; + } + + override void swap(void *p1, void *p2) + { + int t; + + t = *cast(int *)p1; + *cast(int *)p1 = *cast(int *)p2; + *cast(int *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_ireal.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_ireal.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,22 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: *cast(long *)p2) + return 1; + return 0; + } + + override size_t tsize() + { + return long.sizeof; + } + + override void swap(void *p1, void *p2) + { + long t; + + t = *cast(long *)p1; + *cast(long *)p1 = *cast(long *)p2; + *cast(long *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_ptr.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_ptr.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,57 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright + * + * Copyright Digital Mars 2004 - 2009. + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module rt.typeinfo.ti_real; + +// real + +class TypeInfo_e : TypeInfo +{ + override string toString() { return "real"; } + + override hash_t getHash(in void* p) + { + return (cast(uint *)p)[0] + (cast(uint *)p)[1] + (cast(ushort *)p)[4]; + } + + static equals_t _equals(real f1, real f2) + { + return f1 == f2 || + (f1 !<>= f1 && f2 !<>= f2); + } + + static int _compare(real d1, real d2) + { + if (d1 !<>= d2) // if either are NaN + { + if (d1 !<>= d1) + { if (d2 !<>= d2) + return 0; + return -1; + } + return 1; + } + return (d1 == d2) ? 0 : ((d1 < d2) ? -1 : 1); + } + + override equals_t equals(in void* p1, in void* p2) + { + return _equals(*cast(real *)p1, *cast(real *)p2); + } + + override int compare(in void* p1, in void* p2) + { + return _compare(*cast(real *)p1, *cast(real *)p2); + } + + override size_t tsize() + { + return real.sizeof; + } + + override void swap(void *p1, void *p2) + { + real t; + + t = *cast(real *)p1; + *cast(real *)p1 = *cast(real *)p2; + *cast(real *)p2 = t; + } + + override void[] init() + { static immutable real r; + + return (cast(real *)&r)[0 .. 1]; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_short.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_short.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,49 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. 
+ * License: *cast(uint*) p2) + return 1; + return 0; + } + + override size_t tsize() + { + return uint.sizeof; + } + + override void swap(void *p1, void *p2) + { + int t; + + t = *cast(uint *)p1; + *cast(uint *)p1 = *cast(uint *)p2; + *cast(uint *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_ulong.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_ulong.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,53 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: *cast(ulong *)p2) + return 1; + return 0; + } + + override size_t tsize() + { + return ulong.sizeof; + } + + override void swap(void *p1, void *p2) + { + ulong t; + + t = *cast(ulong *)p1; + *cast(ulong *)p1 = *cast(ulong *)p2; + *cast(ulong *)p2 = t; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/typeinfo/ti_ushort.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/typeinfo/ti_ushort.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,49 @@ +/** + * TypeInfo support code. + * + * Copyright: Copyright Digital Mars 2004 - 2009. + * License: =0x48 && x<=0x80) + || x==0x86 || x==0x87 + || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E)){ + datacache[level].lineSize = 64; + } else datacache[level].lineSize = 32; + } + } + } + + uint[4] a; + bool firstTime = true; + // On a multi-core system, this could theoretically fail, but it's only used + // for old single-core CPUs. + uint numinfos = 1; + do { + asm { + mov EAX, 2; + cpuid; + mov a, EAX; + mov a+4, EBX; + mov a+8, ECX; + mov a+12, EDX; + } + if (firstTime) { + if (a[0]==0x0000_7001 && a[3]==0x80 && a[1]==0 && a[2]==0) { + // Cyrix MediaGX MMXEnhanced returns: EAX= 00007001, EDX=00000080. 
+ // These are NOT standard Intel values + // (TLB = 32 entry, 4 way associative, 4K pages) + // (L1 cache = 16K, 4way, linesize16) + datacache[0].size=8; + datacache[0].associativity=4; + datacache[0].lineSize=16; + return; + } + // lsb of a is how many times to loop. + numinfos = a[0] & 0xFF; + // and otherwise it should be ignored + a[0] &= 0xFFFF_FF00; + firstTime = false; + } + for (int c=0; c<4;++c) { + // high bit set == no info. + if (a[c] & 0x8000_0000) continue; + decipherCpuid2(cast(ubyte)(a[c] & 0xFF)); + decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF)); + decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF)); + decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF)); + } + } while (--numinfos); +} + +// CPUID4: "Deterministic cache parameters" leaf +void getcacheinfoCPUID4() +{ + int cachenum = 0; + for(;;) { + uint a, b, number_of_sets; + asm { + mov EAX, 4; + mov ECX, cachenum; + cpuid; + mov a, EAX; + mov b, EBX; + mov number_of_sets, ECX; + } + ++cachenum; + if ((a&0x1F)==0) break; // no more caches + uint numthreads = ((a>>14) & 0xFFF) + 1; + uint numcores = ((a>>26) & 0x3F) + 1; + if (numcores > maxCores) maxCores = numcores; + if ((a&0x1F)!=1 && ((a&0x1F)!=3)) continue; // we only want data & unified caches + + ++number_of_sets; + ubyte level = cast(ubyte)(((a>>5)&7)-1); + if (level >= datacache.length) continue; // ignore deep caches; level == length would index one past the end + datacache[level].associativity = a & 0x200 ? ubyte.max :cast(ubyte)((b>>22)+1); + datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size + uint line_partitions = ((b >> 12)& 0x3FF) + 1; + // Size = number of sets * associativity * cachelinesize * linepartitions + // and must convert to Kb, also dividing by the number of hyperthreads using this cache. + ulong sz = (datacache[level].associativity< ubyte.max)? 
number_of_sets * + datacache[level].associativity : number_of_sets; + datacache[level].size = cast(uint)( + (sz * datacache[level].lineSize * line_partitions ) / (numthreads *1024)); + if (level == 0 && (a&0xF)==3) { + // Halve the size for unified L1 caches + datacache[level].size/=2; + } + } +} + +// CPUID8000_0005 & 6 +void getAMDcacheinfo() +{ + uint c5, c6, d6; + asm { + mov EAX, 0x8000_0005; // L1 cache + cpuid; + // EAX has L1_TLB_4M. + // EBX has L1_TLB_4K + // EDX has L1 instruction cache + mov c5, ECX; + } + + datacache[0].size = ( (c5>>24) & 0xFF); + datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF); + datacache[0].lineSize = c5 & 0xFF; + + if (max_extended_cpuid >= 0x8000_0006) { + // AMD K6-III or K6-2+ or later. + ubyte numcores = 1; + if (max_extended_cpuid >=0x8000_0008) { + asm { + mov EAX, 0x8000_0008; + cpuid; + mov numcores, CL; + } + ++numcores; + if (numcores>maxCores) maxCores = numcores; + } + asm { + mov EAX, 0x8000_0006; // L2/L3 cache + cpuid; + mov c6, ECX; // L2 cache info + mov d6, EDX; // L3 cache info + } + + immutable ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ]; + datacache[1].size = (c6>>16) & 0xFFFF; + datacache[1].associativity = assocmap[(c6>>12)&0xF]; + datacache[1].lineSize = c6 & 0xFF; + + // The L3 cache value is TOTAL, not per core. + datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1. 
+ datacache[2].associativity = assocmap[(d6>>12)&0xF]; + datacache[2].lineSize = d6 & 0xFF; + } +} + + +void cpuidX86() +{ + char * venptr = vendorID.ptr; + asm { + mov EAX, 0; + cpuid; + mov max_cpuid, EAX; + mov EAX, venptr; + mov [EAX], EBX; + mov [EAX + 4], EDX; + mov [EAX + 8], ECX; + mov EAX, 0x8000_0000; + cpuid; + mov max_extended_cpuid, EAX; + } + + probablyIntel = vendorID == "GenuineIntel"; + probablyAMD = vendorID == "AuthenticAMD"; + uint a, b, c, d; + uint apic = 0; // brand index, apic id + asm { + mov EAX, 1; // model, stepping + cpuid; + mov a, EAX; + mov apic, EBX; + mov miscfeatures, ECX; + mov features, EDX; + } + amdfeatures = 0; + amdmiscfeatures = 0; + if (max_extended_cpuid >= 0x8000_0001) { + asm { + mov EAX, 0x8000_0001; + cpuid; + mov amdmiscfeatures, ECX; + mov amdfeatures, EDX; + } + } + // Try to detect fraudulent vendorIDs + if (amd3dnow) probablyIntel = false; + + stepping = a & 0xF; + uint fbase = (a >> 8) & 0xF; + uint mbase = (a >> 4) & 0xF; + family = ((fbase == 0xF) || (fbase == 0)) ? fbase + (a >> 20) & 0xFF : fbase; + model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) ) ? 
+ mbase + ((a >> 12) & 0xF0) : mbase; + + if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) { + // determine max number of cores for AMD + asm { + mov EAX, 0x8000_0008; + cpuid; + mov c, ECX; + } + uint apicsize = (c>>12) & 0xF; + if (apicsize == 0) { + // use legacy method + if (hyperThreadingBit) maxCores = c & 0xFF; + else maxCores = 1; + } else { + // maxcores = 2^ apicsize + maxCores = 1; + while (apicsize) { maxCores<<=1; --apicsize; } + } + } + + if (max_extended_cpuid >= 0x8000_0004) { + char *procptr = processorNameBuffer.ptr; + asm { + push ESI; + mov ESI, procptr; + mov EAX, 0x8000_0002; + cpuid; + mov [ESI], EAX; + mov [ESI+4], EBX; + mov [ESI+8], ECX; + mov [ESI+12], EDX; + mov EAX, 0x8000_0003; + cpuid; + mov [ESI+16], EAX; + mov [ESI+20], EBX; + mov [ESI+24], ECX; + mov [ESI+28], EDX; + mov EAX, 0x8000_0004; + cpuid; + mov [ESI+32], EAX; + mov [ESI+36], EBX; + mov [ESI+40], ECX; + mov [ESI+44], EDX; + pop ESI; + } + // Intel P4 and PM pad at front with spaces. + // Other CPUs pad at end with nulls. + int start = 0, end = 0; + while (processorNameBuffer[start] == ' ') { ++start; } + while (processorNameBuffer[$-end-1] == 0) { ++end; } + processorName = processorNameBuffer[start..$-end]; + } else { + processorName[] = "Unknown CPU"; + } + // Determine cache sizes + + // Intel docs specify that they return 0 for 0x8000_0005. + // AMD docs do not specify the behaviour for 0004 and 0002. + // Centaur/VIA and most other manufacturers use the AMD method, + // except Cyrix MediaGX MMX Enhanced uses their OWN form of CPUID2! + // NS Geode GX1 provides CyrixCPUID2 _and_ does the same wrong behaviour + // for CPUID80000005. But Geode GX uses the AMD method + + // Deal with Geode GX1 - make it same as MediaGX MMX. + if (max_extended_cpuid==0x8000_0005 && max_cpuid==2) { + max_extended_cpuid = 0x8000_0004; + } + // Therefore, we try the AMD method unless it's an Intel chip. + // If we still have no info, try the Intel methods. 
+ datacache[0].size = 0; + if (max_cpuid<2 || !probablyIntel) { + if (max_extended_cpuid >= 0x8000_0005) { + getAMDcacheinfo(); + } else if (probablyAMD) { + // According to AMDProcRecognitionAppNote, this means CPU + // K5 model 0, or Am5x86 (model 4), or Am4x86DX4 (model 4) + // Am5x86 has 16Kb 4-way unified data & code cache. + datacache[0].size = 8; + datacache[0].associativity = 4; + datacache[0].lineSize = 32; + } else { + // Some obscure CPU. + // Values for Cyrix 6x86MX (family 6, model 0) + datacache[0].size = 64; + datacache[0].associativity = 4; + datacache[0].lineSize = 32; + } + } + if ((datacache[0].size == 0) && max_cpuid>=4) { + getcacheinfoCPUID4(); + } + if ((datacache[0].size == 0) && max_cpuid>=2) { + getcacheinfoCPUID2(); + } + if (datacache[0].size == 0) { + // Pentium, PMMX, late model 486, or an obscure CPU + if (mmx) { // Pentium MMX. Also has 8kB code cache. + datacache[0].size = 16; + datacache[0].associativity = 4; + datacache[0].lineSize = 32; + } else { // Pentium 1 (which also has 8kB code cache) + // or 486. + // Cyrix 6x86: 16, 4way, 32 linesize + datacache[0].size = 8; + datacache[0].associativity = 2; + datacache[0].lineSize = 32; + } + } + if (hyperThreadingBit) maxThreads = (apic>>>16) & 0xFF; + else maxThreads = maxCores; +} + +// Return true if the cpuid instruction is supported. +// BUG(WONTFIX): Returns false for Cyrix 6x86 and 6x86L. They will be treated as 486 machines. 
+bool hasCPUID()
+{
+    // Holds the original EFLAGS value first; after the final xor it holds the
+    // set of bits that differ between the first and the second read.
+    uint flags;
+    asm {
+        pushfd;                 // read EFLAGS through the stack ...
+        pop EAX;
+        mov flags, EAX;         // ... and keep the original value
+        xor EAX, 0x0020_0000;   // flip bit 21 (the ID flag)
+        push EAX;
+        popfd;                  // try to write the modified value back
+        pushfd;                 // read EFLAGS again
+        pop EAX;
+        xor flags, EAX;         // flags = original ^ re-read value
+    }
+    // Bit 21 shows up in the difference only if the CPU let it be toggled.
+    // NOTE(review): the original EFLAGS value is not written back afterwards.
+    return (flags & 0x0020_0000) !=0;
+}
+
+} else { // inline asm X86
+
+    // Fallback pair used when the branch above is not compiled in
+    // (presumably the else-side of a version block opened earlier - confirm).
+    // CPUID is reported as unavailable and cpuidX86() only fills in a fixed
+    // level-0 data cache description.
+    bool hasCPUID() { return false; }
+
+    void cpuidX86()
+    {
+        datacache[0].size = 8;
+        datacache[0].associativity = 2;
+        datacache[0].lineSize = 32;
+    }
+}
+
+// TODO: Implement this function with OS support
+// Fills datacache[0..3] from fixed per-model tables. The model is currently
+// hard-coded to PPC603 because the CPU version cannot be read here (see the
+// TODO inside).
+void cpuidPPC()
+{
+    // Index into the tables below; the order of the members must match the
+    // order of the table entries.
+    enum :int { PPC601, PPC603, PPC603E, PPC604,
+                PPC604E, PPC620, PPCG3, PPCG4, PPCG5 };
+
+    // TODO:
+    // asm { mfpvr; } returns the CPU version but unfortunately it can
+    // only be used in kernel mode. So OS support is required.
+    int cputype = PPC603;
+
+    // 601 has a 8KB combined data & code L1 cache.
+    // NOTE(review): the PPC601 entry in sizes[] is 4, not 8 - confirm whether
+    // that is meant to be the data share of the combined cache.
+    // Sizes are presumably in kB, as elsewhere in this module - confirm.
+    uint sizes[] = [4, 8, 16, 16, 32, 32, 32, 32, 64];
+    ubyte ways[] = [8, 2, 4, 4, 4, 8, 8, 8, 8];
+    uint L2size[]= [0, 0, 0, 0, 0, 0, 0, 256, 512];
+    uint L3size[]= [0, 0, 0, 0, 0, 0, 0, 2048, 0];
+
+    datacache[0].size = sizes[cputype];
+    datacache[0].associativity = ways[cputype];
+    datacache[0].lineSize = (cputype==PPCG5)? 128 :
+        (cputype == PPC620 || cputype == PPCG3)? 64 : 32;
+    datacache[1].size = L2size[cputype];
+    datacache[2].size = L3size[cputype];
+    // Levels 1 and 2 reuse the level-0 line size.
+    datacache[1].lineSize = datacache[0].lineSize;
+    datacache[2].lineSize = datacache[0].lineSize;
+}
+
+// TODO: Implement this function with OS support
+// Currently a no-op; the comments record known cache configurations only.
+void cpuidSparc()
+{
+    // UltraSparcIIi : L1 = 16, 2way. L2 = 512, 4 way.
+    // UltraSparcIII : L1 = 64, 4way. L2= 4096 or 8192.
+    // UltraSparcIIIi: L1 = 64, 4way. L2= 1024, 4 way
+    // UltraSparcIV : L1 = 64, 4way. L2 = 16*1024.
+    // UltraSparcIV+ : L1 = 64, 4way. L2 = 2048, L3=32*1024.
+    // Sparc64V : L1 = 128, 2way. L2 = 4096 4way.
+}
+
+
+static this()
+{
+    if (hasCPUID()) {
+        cpuidX86();
+    } else {
+        // it's a 386 or 486, or a Cyrix 6x86.
+        //Probably still has an external cache.
+    }
+    if (datacache[0].size==0) {
+        // Guess same as Pentium 1.
+        datacache[0].size = 8;
+        datacache[0].associativity = 2;
+        datacache[0].lineSize = 32;
+    }
+    numCacheLevels = 1;
+    // And now fill up all the unused levels with full memory space.
+    for (int i=1; i< datacache.length; ++i) {
+        if (datacache[i].size==0) {
+            // Set all remaining levels of cache equal to full address space.
+            datacache[i].size = uint.max/1024;
+            datacache[i].associativity = 1;
+            datacache[i].lineSize = datacache[i-1].lineSize;
+        } else numCacheLevels = i+1;
+    }
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/util/ctype.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/ldc/util/ctype.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,92 @@
+/**
+ * Simple ASCII char classification functions.
+ *
+ * Copyright: Copyright Digital Mars 2004 - 2009.
+ * License: Boost License 1.0.
+ * Authors: Sean Kelly
+ *
+ * Copyright Sean Kelly 2005 - 2009.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+module rt.util.string;
+
+private import core.stdc.string;
+
+/**
+ * Formats val as decimal digits, right-aligned in buf.
+ *
+ * Params:
+ *  buf = scratch buffer; must hold at least 10 chars (asserted), which is
+ *        enough for uint.max
+ *  val = value to format
+ *
+ * Returns:
+ *  The trailing slice of buf that contains the digits.
+ */
+char[] intToString( char[] buf, uint val )
+{
+    assert( buf.length > 9 );
+    auto p = buf.ptr + buf.length;
+
+    // Emit digits from least to most significant, walking backwards.
+    do
+    {
+        *--p = cast(char)(val % 10 + '0');
+    } while( val /= 10 );
+
+    return buf[p - buf.ptr .. $];
+}
+
+
+/**
+ * Three-way comparison of two strings, byte by byte.
+ *
+ * The common prefix is compared with memcmp(); if it differs, that result
+ * decides. If the prefix is identical, the shorter string orders first and
+ * only strings of equal length compare equal.
+ *
+ * Returns:
+ *  a negative value if s1 orders before s2, 0 if they are equal, and a
+ *  positive value if s1 orders after s2.
+ */
+int dstrcmp( in char[] s1, in char[] s2 )
+{
+    auto len = s1.length;
+    if( s2.length < len )
+        len = s2.length;
+    // A difference inside the common prefix decides the order; previously
+    // this case was ordered by length alone.
+    if( auto ret = memcmp( s1.ptr, s2.ptr, len ) )
+        return ret;
+    // Identical prefix: previously reported as equal even when the lengths
+    // differed ("abc" vs "abcd").
+    return s1.length >  s2.length ? 1 :
+           s1.length == s2.length ? 0 : -1;
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/util/utf.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/compiler/ldc/util/utf.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,901 @@
+/********************************************
+ * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
+ *
+ * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
+ * wchar type.
+ * For Posix systems, the C wchar_t type is UTF-32 and corresponds to + * the D utf.dchar type. + * + * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). + * + * See_Also: + * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)
+ * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)
+ * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) + * Macros: + * WIKI = Phobos/StdUtf + * + * Copyright: Copyright Digital Mars 2003 - 2009. + * License: = 0xD800 && u <= 0xDBFF); +} + +/** + * stride() returns the length of a UTF-32 sequence starting at index i + * in string s. + * Returns: The return value will always be 1. + */ +uint stride(in dchar[] s, size_t i) +{ + return 1; +} + +/******************************************* + * Given an index i into an array of characters s[], + * and assuming that index i is at the start of a UTF character, + * determine the number of UCS characters up to that index i. + */ + +size_t toUCSindex(in char[] s, size_t i) +{ + size_t n; + size_t j; + + for (j = 0; j < i; ) + { + j += stride(s, j); + n++; + } + if (j > i) + { + onUnicodeError("invalid UTF-8 sequence", j); + } + return n; +} + +/** ditto */ +size_t toUCSindex(in wchar[] s, size_t i) +{ + size_t n; + size_t j; + + for (j = 0; j < i; ) + { + j += stride(s, j); + n++; + } + if (j > i) + { + onUnicodeError("invalid UTF-16 sequence", j); + } + return n; +} + +/** ditto */ +size_t toUCSindex(in dchar[] s, size_t i) +{ + return i; +} + +/****************************************** + * Given a UCS index n into an array of characters s[], return the UTF index. + */ + +size_t toUTFindex(in char[] s, size_t n) +{ + size_t i; + + while (n--) + { + uint j = UTF8stride[s[i]]; + if (j == 0xFF) + onUnicodeError("invalid UTF-8 sequence", i); + i += j; + } + return i; +} + +/** ditto */ +size_t toUTFindex(in wchar[] s, size_t n) +{ + size_t i; + + while (n--) + { wchar u = s[i]; + + i += 1 + (u >= 0xD800 && u <= 0xDBFF); + } + return i; +} + +/** ditto */ +size_t toUTFindex(in dchar[] s, size_t n) +{ + return n; +} + +/* =================== Decode ======================= */ + +/*************** + * Decodes and returns character starting at s[idx]. idx is advanced past the + * decoded character. 
If the character is not well formed, a UtfException is + * thrown and idx remains unchanged. + */ +dchar decode(in char[] s, inout size_t idx) + in + { + assert(idx >= 0 && idx < s.length); + } + out (result) + { + assert(isValidDchar(result)); + } + body + { + size_t len = s.length; + dchar V; + size_t i = idx; + char u = s[i]; + + if (u & 0x80) + { uint n; + char u2; + + /* The following encodings are valid, except for the 5 and 6 byte + * combinations: + * 0xxxxxxx + * 110xxxxx 10xxxxxx + * 1110xxxx 10xxxxxx 10xxxxxx + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + for (n = 1; ; n++) + { + if (n > 4) + goto Lerr; // only do the first 4 of 6 encodings + if (((u << n) & 0x80) == 0) + { + if (n == 1) + goto Lerr; + break; + } + } + + // Pick off (7 - n) significant bits of B from first byte of octet + V = cast(dchar)(u & ((1 << (7 - n)) - 1)); + + if (i + (n - 1) >= len) + goto Lerr; // off end of string + + /* The following combinations are overlong, and illegal: + * 1100000x (10xxxxxx) + * 11100000 100xxxxx (10xxxxxx) + * 11110000 1000xxxx (10xxxxxx 10xxxxxx) + * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) + * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) + */ + u2 = s[i + 1]; + if ((u & 0xFE) == 0xC0 || + (u == 0xE0 && (u2 & 0xE0) == 0x80) || + (u == 0xF0 && (u2 & 0xF0) == 0x80) || + (u == 0xF8 && (u2 & 0xF8) == 0x80) || + (u == 0xFC && (u2 & 0xFC) == 0x80)) + goto Lerr; // overlong combination + + for (uint j = 1; j != n; j++) + { + u = s[i + j]; + if ((u & 0xC0) != 0x80) + goto Lerr; // trailing bytes are 10xxxxxx + V = (V << 6) | (u & 0x3F); + } + if (!isValidDchar(V)) + goto Lerr; + i += n; + } + else + { + V = cast(dchar) u; + i++; + } + + idx = i; + return V; + + Lerr: + onUnicodeError("invalid UTF-8 sequence", i); + return V; // dummy return + } + +unittest +{ size_t i; + dchar c; + + debug(utf) printf("utf.decode.unittest\n"); + + static 
s1 = "abcd"c; + i = 0; + c = decode(s1, i); + assert(c == cast(dchar)'a'); + assert(i == 1); + c = decode(s1, i); + assert(c == cast(dchar)'b'); + assert(i == 2); + + static s2 = "\xC2\xA9"c; + i = 0; + c = decode(s2, i); + assert(c == cast(dchar)'\u00A9'); + assert(i == 2); + + static s3 = "\xE2\x89\xA0"c; + i = 0; + c = decode(s3, i); + assert(c == cast(dchar)'\u2260'); + assert(i == 3); + + static s4 = + [ "\xE2\x89"c[], // too short + "\xC0\x8A", + "\xE0\x80\x8A", + "\xF0\x80\x80\x8A", + "\xF8\x80\x80\x80\x8A", + "\xFC\x80\x80\x80\x80\x8A", + ]; + + for (int j = 0; j < s4.length; j++) + { + try + { + i = 0; + c = decode(s4[j], i); + assert(0); + } + catch (Object o) + { + i = 23; + } + assert(i == 23); + } +} + +/** ditto */ + +dchar decode(in wchar[] s, inout size_t idx) + in + { + assert(idx >= 0 && idx < s.length); + } + out (result) + { + assert(isValidDchar(result)); + } + body + { + string msg; + dchar V; + size_t i = idx; + uint u = s[i]; + + if (u & ~0x7F) + { if (u >= 0xD800 && u <= 0xDBFF) + { uint u2; + + if (i + 1 == s.length) + { msg = "surrogate UTF-16 high value past end of string"; + goto Lerr; + } + u2 = s[i + 1]; + if (u2 < 0xDC00 || u2 > 0xDFFF) + { msg = "surrogate UTF-16 low value out of range"; + goto Lerr; + } + u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); + i += 2; + } + else if (u >= 0xDC00 && u <= 0xDFFF) + { msg = "unpaired surrogate UTF-16 value"; + goto Lerr; + } + else if (u == 0xFFFE || u == 0xFFFF) + { msg = "illegal UTF-16 value"; + goto Lerr; + } + else + i++; + } + else + { + i++; + } + + idx = i; + return cast(dchar)u; + + Lerr: + onUnicodeError(msg, i); + return cast(dchar)u; // dummy return + } + +/** ditto */ + +dchar decode(in dchar[] s, inout size_t idx) + in + { + assert(idx >= 0 && idx < s.length); + } + body + { + size_t i = idx; + dchar c = s[i]; + + if (!isValidDchar(c)) + goto Lerr; + idx = i + 1; + return c; + + Lerr: + onUnicodeError("invalid UTF-32 value", i); + return c; // dummy return + } + + +/* 
=================== Encode ======================= */
+
+/*******************************
+ * Appends the encoding of character c to array s[].
+ *
+ * c must satisfy isValidDchar (checked by the in-contract).
+ */
+void encode(inout char[] s, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        // Assemble the UTF-8 sequence in a scratch buffer and append it in
+        // one step.
+        char[4] units;
+        uint count;
+
+        if (c <= 0x7F)
+        {
+            units[0] = cast(char) c;
+            count = 1;
+        }
+        else if (c <= 0x7FF)
+        {
+            units[0] = cast(char)(0xC0 | (c >> 6));
+            units[1] = cast(char)(0x80 | (c & 0x3F));
+            count = 2;
+        }
+        else if (c <= 0xFFFF)
+        {
+            units[0] = cast(char)(0xE0 | (c >> 12));
+            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            units[2] = cast(char)(0x80 | (c & 0x3F));
+            count = 3;
+        }
+        else if (c <= 0x10FFFF)
+        {
+            units[0] = cast(char)(0xF0 | (c >> 18));
+            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
+            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            units[3] = cast(char)(0x80 | (c & 0x3F));
+            count = 4;
+        }
+        else
+        {
+            assert(0);
+        }
+
+        s ~= units[0 .. count];
+    }
+
+unittest
+{
+    debug(utf) printf("utf.encode.unittest\n");
+
+    char[] s = "abcd".dup;
+    encode(s, cast(dchar)'a');
+    assert(s.length == 5);
+    assert(s == "abcda");
+
+    encode(s, cast(dchar)'\u00A9');
+    assert(s.length == 7);
+    assert(s == "abcda\xC2\xA9");
+    //assert(s == "abcda\u00A9"); // BUG: fix compiler
+
+    encode(s, cast(dchar)'\u2260');
+    assert(s.length == 10);
+    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
+}
+
+/** ditto */
+
+void encode(inout wchar[] s, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        // Values up to 0xFFFF fit into a single UTF-16 code unit.
+        if (c <= 0xFFFF)
+        {
+            s ~= cast(wchar) c;
+            return;
+        }
+
+        // Everything above is split into a high/low surrogate pair.
+        uint offset = c - 0x10000;
+        wchar[2] pair;
+
+        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
+        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
+        s ~= pair;
+    }
+
+/** ditto */
+void encode(inout dchar[] s, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        // UTF-32 stores the code point as-is.
+        s ~= c;
+    }
+
+/**
+Returns the code length of $(D c) in the encoding using $(D C) as a
+code point. The code is returned in character count, not in bytes.
+ */
+
+ubyte codeLength(C)(dchar c)
+{
+    static if (C.sizeof == 1)
+    {
+        // UTF-8: one to four code units, chosen by magnitude.
+        if (c <= 0x7F)
+            return 1;
+        if (c <= 0x7FF)
+            return 2;
+        if (c <= 0xFFFF)
+            return 3;
+        if (c <= 0x10FFFF)
+            return 4;
+        // Anything larger is not encodable.
+        assert(false);
+    }
+    else static if (C.sizeof == 2)
+    {
+        // UTF-16: a single unit, or a surrogate pair above 0xFFFF.
+        if (c <= 0xFFFF)
+            return 1;
+        return 2;
+    }
+    else
+    {
+        // UTF-32: always exactly one unit.
+        static assert(C.sizeof == 4);
+        return 1;
+    }
+}
+
+/* =================== Validation ======================= */
+
+/***********************************
+Checks to see if string is well formed or not. $(D S) can be an array
+ of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
+ if it is not. Use to check all untrusted input for correctness.
+ */
+void validate(S)(in S s)
+{
+    // decode() advances pos past each character it accepts and reports
+    // malformed input itself, so walking the whole array is the check.
+    size_t pos = 0;
+    auto limit = s.length;
+
+    while (pos < limit)
+        decode(s, pos);
+}
+
+/* =================== Conversion to UTF8 ======================= */
+
+// Writes the UTF-8 encoding of c into buf and returns the slice of buf
+// that was filled (1 to 4 chars). c must satisfy isValidDchar.
+char[] toUTF8(char[4] buf, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        // Work downwards from the longest encoding.
+        if (c > 0x10FFFF)
+            assert(0);
+
+        if (c > 0xFFFF)
+        {
+            buf[0] = cast(char)(0xF0 | (c >> 18));
+            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
+            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            buf[3] = cast(char)(0x80 | (c & 0x3F));
+            return buf[0 .. 4];
+        }
+
+        if (c > 0x7FF)
+        {
+            buf[0] = cast(char)(0xE0 | (c >> 12));
+            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            buf[2] = cast(char)(0x80 | (c & 0x3F));
+            return buf[0 .. 3];
+        }
+
+        if (c > 0x7F)
+        {
+            buf[0] = cast(char)(0xC0 | (c >> 6));
+            buf[1] = cast(char)(0x80 | (c & 0x3F));
+            return buf[0 .. 2];
+        }
+
+        buf[0] = cast(char) c;
+        return buf[0 .. 1];
+    }
+
+/*******************
+ * Encodes string s into UTF-8 and returns the encoded string.
+ */
+string toUTF8(string s)
+    in
+    {
+        // Already UTF-8: only checked for well-formedness, never copied.
+        validate(s);
+    }
+    body
+    {
+        return s;
+    }
+
+/** ditto */
+string toUTF8(in wchar[] s)
+{
+    char[] r;
+    size_t i;
+    size_t slen = s.length;
+
+    // Optimistically assume pure ASCII: one output char per input unit.
+    r.length = slen;
+
+    for (i = 0; i < slen; i++)
+    {   wchar c = s[i];
+
+        if (c <= 0x7F)
+            r[i] = cast(char)c;     // fast path for ascii
+        else
+        {
+            // First non-ASCII unit: keep what was copied so far and let
+            // foreach decode the remainder into code points.
+            r.length = i;
+            // Named d (as in the dchar overload) so the loop variable no
+            // longer shadows the outer c.
+            foreach (dchar d; s[i .. slen])
+            {
+                encode(r, d);
+            }
+            break;
+        }
+    }
+    return cast(string)r;
+}
+
+/** ditto */
+string toUTF8(in dchar[] s)
+{
+    char[] r;
+    size_t i;
+    size_t slen = s.length;
+
+    // Optimistically assume pure ASCII: one output char per input unit.
+    r.length = slen;
+
+    for (i = 0; i < slen; i++)
+    {   dchar c = s[i];
+
+        if (c <= 0x7F)
+            r[i] = cast(char)c;     // fast path for ascii
+        else
+        {
+            // First non-ASCII code point: encode the remainder one by one.
+            r.length = i;
+            foreach (dchar d; s[i .. slen])
+            {
+                encode(r, d);
+            }
+            break;
+        }
+    }
+    return cast(string)r;
+}
+
+/* =================== Conversion to UTF16 ======================= */
+
+// Writes the UTF-16 encoding of c into buf and returns the slice of buf
+// that was filled: one unit up to 0xFFFF, a surrogate pair above.
+wchar[] toUTF16(wchar[2] buf, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        if (c <= 0xFFFF)
+        {
+            buf[0] = cast(wchar) c;
+            return buf[0 .. 1];
+        }
+        else
+        {
+            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
+            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
+            return buf[0 .. 2];
+        }
+    }
+
+/****************
+ * Encodes string s into UTF-16 and returns the encoded string.
+ * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
+ * an LPWSTR or LPCWSTR argument.
+ */ +wstring toUTF16(in char[] s) +{ + wchar[] r; + size_t slen = s.length; + + r.length = slen; + r.length = 0; + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c <= 0x7F) + { + i++; + r ~= cast(wchar)c; + } + else + { + c = decode(s, i); + encode(r, c); + } + } + return cast(wstring)r; +} + +alias const(wchar)* wptr; +/** ditto */ +wptr toUTF16z(in char[] s) +{ + wchar[] r; + size_t slen = s.length; + + r.length = slen + 1; + r.length = 0; + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c <= 0x7F) + { + i++; + r ~= cast(wchar)c; + } + else + { + c = decode(s, i); + encode(r, c); + } + } + r ~= "\000"; + return r.ptr; +} + +/** ditto */ +wstring toUTF16(wstring s) + in + { + validate(s); + } + body + { + return s; + } + +/** ditto */ +wstring toUTF16(in dchar[] s) +{ + wchar[] r; + size_t slen = s.length; + + r.length = slen; + r.length = 0; + for (size_t i = 0; i < slen; i++) + { + encode(r, s[i]); + } + return cast(wstring)r; +} + +/* =================== Conversion to UTF32 ======================= */ + +/***** + * Encodes string s into UTF-32 and returns the encoded string. + */ +dstring toUTF32(in char[] s) +{ + dchar[] r; + size_t slen = s.length; + size_t j = 0; + + r.length = slen; // r[] will never be longer than s[] + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c >= 0x80) + c = decode(s, i); + else + i++; // c is ascii, no need for decode + r[j++] = c; + } + return cast(dstring)r[0 .. j]; +} + +/** ditto */ +dstring toUTF32(in wchar[] s) +{ + dchar[] r; + size_t slen = s.length; + size_t j = 0; + + r.length = slen; // r[] will never be longer than s[] + for (size_t i = 0; i < slen; ) + { + dchar c = s[i]; + if (c >= 0x80) + c = decode(s, i); + else + i++; // c is ascii, no need for decode + r[j++] = c; + } + return cast(dstring)r[0 .. 
j]; +} + +/** ditto */ +dstring toUTF32(dstring s) + in + { + validate(s); + } + body + { + return s; + } + +/* ================================ tests ================================== */ + +unittest +{ + debug(utf) printf("utf.toUTF.unittest\n"); + + auto c = "hello"c[]; + auto w = toUTF16(c); + assert(w == "hello"); + auto d = toUTF32(c); + assert(d == "hello"); + + c = toUTF8(w); + assert(c == "hello"); + d = toUTF32(w); + assert(d == "hello"); + + c = toUTF8(d); + assert(c == "hello"); + w = toUTF16(d); + assert(w == "hello"); + + + c = "hel\u1234o"; + w = toUTF16(c); + assert(w == "hel\u1234o"); + d = toUTF32(c); + assert(d == "hel\u1234o"); + + c = toUTF8(w); + assert(c == "hel\u1234o"); + d = toUTF32(w); + assert(d == "hel\u1234o"); + + c = toUTF8(d); + assert(c == "hel\u1234o"); + w = toUTF16(d); + assert(w == "hel\u1234o"); + + + c = "he\U0010AAAAllo"; + w = toUTF16(c); + //foreach (wchar c; w) printf("c = x%x\n", c); + //foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c); + assert(w == "he\U0010AAAAllo"); + d = toUTF32(c); + assert(d == "he\U0010AAAAllo"); + + c = toUTF8(w); + assert(c == "he\U0010AAAAllo"); + d = toUTF32(w); + assert(d == "he\U0010AAAAllo"); + + c = toUTF8(d); + assert(c == "he\U0010AAAAllo"); + w = toUTF16(d); + assert(w == "he\U0010AAAAllo"); +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/compiler/ldc/win32.mak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/compiler/ldc/win32.mak Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,175 @@ +# Makefile to build the D runtime library core components for Posix +# Designed to work with GNU make +# Targets: +# make +# Same as make all +# make debug +# Build the debug version of the library +# make release +# Build the release version of the library +# make doc +# Generate documentation +# make clean +# Delete all files created by build process + +# Essentials + +LIBDIR=..\..\..\lib +DOCDIR=..\..\..\doc +IMPDIR=..\..\..\import 
+LIBBASENAME=druntime_rt_dmd.lib +#MODULES=bitop exception memory runtime thread vararg \ +# $(addprefix sync/,barrier condition config exception mutex rwmutex semaphore) +BUILDS=debug release unittest + +MODULES_BASE= \ + aaA.d \ + aApply.d \ + aApplyR.d \ + adi.d \ + arrayassign.d \ + arraybyte.d \ + arraycast.d \ + arraycat.d \ + arraydouble.d \ + arrayfloat.d \ + arrayint.d \ + arrayreal.d \ + arrayshort.d \ + cast_.d \ + cover.d \ + dmain2.d \ + invariant.d \ + invariant_.d \ + lifetime.d \ + memory.d \ + memset.d \ + obj.d \ + object_.d \ + qsort.d \ + switch_.d \ + trace.d +# NOTE: trace.d and cover.d are not necessary for a successful build +# as both are used for debugging features (profiling and coverage) +# NOTE: a pre-compiled minit.obj has been provided in dmd for Win32 and +# minit.asm is not used by dmd for Linux + +MODULES_UTIL= \ + util\console.d \ + util\cpuid.d \ + util\ctype.d \ + util\string.d \ + util\utf.d + +MODULES_TI= \ + typeinfo\ti_AC.d \ + typeinfo\ti_Acdouble.d \ + typeinfo\ti_Acfloat.d \ + typeinfo\ti_Acreal.d \ + typeinfo\ti_Adouble.d \ + typeinfo\ti_Afloat.d \ + typeinfo\ti_Ag.d \ + typeinfo\ti_Aint.d \ + typeinfo\ti_Along.d \ + typeinfo\ti_Areal.d \ + typeinfo\ti_Ashort.d \ + typeinfo\ti_byte.d \ + typeinfo\ti_C.d \ + typeinfo\ti_cdouble.d \ + typeinfo\ti_cfloat.d \ + typeinfo\ti_char.d \ + typeinfo\ti_creal.d \ + typeinfo\ti_dchar.d \ + typeinfo\ti_delegate.d \ + typeinfo\ti_double.d \ + typeinfo\ti_float.d \ + typeinfo\ti_idouble.d \ + typeinfo\ti_ifloat.d \ + typeinfo\ti_int.d \ + typeinfo\ti_ireal.d \ + typeinfo\ti_long.d \ + typeinfo\ti_ptr.d \ + typeinfo\ti_real.d \ + typeinfo\ti_short.d \ + typeinfo\ti_ubyte.d \ + typeinfo\ti_uint.d \ + typeinfo\ti_ulong.d \ + typeinfo\ti_ushort.d \ + typeinfo\ti_void.d \ + typeinfo\ti_wchar.d + +C_SRCS=complex.c critical.c deh.c monitor.c + +# Symbols + +CC=dmc +DMD=dmd +DOCFLAGS=-version=DDoc +DFLAGS_release=-d -release -O -inline -w -nofloat +DFLAGS_debug=-d -g -w -nofloat 
+DFLAGS_unittest=$(DFLAGS_release) -unittest +CFLAGS_release=-mn -6 -r +CFLAGS_debug=-g -mn -6 -r +CFLAGS_unittest=$(CFLAGS_release) + +# Derived symbols + +C_OBJS=complex.obj critical.obj deh.obj monitor.obj +AS_OBJS=minit.obj +ALL_MODULES=$(MODULES_BASE) $(MODULES_UTIL) $(MODULES_TI) +D_SRCS=$(ALL_MODULES) +ALLLIBS=\ + $(LIBDIR)\debug\$(LIBBASENAME) \ + $(LIBDIR)\release\$(LIBBASENAME) \ + $(LIBDIR)\unittest\$(LIBBASENAME) + +# Patterns + +#$(LIBDIR)\%\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) +# $(CC) -c $(CFLAGS_$*) $(C_SRCS) +# $(DMD) $(DFLAGS_$*) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) +# del $(C_OBJS) + +#$(DOCDIR)\%.html : %.d +# $(DMD) -c -d -o- -Df$@ $< + +#$(IMPDIR)\%.di : %.d +# $(DMD) -c -d -o- -Hf$@ $< + +# Patterns - debug + +$(LIBDIR)\debug\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) + $(CC) -c $(CFLAGS_debug) $(C_SRCS) + $(DMD) $(DFLAGS_debug) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) + del $(C_OBJS) + +# Patterns - release + +$(LIBDIR)\release\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) + $(CC) -c $(CFLAGS_release) $(C_SRCS) + $(DMD) $(DFLAGS_release) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) + del $(C_OBJS) + +# Patterns - unittest + +$(LIBDIR)\unittest\$(LIBBASENAME) : $(D_SRCS) $(C_SRCS) $(AS_OBJS) + $(CC) -c $(CFLAGS_unittest) $(C_SRCS) + $(DMD) $(DFLAGS_unittest) -lib -of$@ $(D_SRCS) $(C_OBJS) $(AS_OBJS) + del $(C_OBJS) + +# Patterns - asm + +minit.obj : minit.asm + $(CC) -c $** + +# Rulez + +all : $(BUILDS) doc + +debug : $(LIBDIR)\debug\$(LIBBASENAME) $(IMPORTS) +release : $(LIBDIR)\release\$(LIBBASENAME) $(IMPORTS) +unittest : $(LIBDIR)\unittest\$(LIBBASENAME) $(IMPORTS) +#doc : $(DOCS) + +clean : + del $(IMPORTS) $(DOCS) $(ALLLIBS) diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/dmd-posix.mak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/dmd-posix.mak Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,72 @@ +# Makefile to build the composite D runtime library for Linux +# Designed to work with GNU make +# 
Targets: +# make +# Same as make all +# make debug +# Build the debug version of the library +# make release +# Build the release version of the library +# make doc +# Generate documentation +# make clean +# Delete all files created by build process + +# Essentials + +LIBDIR:=../lib +DOCDIR:=../doc +LIBBASENAME=libdruntime.a + +DIR_CC=common +DIR_RT=compiler/dmd +DIR_GC=gc/basic + +# Symbols + +DMD=dmd + +# Targets + +all : debug release doc unittest $(LIBDIR)/$(LIBBASENAME) + +# unittest : +# $(MAKE) -fdmd-posix.mak lib MAKE_LIB="unittest" +# dmd -unittest unittest ../import/core/stdc/stdarg \ +# -defaultlib="$(DUP_TARGET)" -debuglib="$(DUP_TARGET)" +# $(RM) stdarg.o +# ./unittest + +debug release unittest : + @$(MAKE) DMD=$(DMD) -C $(DIR_CC) --no-print-directory -fposix.mak $@ + @$(MAKE) DMD=$(DMD) -C $(DIR_RT) --no-print-directory -fposix.mak $@ + @$(MAKE) DMD=$(DMD) -C $(DIR_GC) --no-print-directory -fposix.mak $@ + @$(DMD) -lib -of$(LIBDIR)/$@/$(LIBBASENAME) \ + $(LIBDIR)/$@/libdruntime-core.a \ + $(LIBDIR)/$@/libdruntime-rt-dmd.a \ + $(LIBDIR)/$@/libdruntime-gc-basic.a + +$(LIBDIR)/$(LIBBASENAME) : $(LIBDIR)/release/$(LIBBASENAME) + ln -sf $(realpath $<) $@ + +doc : $(ALL_DOCS) + $(MAKE) DMD=$(DMD) -C $(DIR_CC) --no-print-directory -fposix.mak doc +# $(MAKE) DMD=$(DMD) -C $(DIR_RT) --no-print-directory -fposix.mak doc +# $(MAKE) DMD=$(DMD) -C $(DIR_GC) --no-print-directory -fposix.mak doc + +###################################################### + +clean : + $(MAKE) DMD=$(DMD) -C $(DIR_CC) --no-print-directory -fposix.mak clean + $(MAKE) DMD=$(DMD) -C $(DIR_RT) --no-print-directory -fposix.mak clean + $(MAKE) DMD=$(DMD) -C $(DIR_GC) --no-print-directory -fposix.mak clean +#find . 
-name "*.di" | xargs $(RM) + rm -rf $(LIBDIR) $(DOCDIR) + +# install : +# make -C $(DIR_CC) --no-print-directory -fposix.mak install +# make -C $(DIR_RT) --no-print-directory -fposix.mak install +# make -C $(DIR_GC) --no-print-directory -fposix.mak install +# $(CP) $(LIB_MASK) $(LIB_DEST)/. +# $(CP) $(DUP_MASK) $(LIB_DEST)/. + diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/dmd-win32.mak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/dmd-win32.mak Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,89 @@ +# Makefile to build the composite D runtime library for Linux +# Designed to work with GNU make +# Targets: +# make +# Same as make all +# make debug +# Build the debug version of the library +# make release +# Build the release version of the library +# make doc +# Generate documentation +# make clean +# Delete all files created by build process + +# Essentials + +LIBDIR=..\lib +DOCDIR=..\doc +LIBBASENAME=druntime.lib + +DIR_CC=common +DIR_RT=compiler\dmd +DIR_GC=gc\basic + +# Symbols + +DMD=dmd + +# Targets + +all : debug release doc unittest $(LIBDIR)\$(LIBBASENAME) + +# unittest : +# $(MAKE) -fdmd-posix.mak lib MAKE_LIB="unittest" +# dmd -unittest unittest ../import/core/stdc/stdarg \ +# -defaultlib="$(DUP_TARGET)" -debuglib="$(DUP_TARGET)" +# $(RM) stdarg.o +# ./unittest + +debug release unittest : + cd $(DIR_CC) + make DMD=$(DMD) -fwin32.mak $@ + cd .. + cd $(DIR_RT) + make DMD=$(DMD) -fwin32.mak $@ + cd ..\.. + cd $(DIR_GC) + make DMD=$(DMD) -fwin32.mak $@ + cd ..\.. + $(DMD) -lib -of$(LIBDIR)\$@\$(LIBBASENAME) \ + $(LIBDIR)\$@\druntime_core.lib \ + $(LIBDIR)\$@\druntime_rt_dmd.lib \ + $(LIBDIR)\$@\druntime_gc_basic.lib + +$(LIBDIR)\$(LIBBASENAME) : $(LIBDIR)\release\$(LIBBASENAME) + copy /y $** $@ + +doc : $(ALL_DOCS) + cd $(DIR_CC) + make DMD=$(DMD) -fwin32.mak $@ + cd .. +# cd $(DIR_RT) +# make DMD=$(DMD) -fwin32.mak $@ +# cd ..\.. +# cd $(DIR_GC) +# make DMD=$(DMD) -fwin32.mak $@ +# cd ..\.. 
+ +###################################################### + +clean : $(ALL_DOCS) + cd $(DIR_CC) + make DMD=$(DMD) -fwin32.mak $@ + cd .. + cd $(DIR_RT) + make DMD=$(DMD) -fwin32.mak $@ + cd ..\.. + cd $(DIR_GC) + make DMD=$(DMD) -fwin32.mak $@ + cd ..\.. +#find . -name "*.di" | xargs $(RM) +#rm -rf $(LIBDIR) $(DOCDIR) + +# install : +# make -C $(DIR_CC) --no-print-directory -fposix.mak install +# make -C $(DIR_RT) --no-print-directory -fposix.mak install +# make -C $(DIR_GC) --no-print-directory -fposix.mak install +# $(CP) $(LIB_MASK) $(LIB_DEST)\. +# $(CP) $(DUP_MASK) $(LIB_DEST)\. diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/dmd.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/dmd.conf Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,2 @@ +[Environment] +DFLAGS="-I%HOME%/common" "-I%HOME%/../import" diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/gc/basic/gc.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/gc/basic/gc.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,315 @@ +/** + * Contains the external GC interface. + * + * Copyright: Copyright Digital Mars 2005 - 2009. + * License: Boost License 1.0. + * Authors: Walter Bright, David Friedman, Sean Kelly + * + * Copyright Digital Mars 2005 - 2009. + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +module gc.gcalloc; + + +version (Windows) +{ + private import core.sys.windows.windows; + + alias int pthread_t; + + pthread_t pthread_self() + { + return cast(pthread_t) GetCurrentThreadId(); + } + + //version = GC_Use_Alloc_Win32; +} +else version (Posix) +{ + private import core.sys.posix.sys.mman; + private import core.stdc.stdlib; + + //version = GC_Use_Alloc_MMap; +} +else +{ + private import core.stdc.stdlib; + + //version = GC_Use_Alloc_Malloc; +} + +/+ +static if(is(typeof(VirtualAlloc))) + version = GC_Use_Alloc_Win32; +else static if (is(typeof(mmap))) + version = GC_Use_Alloc_MMap; +else static if (is(typeof(valloc))) + version = GC_Use_Alloc_Valloc; +else static if (is(typeof(malloc))) + version = GC_Use_Alloc_Malloc; +else static assert(false, "No supported allocation methods available."); ++/ + +static if (is(typeof(VirtualAlloc))) // version (GC_Use_Alloc_Win32) +{ + /** + * Map memory. + */ + void *os_mem_map(size_t nbytes) + { + return VirtualAlloc(null, nbytes, MEM_RESERVE, PAGE_READWRITE); + } + + + /** + * Commit memory. + * Returns: + * 0 success + * !=0 failure + */ + int os_mem_commit(void *base, size_t offset, size_t nbytes) + { void *p; + + p = VirtualAlloc(base + offset, nbytes, MEM_COMMIT, PAGE_READWRITE); + return cast(int)(p is null); + } + + + /** + * Decommit memory. + * Returns: + * 0 success + * !=0 failure + */ + int os_mem_decommit(void *base, size_t offset, size_t nbytes) + { + return cast(int)(VirtualFree(base + offset, nbytes, MEM_DECOMMIT) == 0); + } + + + /** + * Unmap memory allocated with os_mem_map(). + * Memory must have already been decommitted. 
+ * Returns: + * 0 success + * !=0 failure + */ + int os_mem_unmap(void *base, size_t nbytes) + { + return cast(int)(VirtualFree(base, 0, MEM_RELEASE) == 0); + } +} +else static if (is(typeof(mmap))) // else version (GC_Use_Alloc_MMap) +{ + void *os_mem_map(size_t nbytes) + { void *p; + + p = mmap(null, nbytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + return (p == MAP_FAILED) ? null : p; + } + + + int os_mem_commit(void *base, size_t offset, size_t nbytes) + { + return 0; + } + + + int os_mem_decommit(void *base, size_t offset, size_t nbytes) + { + return 0; + } + + + int os_mem_unmap(void *base, size_t nbytes) + { + return munmap(base, nbytes); + } +} +else static if (is(typeof(valloc))) // else version (GC_Use_Alloc_Valloc) +{ + void *os_mem_map(size_t nbytes) + { + return valloc(nbytes); + } + + + int os_mem_commit(void *base, size_t offset, size_t nbytes) + { + return 0; + } + + + int os_mem_decommit(void *base, size_t offset, size_t nbytes) + { + return 0; + } + + + int os_mem_unmap(void *base, size_t nbytes) + { + free(base); + return 0; + } +} +else static if (is(typeof(malloc))) // else version (GC_Use_Alloc_Malloc) +{ + // NOTE: This assumes malloc granularity is at least (void*).sizeof. If + // (req_size + PAGESIZE) is allocated, and the pointer is rounded up + // to PAGESIZE alignment, there will be space for a void* at the end + // after PAGESIZE bytes used by the GC. 
+
+
+    private import gcx : PAGESIZE;
+
+
+    const size_t PAGE_MASK = PAGESIZE - 1;
+
+
+    /**
+     * Allocates nbytes of PAGESIZE-aligned memory on top of malloc().
+     *
+     * PAGESIZE extra bytes are requested so the result can be rounded up to
+     * a page boundary. The pointer originally returned by malloc() is stored
+     * directly behind the nbytes region, where os_mem_unmap() reads it back.
+     *
+     * Returns:
+     *  the aligned block, or null if the request cannot be satisfied.
+     */
+    void *os_mem_map(size_t nbytes)
+    {   byte *p, q;
+
+        // nbytes + PAGESIZE must not wrap around.
+        if (nbytes > size_t.max - PAGESIZE)
+            return null;
+        p = cast(byte *) malloc(nbytes + PAGESIZE);
+        // Without this check a failed malloc() led to a store through
+        // (null + nbytes) below.
+        if (p is null)
+            return null;
+        q = p + ((PAGESIZE - ((cast(size_t) p & PAGE_MASK))) & PAGE_MASK);
+        * cast(void**)(q + nbytes) = p;
+        return q;
+    }
+
+
+    /// No-op for the malloc-backed implementation; always reports success (0).
+    int os_mem_commit(void *base, size_t offset, size_t nbytes)
+    {
+        return 0;
+    }
+
+
+    /// No-op for the malloc-backed implementation; always reports success (0).
+    int os_mem_decommit(void *base, size_t offset, size_t nbytes)
+    {
+        return 0;
+    }
+
+
+    /**
+     * Releases a block obtained from os_mem_map().
+     *
+     * nbytes must be the size passed to os_mem_map(): it locates the saved
+     * malloc() pointer stored directly behind the block.
+     *
+     * Returns:
+     *  0 (always)
+     */
+    int os_mem_unmap(void *base, size_t nbytes)
+    {
+        free( *cast(void**)( cast(byte*) base + nbytes ) );
+        return 0;
+    }
+}
+else
+{
+    static assert(false, "No supported allocation methods available.");
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/gc/basic/gcbits.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/gc/basic/gcbits.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,193 @@
+/**
+ * Contains a bitfield used by the GC.
+ *
+ * Copyright: Copyright Digital Mars 2005 - 2009.
+ * License: Boost License 1.0.
+ * Authors: Walter Bright, Sean Kelly
+ *
+ * Copyright Digital Mars 2005 - 2009.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+module gc.gcstats;
+
+
+/**
+ *
+ */
+struct GCStats
+{
+    size_t poolsize;        // total size of pool
+    size_t usedsize;        // bytes allocated
+    size_t freeblocks;      // number of blocks marked FREE
+    size_t freelistsize;    // total of memory on free lists
+    size_t pageblocks;      // number of blocks marked PAGE
+}
diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/gc/basic/gcx.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/druntime/src/gc/basic/gcx.d Tue Jun 02 17:43:06 2009 +0100
@@ -0,0 +1,3010 @@
+/**
+ * Contains the garbage collector implementation.
+ *
+ * Copyright: Copyright Digital Mars 2001 - 2009.
+ * License: 0); + gcx.disabled--; + } + else synchronized (gcLock) + { + assert(gcx.disabled > 0); + gcx.disabled--; + } + } + + + /** + * + */ + void disable() + { + if (!thread_needLock()) + { + gcx.disabled++; + } + else synchronized (gcLock) + { + gcx.disabled++; + } + } + + + /** + * + */ + uint getAttr(void* p) + { + if (!p) + { + return 0; + } + + uint go() + { + Pool* pool = gcx.findPool(p); + uint oldb = 0; + + if (pool) + { + auto biti = cast(size_t)(p - pool.baseAddr) / 16; + + oldb = gcx.getBits(pool, biti); + } + return oldb; + } + + if (!thread_needLock()) + { + return go(); + } + else synchronized (gcLock) + { + return go(); + } + } + + + /** + * + */ + uint setAttr(void* p, uint mask) + { + if (!p) + { + return 0; + } + + uint go() + { + Pool* pool = gcx.findPool(p); + uint oldb = 0; + + if (pool) + { + auto biti = cast(size_t)(p - pool.baseAddr) / 16; + + oldb = gcx.getBits(pool, biti); + gcx.setBits(pool, biti, mask); + } + return oldb; + } + + if (!thread_needLock()) + { + return go(); + } + else synchronized (gcLock) + { + return go(); + } + } + + + /** + * + */ + uint clrAttr(void* p, uint mask) + { + if (!p) + { + return 0; + } + + uint go() + { + Pool* pool = gcx.findPool(p); + uint oldb = 0; + + if (pool) + { + auto biti = cast(size_t)(p - pool.baseAddr) / 16; + + oldb = gcx.getBits(pool, biti); + gcx.clrBits(pool, biti, mask); + } + return oldb; + } + + if (!thread_needLock()) + { + return go(); + } + else synchronized (gcLock) + { + return go(); + } + } + + + /** + * + */ + void *malloc(size_t size, uint bits = 0) + { + if (!size) + { + return null; + } + + // Since a finalizer could launch a new thread, we always need to lock + // when collecting. The safest way to do this is to simply always lock + // when allocating. 
+ synchronized (gcLock) + { + return mallocNoSync(size, bits); + } + } + + + // + // + // + private void *mallocNoSync(size_t size, uint bits = 0) + { + assert(size != 0); + + void *p = null; + Bins bin; + + //debug(PRINTF) printf("GC::malloc(size = %d, gcx = %p)\n", size, gcx); + assert(gcx); + //debug(PRINTF) printf("gcx.self = %x, pthread_self() = %x\n", gcx.self, pthread_self()); + + size += SENTINEL_EXTRA; + + // Compute size bin + // Cache previous binsize lookup - Dave Fladebo. + __gshared size_t lastsize = -1; + __gshared Bins lastbin; + if (size == lastsize) + bin = lastbin; + else + { + bin = gcx.findBin(size); + lastsize = size; + lastbin = bin; + } + + if (bin < B_PAGE) + { + int state = gcx.disabled ? 1 : 0; + bool collected = false; + + while (!gcx.bucket[bin] && !gcx.allocPage(bin)) + { + switch (state) + { + case 0: + gcx.fullcollectshell(); + collected = true; + state = 1; + continue; + case 1: + gcx.newPool(1); + state = 2; + continue; + case 2: + if (collected) + onOutOfMemoryError(); + state = 0; + continue; + default: + assert(false); + } + } + p = gcx.bucket[bin]; + + // Return next item from free list + gcx.bucket[bin] = (cast(List*)p).next; + if( !(bits & BlkAttr.NO_SCAN) ) + memset(p + size, 0, binsize[bin] - size); + //debug(PRINTF) printf("\tmalloc => %x\n", p); + debug (MEMSTOMP) memset(p, 0xF0, size); + } + else + { + p = gcx.bigAlloc(size); + if (!p) + onOutOfMemoryError(); + } + size -= SENTINEL_EXTRA; + p = sentinel_add(p); + sentinel_init(p, size); + gcx.log_malloc(p, size); + + if (bits) + { + Pool *pool = gcx.findPool(p); + assert(pool); + + gcx.setBits(pool, cast(size_t)(p - pool.baseAddr) / 16, bits); + } + return p; + } + + + /** + * + */ + void *calloc(size_t size, uint bits = 0) + { + if (!size) + { + return null; + } + + // Since a finalizer could launch a new thread, we always need to lock + // when collecting. The safest way to do this is to simply always lock + // when allocating. 
+ synchronized (gcLock) + { + return callocNoSync(size, bits); + } + } + + + // + // + // + private void *callocNoSync(size_t size, uint bits = 0) + { + assert(size != 0); + + //debug(PRINTF) printf("calloc: %x len %d\n", p, len); + void *p = mallocNoSync(size, bits); + memset(p, 0, size); + return p; + } + + + /** + * + */ + void *realloc(void *p, size_t size, uint bits = 0) + { + // Since a finalizer could launch a new thread, we always need to lock + // when collecting. The safest way to do this is to simply always lock + // when allocating. + synchronized (gcLock) + { + return reallocNoSync(p, size, bits); + } + } + + + // + // Resize the block at p to size bytes; called by realloc() with gcLock held. + // size == 0 frees p and returns null; p == null behaves like mallocNoSync. + // Blocks of at least PAGESIZE are shrunk or grown in place when the + // neighbouring pages allow it. Otherwise, if the new size is larger than + // the block or less than half of it, a new block is allocated and + // min(old size, new size) bytes are copied; the old block is not freed here. + // + private void *reallocNoSync(void *p, size_t size, uint bits = 0) + { + if (!size) + { if (p) + { freeNoSync(p); + p = null; + } + } + else if (!p) + { + p = mallocNoSync(size, bits); + } + else + { void *p2; + size_t psize; + + //debug(PRINTF) printf("GC::realloc(p = %x, size = %u)\n", p, size); + version (SENTINEL) + { + sentinel_Invariant(p); + psize = *sentinel_size(p); + if (psize != size) + { + if (psize) + { + Pool *pool = gcx.findPool(p); + + if (pool) + { + auto biti = cast(size_t)(p - pool.baseAddr) / 16; + + if (bits) + { + gcx.clrBits(pool, biti, BlkAttr.ALL_BITS); + gcx.setBits(pool, biti, bits); + } + else + { + bits = gcx.getBits(pool, biti); + } + } + } + p2 = mallocNoSync(size, bits); + if (psize < size) + size = psize; + //debug(PRINTF) printf("\tcopying %d bytes\n",size); + memcpy(p2, p, size); + p = p2; + } + } + else + { + psize = gcx.findSize(p); // find allocated size + if (psize >= PAGESIZE && size >= PAGESIZE) + { + auto psz = psize / PAGESIZE; + auto newsz = (size + PAGESIZE - 1) / PAGESIZE; + if (newsz == psz) + return p; + + auto pool = gcx.findPool(p); + auto pagenum = (p - pool.baseAddr) / PAGESIZE; + + if (newsz < psz) + { // Shrink in place + synchronized (gcLock) + { + debug (MEMSTOMP) memset(p + size, 0xF2, psize - size); + pool.freePages(pagenum + newsz, psz - newsz); + } + // block size changed: drop the size cached by sizeOfNoSync + gcx.p_cache = null; + gcx.size_cache = 0; + return p; + } + else if 
(pagenum + newsz <= pool.npages) + { + // Attempt to expand in place + synchronized (gcLock) + { + for (size_t i = pagenum + psz; 1;) + { + if (i == pagenum + newsz) + { + debug (MEMSTOMP) memset(p + psize, 0xF0, size - psize); + memset(&pool.pagetable[pagenum + psz], B_PAGEPLUS, newsz - psz); + // block size changed: drop the size cached by sizeOfNoSync + gcx.p_cache = null; + gcx.size_cache = 0; + return p; + } + if (i == pool.ncommitted) + { + auto u = pool.extendPages(pagenum + newsz - pool.ncommitted); + if (u == OPFAIL) + break; + i = pagenum + newsz; + continue; + } + if (pool.pagetable[i] != B_FREE) + break; + i++; + } + } + } + } + if (psize < size || // if new size is bigger + psize > size * 2) // or less than half + { + if (psize) + { + Pool *pool = gcx.findPool(p); + + if (pool) + { + auto biti = cast(size_t)(p - pool.baseAddr) / 16; + + if (bits) + { + gcx.clrBits(pool, biti, BlkAttr.ALL_BITS); + gcx.setBits(pool, biti, bits); + } + else + { + bits = gcx.getBits(pool, biti); + } + } + } + p2 = mallocNoSync(size, bits); + if (psize < size) + size = psize; + //debug(PRINTF) printf("\tcopying %d bytes\n",size); + memcpy(p2, p, size); + p = p2; + } + } + } + return p; + } + + + /** + * Attempt to in-place enlarge the memory block pointed to by p by at least + * minbytes beyond its current capacity, up to a maximum of maxsize. This + * does not attempt to move the memory block (like realloc() does). + * + * Returns: + * 0 if could not extend p, + * total size of entire memory block if successful. 
+ */ + size_t extend(void* p, size_t minsize, size_t maxsize) + { + if (!thread_needLock()) + { + return extendNoSync(p, minsize, maxsize); + } + else synchronized (gcLock) + { + return extendNoSync(p, minsize, maxsize); + } + } + + + // + // + // + private size_t extendNoSync(void* p, size_t minsize, size_t maxsize) + in + { + assert( minsize <= maxsize ); + } + body + { + //debug(PRINTF) printf("GC::extend(p = %x, minsize = %u, maxsize = %u)\n", p, minsize, maxsize); + version (SENTINEL) + { + return 0; + } + auto psize = gcx.findSize(p); // find allocated size + if (psize < PAGESIZE) + return 0; // cannot extend buckets + + auto psz = psize / PAGESIZE; + auto minsz = (minsize + PAGESIZE - 1) / PAGESIZE; + auto maxsz = (maxsize + PAGESIZE - 1) / PAGESIZE; + + auto pool = gcx.findPool(p); + auto pagenum = (p - pool.baseAddr) / PAGESIZE; + + size_t sz; + for (sz = 0; sz < maxsz; sz++) + { + auto i = pagenum + psz + sz; + if (i == pool.ncommitted) + break; + if (pool.pagetable[i] != B_FREE) + { if (sz < minsz) + return 0; + break; + } + } + if (sz >= minsz) + { + } + else if (pagenum + psz + sz == pool.ncommitted) + { + auto u = pool.extendPages(minsz - sz); + if (u == OPFAIL) + return 0; + sz = minsz; + } + else + return 0; + debug (MEMSTOMP) memset(p + psize, 0xF0, (psz + sz) * PAGESIZE - psize); + memset(pool.pagetable + pagenum + psz, B_PAGEPLUS, sz); + gcx.p_cache = null; + gcx.size_cache = 0; + return (psz + sz) * PAGESIZE; + } + + + /** + * + */ + size_t reserve(size_t size) + { + if (!size) + { + return 0; + } + + if (!thread_needLock()) + { + return reserveNoSync(size); + } + else synchronized (gcLock) + { + return reserveNoSync(size); + } + } + + + // + // + // + private size_t reserveNoSync(size_t size) + { + assert(size != 0); + assert(gcx); + + return gcx.reserve(size); + } + + + /** + * + */ + void free(void *p) + { + if (!p) + { + return; + } + + if (!thread_needLock()) + { + return freeNoSync(p); + } + else synchronized (gcLock) + { + return 
freeNoSync(p); + } + } + + + // + // + // + private void freeNoSync(void *p) + { + assert (p); + + Pool* pool; + size_t pagenum; + Bins bin; + size_t biti; + + // Find which page it is in + pool = gcx.findPool(p); + if (!pool) // if not one of ours + return; // ignore + sentinel_Invariant(p); + p = sentinel_sub(p); + pagenum = cast(size_t)(p - pool.baseAddr) / PAGESIZE; + biti = cast(size_t)(p - pool.baseAddr) / 16; + gcx.clrBits(pool, biti, BlkAttr.ALL_BITS); + + bin = cast(Bins)pool.pagetable[pagenum]; + if (bin == B_PAGE) // if large alloc + { size_t npages; + size_t n; + + // Free pages + npages = 1; + n = pagenum; + while (++n < pool.ncommitted && pool.pagetable[n] == B_PAGEPLUS) + npages++; + debug (MEMSTOMP) memset(p, 0xF2, npages * PAGESIZE); + pool.freePages(pagenum, npages); + } + else + { // Add to free list + List *list = cast(List*)p; + + debug (MEMSTOMP) memset(p, 0xF2, binsize[bin]); + + list.next = gcx.bucket[bin]; + gcx.bucket[bin] = list; + } + gcx.log_free(sentinel_add(p)); + } + + + /** + * Determine the base address of the block containing p. If p is not a gc + * allocated pointer, return null. + */ + void* addrOf(void *p) + { + if (!p) + { + return null; + } + + if (!thread_needLock()) + { + return addrOfNoSync(p); + } + else synchronized (gcLock) + { + return addrOfNoSync(p); + } + } + + + // + // + // + void* addrOfNoSync(void *p) + { + if (!p) + { + return null; + } + + return gcx.findBase(p); + } + + + /** + * Determine the allocated size of pointer p. If p is an interior pointer + * or not a gc allocated pointer, return 0. 
+ */ + size_t sizeOf(void *p) + { + if (!p) + { + return 0; + } + + if (!thread_needLock()) + { + return sizeOfNoSync(p); + } + else synchronized (gcLock) + { + return sizeOfNoSync(p); + } + } + + + // + // + // + private size_t sizeOfNoSync(void *p) + { + assert (p); + + version (SENTINEL) + { + p = sentinel_sub(p); + size_t size = gcx.findSize(p); + + // Check for interior pointer + // This depends on: + // 1) size is a power of 2 for less than PAGESIZE values + // 2) base of memory pool is aligned on PAGESIZE boundary + if (cast(size_t)p & (size - 1) & (PAGESIZE - 1)) + size = 0; + return size ? size - SENTINEL_EXTRA : 0; + } + else + { + if (p == gcx.p_cache) + return gcx.size_cache; + + size_t size = gcx.findSize(p); + + // Check for interior pointer + // This depends on: + // 1) size is a power of 2 for less than PAGESIZE values + // 2) base of memory pool is aligned on PAGESIZE boundary + if (cast(size_t)p & (size - 1) & (PAGESIZE - 1)) + size = 0; + else + { + gcx.p_cache = p; + gcx.size_cache = size; + } + + return size; + } + } + + + /** + * Retrieve the base address, size and attribute bits of the block + * containing p. If p is null or not a gc allocated pointer, return a + * default-initialized BlkInfo. 
+ */ + BlkInfo query(void *p) + { + if (!p) + { + BlkInfo i; + return i; + } + + if (!thread_needLock()) + { + return queryNoSync(p); + } + else synchronized (gcLock) + { + return queryNoSync(p); + } + } + + + // + // + // + BlkInfo queryNoSync(void *p) + { + assert(p); + + return gcx.getInfo(p); + } + + + /** + * Verify that pointer p: + * 1) belongs to this memory pool + * 2) points to the start of an allocated piece of memory + * 3) is not on a free list + */ + void check(void *p) + { + if (!p) + { + return; + } + + if (!thread_needLock()) + { + checkNoSync(p); + } + else synchronized (gcLock) + { + checkNoSync(p); + } + } + + + // + // + // + private void checkNoSync(void *p) + { + assert(p); + + sentinel_Invariant(p); + debug (PTRCHECK) + { + Pool* pool; + size_t pagenum; + Bins bin; + size_t size; + + p = sentinel_sub(p); + pool = gcx.findPool(p); + assert(pool); + pagenum = cast(size_t)(p - pool.baseAddr) / PAGESIZE; + bin = cast(Bins)pool.pagetable[pagenum]; + assert(bin <= B_PAGE); + size = binsize[bin]; + assert((cast(size_t)p & (size - 1)) == 0); + + debug (PTRCHECK2) + { + if (bin < B_PAGE) + { + // Check that p is not on a free list + List *list; + + for (list = gcx.bucket[bin]; list; list = list.next) + { + assert(cast(void*)list != p); + } + } + } + } + } + + + // + // + // + private void setStackBottom(void *p) + { + version (STACKGROWSDOWN) + { + //p = (void *)((uint *)p + 4); + if (p > gcx.stackBottom) + { + //debug(PRINTF) printf("setStackBottom(%x)\n", p); + gcx.stackBottom = p; + } + } + else + { + //p = (void *)((uint *)p - 4); + if (p < gcx.stackBottom) + { + //debug(PRINTF) printf("setStackBottom(%x)\n", p); + gcx.stackBottom = cast(char*)p; + } + } + } + + + /** + * add p to list of roots + */ + void addRoot(void *p) + { + if (!p) + { + return; + } + + if (!thread_needLock()) + { + gcx.addRoot(p); + } + else synchronized (gcLock) + { + gcx.addRoot(p); + } + } + + + /** + * remove p from list of roots + */ + void removeRoot(void *p) + { + if 
(!p) + { + return; + } + + if (!thread_needLock()) + { + gcx.removeRoot(p); + } + else synchronized (gcLock) + { + gcx.removeRoot(p); + } + } + + + /** + * return a delegate that iterates over the registered roots + */ + int delegate(int delegate(inout void*)) rootIter() + { + if (!thread_needLock()) + { + return &gcx.rootIter; + } + else synchronized (gcLock) + { + return &gcx.rootIter; + } + } + + + /** + * add range to scan for roots + */ + void addRange(void *p, size_t sz) + { + if (!p || !sz) + { + return; + } + + //debug(PRINTF) printf("+GC.addRange(pbot = x%x, ptop = x%x)\n", pbot, ptop); + if (!thread_needLock()) + { + gcx.addRange(p, p + sz); + } + else synchronized (gcLock) + { + gcx.addRange(p, p + sz); + } + //debug(PRINTF) printf("-GC.addRange()\n"); + } + + + /** + * remove range + */ + void removeRange(void *p) + { + if (!p) + { + return; + } + + if (!thread_needLock()) + { + gcx.removeRange(p); + } + else synchronized (gcLock) + { + gcx.removeRange(p); + } + } + + + /** + * return a delegate that iterates over the registered ranges + */ + int delegate(int delegate(inout Range)) rangeIter() + { + if (!thread_needLock()) + { + return &gcx.rangeIter; + } + else synchronized (gcLock) + { + return &gcx.rangeIter; + } + } + + + /** + * do full garbage collection + */ + void fullCollect() + { + debug(PRINTF) printf("GC.fullCollect()\n"); + + // Since a finalizer could launch a new thread, we always need to lock + // when collecting. + synchronized (gcLock) + { + gcx.fullcollectshell(); + } + + version (none) + { + GCStats stats; + + getStats(stats); + debug(PRINTF) printf("poolsize = %x, usedsize = %x, freelistsize = %x\n", + stats.poolsize, stats.usedsize, stats.freelistsize); + } + + gcx.log_collect(); + } + + + /** + * do full garbage collection without scanning the stack + * (registered roots and ranges are still scanned) + */ + void fullCollectNoStack() + { + // Since a finalizer could launch a new thread, we always need to lock + // when collecting. 
+ synchronized (gcLock) + { + gcx.noStack++; + gcx.fullcollectshell(); + gcx.noStack--; + } + } + + + /** + * minimize free space usage + */ + void minimize() + { + if (!thread_needLock()) + { + gcx.minimize(); + } + else synchronized (gcLock) + { + gcx.minimize(); + } + } + + + /** + * Retrieve statistics about garbage collection. + * Useful for debugging and tuning. + */ + void getStats(out GCStats stats) + { + if (!thread_needLock()) + { + getStatsNoSync(stats); + } + else synchronized (gcLock) + { + getStatsNoSync(stats); + } + } + + + // + // + // + private void getStatsNoSync(out GCStats stats) + { + size_t psize = 0; + size_t usize = 0; + size_t flsize = 0; + + size_t n; + size_t bsize = 0; + + //debug(PRINTF) printf("getStats()\n"); + memset(&stats, 0, GCStats.sizeof); + + for (n = 0; n < gcx.npools; n++) + { Pool *pool = gcx.pooltable[n]; + + psize += pool.ncommitted * PAGESIZE; + for (size_t j = 0; j < pool.ncommitted; j++) + { + Bins bin = cast(Bins)pool.pagetable[j]; + if (bin == B_FREE) + stats.freeblocks++; + else if (bin == B_PAGE) + stats.pageblocks++; + else if (bin < B_PAGE) + bsize += PAGESIZE; + } + } + + for (n = 0; n < B_PAGE; n++) + { + //debug(PRINTF) printf("bin %d\n", n); + for (List *list = gcx.bucket[n]; list; list = list.next) + { + //debug(PRINTF) printf("\tlist %x\n", list); + flsize += binsize[n]; + } + } + + usize = bsize - flsize; + + stats.poolsize = psize; + stats.usedsize = bsize - flsize; + stats.freelistsize = flsize; + } +} + + +/* ============================ Gcx =============================== */ + +enum +{ PAGESIZE = 4096, + COMMITSIZE = (4096*16), + POOLSIZE = (4096*256), +} + + +enum +{ + B_16, + B_32, + B_64, + B_128, + B_256, + B_512, + B_1024, + B_2048, + B_PAGE, // start of large alloc + B_PAGEPLUS, // continuation of large alloc + B_FREE, // free page + B_UNCOMMITTED, // memory not committed for this page + B_MAX +} + + +alias ubyte Bins; + + +struct List +{ + List *next; +} + + +struct Range +{ + void *pbot; + void 
*ptop; +} + + +immutable uint binsize[B_MAX] = [ 16,32,64,128,256,512,1024,2048,4096 ]; +immutable uint notbinsize[B_MAX] = [ ~(16u-1),~(32u-1),~(64u-1),~(128u-1),~(256u-1), + ~(512u-1),~(1024u-1),~(2048u-1),~(4096u-1) ]; + +/* ============================ Gcx =============================== */ + + +struct Gcx +{ + debug (THREADINVARIANT) + { + pthread_t self; + void thread_Invariant() + { + if (self != pthread_self()) + printf("thread_Invariant(): gcx = %x, self = %x, pthread_self() = %x\n", this, self, pthread_self()); + assert(self == pthread_self()); + } + } + else + { + void thread_Invariant() { } + } + + void *p_cache; + size_t size_cache; + + size_t nroots; + size_t rootdim; + void **roots; + + size_t nranges; + size_t rangedim; + Range *ranges; + + uint noStack; // !=0 means don't scan stack + uint log; // turn on logging + uint anychanges; + void *stackBottom; + uint inited; + int disabled; // turn off collections if >0 + + byte *minAddr; // min(baseAddr) + byte *maxAddr; // max(topAddr) + + size_t npools; + Pool **pooltable; + + List *bucket[B_MAX]; // free list for each size + + + void initialize() + { int dummy; + + (cast(byte*)&this)[0 .. 
Gcx.sizeof] = 0; + stackBottom = cast(char*)&dummy; + log_init(); + debug (THREADINVARIANT) + self = pthread_self(); + //printf("gcx = %p, self = %x\n", this, self); + inited = 1; + } + + + void Dtor() + { + inited = 0; + + for (size_t i = 0; i < npools; i++) + { Pool *pool = pooltable[i]; + + pool.Dtor(); + cstdlib.free(pool); + } + if (pooltable) + cstdlib.free(pooltable); + + if (roots) + cstdlib.free(roots); + + if (ranges) + cstdlib.free(ranges); + } + + + void Invariant() { } + + + invariant() + { + if (inited) + { + //printf("Gcx.invariant(): this = %p\n", this); + size_t i; + + // Assure we're called on the right thread + debug (THREADINVARIANT) assert(self == pthread_self()); + + for (i = 0; i < npools; i++) + { Pool *pool = pooltable[i]; + + pool.Invariant(); + if (i == 0) + { + assert(minAddr == pool.baseAddr); + } + if (i + 1 < npools) + { + assert(pool.opCmp(pooltable[i + 1]) < 0); + } + else if (i + 1 == npools) + { + assert(maxAddr == pool.topAddr); + } + } + + if (roots) + { + assert(rootdim != 0); + assert(nroots <= rootdim); + } + + if (ranges) + { + assert(rangedim != 0); + assert(nranges <= rangedim); + + for (i = 0; i < nranges; i++) + { + assert(ranges[i].pbot); + assert(ranges[i].ptop); + assert(ranges[i].pbot <= ranges[i].ptop); + } + } + + for (i = 0; i < B_PAGE; i++) + { + for (List *list = bucket[i]; list; list = list.next) + { + } + } + } + } + + + /** + * + */ + void addRoot(void *p) + { + if (nroots == rootdim) + { + size_t newdim = rootdim * 2 + 16; + void** newroots; + + newroots = cast(void**)cstdlib.malloc(newdim * newroots[0].sizeof); + if (!newroots) + onOutOfMemoryError(); + if (roots) + { memcpy(newroots, roots, nroots * newroots[0].sizeof); + cstdlib.free(roots); + } + roots = newroots; + rootdim = newdim; + } + roots[nroots] = p; + nroots++; + } + + + /** + * + */ + void removeRoot(void *p) + { + for (size_t i = nroots; i--;) + { + if (roots[i] == p) + { + nroots--; + memmove(roots + i, roots + i + 1, (nroots - i) * 
roots[0].sizeof); + return; + } + } + assert(0); + } + + + /** + * + */ + int rootIter(int delegate(inout void*) dg) + { + int result = 0; + for( size_t i = 0; i < nroots; ++i ) + { + result = dg(roots[i]); + if (result) + break; + } + return result; + } + + + /** + * + */ + void addRange(void *pbot, void *ptop) + { + //debug(PRINTF) printf("Thread %x ", pthread_self()); + debug(PRINTF) printf("%x.Gcx::addRange(%x, %x), nranges = %d\n", this, pbot, ptop, nranges); + if (nranges == rangedim) + { + size_t newdim = rangedim * 2 + 16; + Range *newranges; + + newranges = cast(Range*)cstdlib.malloc(newdim * newranges[0].sizeof); + if (!newranges) + onOutOfMemoryError(); + if (ranges) + { memcpy(newranges, ranges, nranges * newranges[0].sizeof); + cstdlib.free(ranges); + } + ranges = newranges; + rangedim = newdim; + } + ranges[nranges].pbot = pbot; + ranges[nranges].ptop = ptop; + nranges++; + } + + + /** + * + */ + void removeRange(void *pbot) + { + //debug(PRINTF) printf("Thread %x ", pthread_self()); + debug(PRINTF) printf("%x.Gcx.removeRange(%x), nranges = %d\n", this, pbot, nranges); + for (size_t i = nranges; i--;) + { + if (ranges[i].pbot == pbot) + { + nranges--; + memmove(ranges + i, ranges + i + 1, (nranges - i) * ranges[0].sizeof); + return; + } + } + debug(PRINTF) printf("Wrong thread\n"); + + // This is a fatal error, but ignore it. + // The problem is that we can get a Close() call on a thread + // other than the one the range was allocated on. + //assert(zero); + } + + + /** + * + */ + int rangeIter(int delegate(inout Range) dg) + { + int result = 0; + for( size_t i = 0; i < nranges; ++i ) + { + result = dg(ranges[i]); + if (result) + break; + } + return result; + } + + + /** + * Find Pool that pointer is in. + * Return null if not in a Pool. + * Assume pooltable[] is sorted. 
+ */ + Pool *findPool(void *p) + { + if (p >= minAddr && p < maxAddr) + { + if (npools == 1) + { + return pooltable[0]; + } + + for (size_t i = 0; i < npools; i++) + { Pool *pool; + + pool = pooltable[i]; + if (p < pool.topAddr) + { if (pool.baseAddr <= p) + return pool; + break; + } + } + } + return null; + } + + + /** + * Find base address of block containing pointer p. + * Returns null if not a gc'd pointer + */ + void* findBase(void *p) + { + Pool *pool; + + pool = findPool(p); + if (pool) + { + size_t offset = cast(size_t)(p - pool.baseAddr); + size_t pn = offset / PAGESIZE; + Bins bin = cast(Bins)pool.pagetable[pn]; + + // Adjust bit to be at start of allocated memory block + if (bin <= B_PAGE) + { + return pool.baseAddr + (offset & notbinsize[bin]); + } + else if (bin == B_PAGEPLUS) + { + do + { --pn, offset -= PAGESIZE; + } while (cast(Bins)pool.pagetable[pn] == B_PAGEPLUS); + + return pool.baseAddr + (offset & (offset.max ^ (PAGESIZE-1))); + } + else + { + // we are in a B_FREE or B_UNCOMMITTED page + return null; + } + } + return null; + } + + + /** + * Find size of pointer p. 
+ * Returns 0 if not a gc'd pointer + */ + size_t findSize(void *p) + { + Pool* pool; + size_t size = 0; + + pool = findPool(p); + if (pool) + { + size_t pagenum; + Bins bin; + + pagenum = cast(size_t)(p - pool.baseAddr) / PAGESIZE; + bin = cast(Bins)pool.pagetable[pagenum]; + size = binsize[bin]; + if (bin == B_PAGE) + { size_t npages = pool.ncommitted; + ubyte* pt; + size_t i; + + pt = &pool.pagetable[0]; + for (i = pagenum + 1; i < npages; i++) + { + if (pt[i] != B_PAGEPLUS) + break; + } + size = (i - pagenum) * PAGESIZE; + } + } + return size; + } + + + /** + * + */ + BlkInfo getInfo(void* p) + { + Pool* pool; + BlkInfo info; + + pool = findPool(p); + if (pool) + { + size_t offset = cast(size_t)(p - pool.baseAddr); + size_t pn = offset / PAGESIZE; + Bins bin = cast(Bins)pool.pagetable[pn]; + + //////////////////////////////////////////////////////////////////// + // findAddr + //////////////////////////////////////////////////////////////////// + + if (bin <= B_PAGE) + { + info.base = pool.baseAddr + (offset & notbinsize[bin]); + } + else if (bin == B_PAGEPLUS) + { + do + { --pn, offset -= PAGESIZE; + } while (cast(Bins)pool.pagetable[pn] == B_PAGEPLUS); + + info.base = pool.baseAddr + (offset & (offset.max ^ (PAGESIZE-1))); + + // fix bin for use by size calc below + bin = cast(Bins)pool.pagetable[pn]; + } + + //////////////////////////////////////////////////////////////////// + // findSize + //////////////////////////////////////////////////////////////////// + + info.size = binsize[bin]; + if (bin == B_PAGE) + { size_t npages = pool.ncommitted; + ubyte* pt; + size_t i; + + pt = &pool.pagetable[0]; + for (i = pn + 1; i < npages; i++) + { + if (pt[i] != B_PAGEPLUS) + break; + } + info.size = (i - pn) * PAGESIZE; + } + + //////////////////////////////////////////////////////////////////// + // getBits + //////////////////////////////////////////////////////////////////// + + info.attr = getBits(pool, cast(size_t)(offset / 16)); + } + return info; + } + + + /** 
+ * Compute bin for size. + */ + static Bins findBin(size_t size) + { Bins bin; + + if (size <= 256) + { + if (size <= 64) + { + if (size <= 16) + bin = B_16; + else if (size <= 32) + bin = B_32; + else + bin = B_64; + } + else + { + if (size <= 128) + bin = B_128; + else + bin = B_256; + } + } + else + { + if (size <= 1024) + { + if (size <= 512) + bin = B_512; + else + bin = B_1024; + } + else + { + if (size <= 2048) + bin = B_2048; + else + bin = B_PAGE; + } + } + return bin; + } + + + /** + * Allocate a new pool of at least size bytes. + * Sort it into pooltable[]. + * Mark all memory in the pool as B_FREE. + * Return the actual number of bytes reserved or 0 on error. + */ + size_t reserve(size_t size) + { + size_t npages = (size + PAGESIZE - 1) / PAGESIZE; + Pool* pool = newPool(npages); + + if (!pool || pool.extendPages(npages) == OPFAIL) + return 0; + return pool.ncommitted * PAGESIZE; + } + + + /** + * Minimizes physical memory usage by returning free pools to the OS. + * A pool is released when every committed page in it is B_FREE. + * minAddr and maxAddr are recomputed from the remaining pools, or set + * to null when no pool is left. + */ + void minimize() + { + size_t n; + size_t pn; + Pool* pool; + size_t ncommitted; + + // n is advanced by hand: removing a pool shifts its successor into + // slot n, so that slot has to be examined again. + for (n = 0; n < npools; ) + { + pool = pooltable[n]; + ncommitted = pool.ncommitted; + for (pn = 0; pn < ncommitted; pn++) + { + if (cast(Bins)pool.pagetable[pn] != B_FREE) + break; + } + if (pn < ncommitted) + { + // pool still has pages in use, keep it + n++; + continue; + } + pool.Dtor(); + cstdlib.free(pool); + memmove(pooltable + n, + pooltable + n + 1, + (--npools - n) * (Pool*).sizeof); + } + + if (npools) + { + minAddr = pooltable[0].baseAddr; + maxAddr = pooltable[npools - 1].topAddr; + } + else + { + // no pools left: make the address range empty instead of + // indexing an empty pooltable + minAddr = null; + maxAddr = null; + } + } + + + /** + * Allocate a chunk of memory that is larger than a page. + * Return null if out of memory. + */ + void *bigAlloc(size_t size) + { + Pool* pool; + size_t npages; + size_t n; + size_t pn; + size_t freedpages; + void* p; + int state; + bool collected = false; + + npages = (size + PAGESIZE - 1) / PAGESIZE; + + for (state = disabled ? 1 : 0; ; ) + { + // This code could use some refinement when repeatedly + // allocating very large arrays. 
+ + for (n = 0; n < npools; n++) + { + pool = pooltable[n]; + pn = pool.allocPages(npages); + if (pn != OPFAIL) + goto L1; + } + + // Failed + switch (state) + { + case 0: + // Try collecting + collected = true; + freedpages = fullcollectshell(); + if (freedpages >= npools * ((POOLSIZE / PAGESIZE) / 4)) + { state = 1; + continue; + } + // Release empty pools to prevent bloat + minimize(); + // Allocate new pool + pool = newPool(npages); + if (!pool) + { state = 2; + continue; + } + pn = pool.allocPages(npages); + assert(pn != OPFAIL); + goto L1; + case 1: + // Release empty pools to prevent bloat + minimize(); + // Allocate new pool + pool = newPool(npages); + if (!pool) + { + if (collected) + goto Lnomemory; + state = 0; + continue; + } + pn = pool.allocPages(npages); + assert(pn != OPFAIL); + goto L1; + case 2: + goto Lnomemory; + default: + assert(false); + } + } + + L1: + pool.pagetable[pn] = B_PAGE; + if (npages > 1) + memset(&pool.pagetable[pn + 1], B_PAGEPLUS, npages - 1); + p = pool.baseAddr + pn * PAGESIZE; + memset(cast(char *)p + size, 0, npages * PAGESIZE - size); + debug (MEMSTOMP) memset(p, 0xF1, size); + //debug(PRINTF) printf("\tp = %x\n", p); + return p; + + Lnomemory: + return null; // let caller handle the error + } + + + /** + * Allocate a new pool with at least npages in it. + * Sort it into pooltable[]. + * Return null if failed. 
+ */ + Pool *newPool(size_t npages) + { + Pool* pool; + Pool** newpooltable; + size_t newnpools; + size_t i; + + //debug(PRINTF) printf("************Gcx::newPool(npages = %d)****************\n", npages); + + // Round up to COMMITSIZE pages + npages = (npages + (COMMITSIZE/PAGESIZE) - 1) & ~(COMMITSIZE/PAGESIZE - 1); + + // Minimum of POOLSIZE + if (npages < POOLSIZE/PAGESIZE) + npages = POOLSIZE/PAGESIZE; + else if (npages > POOLSIZE/PAGESIZE) + { // Give us 150% of requested size, so there's room to extend + auto n = npages + (npages >> 1); + if (n < size_t.max/PAGESIZE) + npages = n; + } + + // Allocate successively larger pools up to 8 megs + if (npools) + { size_t n; + + n = npools; + if (n > 8) + n = 8; // cap pool size at 8 megs + n *= (POOLSIZE / PAGESIZE); + if (npages < n) + npages = n; + } + + pool = cast(Pool *)cstdlib.calloc(1, Pool.sizeof); + if (pool) + { + pool.initialize(npages); + if (!pool.baseAddr) + goto Lerr; + + newnpools = npools + 1; + newpooltable = cast(Pool **)cstdlib.realloc(pooltable, newnpools * (Pool *).sizeof); + if (!newpooltable) + goto Lerr; + + // Sort pool into newpooltable[] + for (i = 0; i < npools; i++) + { + if (pool.opCmp(newpooltable[i]) < 0) + break; + } + memmove(newpooltable + i + 1, newpooltable + i, (npools - i) * (Pool *).sizeof); + newpooltable[i] = pool; + + pooltable = newpooltable; + npools = newnpools; + + minAddr = pooltable[0].baseAddr; + maxAddr = pooltable[npools - 1].topAddr; + } + return pool; + + Lerr: + pool.Dtor(); + cstdlib.free(pool); + return null; + } + + + /** + * Allocate a page of bin's. 
+ * Returns: + * 0 failed + */ + int allocPage(Bins bin) + { + Pool* pool; + size_t n; + size_t pn; + byte* p; + byte* ptop; + + //debug(PRINTF) printf("Gcx::allocPage(bin = %d)\n", bin); + for (n = 0; n < npools; n++) + { + pool = pooltable[n]; + pn = pool.allocPages(1); + if (pn != OPFAIL) + goto L1; + } + return 0; // failed + + L1: + pool.pagetable[pn] = cast(ubyte)bin; + + // Convert page to free list + size_t size = binsize[bin]; + List **b = &bucket[bin]; + + p = pool.baseAddr + pn * PAGESIZE; + ptop = p + PAGESIZE; + for (; p < ptop; p += size) + { + (cast(List *)p).next = *b; + *b = cast(List *)p; + } + return 1; + } + + + /** + * Search a range of memory values and mark any pointers into the GC pool. + */ + void mark(void *pbot, void *ptop) + { + void **p1 = cast(void **)pbot; + void **p2 = cast(void **)ptop; + size_t pcache = 0; + uint changes = 0; + + //printf("marking range: %p -> %p\n", pbot, ptop); + for (; p1 < p2; p1++) + { + Pool *pool; + byte *p = cast(byte *)(*p1); + + //if (log) debug(PRINTF) printf("\tmark %x\n", p); + if (p >= minAddr && p < maxAddr) + { + if ((cast(size_t)p & ~(PAGESIZE-1)) == pcache) + continue; + + pool = findPool(p); + if (pool) + { + size_t offset = cast(size_t)(p - pool.baseAddr); + size_t biti; + size_t pn = offset / PAGESIZE; + Bins bin = cast(Bins)pool.pagetable[pn]; + + //debug(PRINTF) printf("\t\tfound pool %x, base=%x, pn = %d, bin = %d, biti = x%x\n", pool, pool.baseAddr, pn, bin, biti); + + // Adjust bit to be at start of allocated memory block + if (bin <= B_PAGE) + { + biti = (offset & notbinsize[bin]) >> 4; + //debug(PRINTF) printf("\t\tbiti = x%x\n", biti); + } + else if (bin == B_PAGEPLUS) + { + do + { --pn; + } while (cast(Bins)pool.pagetable[pn] == B_PAGEPLUS); + biti = pn * (PAGESIZE / 16); + } + else + { + // Don't mark bits in B_FREE or B_UNCOMMITTED pages + continue; + } + + if (bin >= B_PAGE) // Cache B_PAGE and B_PAGEPLUS lookups + pcache = cast(size_t)p & ~(PAGESIZE-1); + + //debug(PRINTF) 
printf("\t\tmark(x%x) = %d\n", biti, pool.mark.test(biti)); + if (!pool.mark.test(biti)) + { + //if (log) debug(PRINTF) printf("\t\tmarking %x\n", p); + pool.mark.set(biti); + if (!pool.noscan.test(biti)) + { + pool.scan.set(biti); + changes = 1; + } + log_parent(sentinel_add(pool.baseAddr + biti * 16), sentinel_add(pbot)); + } + } + } + } + anychanges |= changes; + } + + + /** + * Return number of full pages free'd. + */ + size_t fullcollectshell() + { + // The purpose of the 'shell' is to ensure all the registers + // get put on the stack so they'll be scanned + void *sp; + size_t result; + version (GNU) + { + __builtin_unwind_init(); + sp = & sp; + } + else + { + asm + { + pushad ; + mov sp[EBP],ESP ; + } + } + result = fullcollect(sp); + version (GNU) + { + // nothing to do + } + else + { + asm + { + popad ; + } + } + return result; + } + + + /** + * + */ + size_t fullcollect(void *stackTop) + { + size_t n; + Pool* pool; + + debug(COLLECT_PRINTF) printf("Gcx.fullcollect()\n"); + + thread_suspendAll(); + + p_cache = null; + size_cache = 0; + + anychanges = 0; + for (n = 0; n < npools; n++) + { + pool = pooltable[n]; + pool.mark.zero(); + pool.scan.zero(); + pool.freebits.zero(); + } + + // Mark each free entry, so it doesn't get scanned + for (n = 0; n < B_PAGE; n++) + { + for (List *list = bucket[n]; list; list = list.next) + { + pool = findPool(list); + assert(pool); + pool.freebits.set(cast(size_t)(cast(byte*)list - pool.baseAddr) / 16); + } + } + + for (n = 0; n < npools; n++) + { + pool = pooltable[n]; + pool.mark.copy(&pool.freebits); + } + + version (MULTI_THREADED) + { + if (!noStack) + { + // Scan stacks and registers for each paused thread + thread_scanAll( &mark, stackTop ); + } + } + else + { + if (!noStack) + { + // Scan stack for main thread + debug(PRINTF) printf(" scan stack bot = %x, top = %x\n", stackTop, stackBottom); + version (STACKGROWSDOWN) + mark(stackTop, stackBottom); + else + mark(stackBottom, stackTop); + } + } + + // Scan roots[] + 
debug(COLLECT_PRINTF) printf("scan roots[]\n"); + mark(roots, roots + nroots); + + // Scan ranges[] + debug(COLLECT_PRINTF) printf("scan ranges[]\n"); + //log++; + for (n = 0; n < nranges; n++) + { + debug(COLLECT_PRINTF) printf("\t%x .. %x\n", ranges[n].pbot, ranges[n].ptop); + mark(ranges[n].pbot, ranges[n].ptop); + } + //log--; + + debug(COLLECT_PRINTF) printf("\tscan heap\n"); + while (anychanges) + { + anychanges = 0; + for (n = 0; n < npools; n++) + { + uint *bbase; + uint *b; + uint *btop; + + pool = pooltable[n]; + + bbase = pool.scan.base(); + btop = bbase + pool.scan.nwords; + for (b = bbase; b < btop;) + { Bins bin; + size_t pn; + size_t u; + size_t bitm; + byte* o; + + bitm = *b; + if (!bitm) + { b++; + continue; + } + *b = 0; + + o = pool.baseAddr + (b - bbase) * 32 * 16; + if (!(bitm & 0xFFFF)) + { + bitm >>= 16; + o += 16 * 16; + } + for (; bitm; o += 16, bitm >>= 1) + { + if (!(bitm & 1)) + continue; + + pn = cast(size_t)(o - pool.baseAddr) / PAGESIZE; + bin = cast(Bins)pool.pagetable[pn]; + if (bin < B_PAGE) + { + mark(o, o + binsize[bin]); + } + else if (bin == B_PAGE || bin == B_PAGEPLUS) + { + if (bin == B_PAGEPLUS) + { + while (pool.pagetable[pn - 1] != B_PAGE) + pn--; + } + u = 1; + while (pn + u < pool.ncommitted && pool.pagetable[pn + u] == B_PAGEPLUS) + u++; + mark(o, o + u * PAGESIZE); + } + } + } + } + } + + thread_resumeAll(); + + // Free up everything not marked + debug(COLLECT_PRINTF) printf("\tfree'ing\n"); + size_t freedpages = 0; + size_t freed = 0; + for (n = 0; n < npools; n++) + { size_t pn; + size_t ncommitted; + uint* bbase; + + pool = pooltable[n]; + bbase = pool.mark.base(); + ncommitted = pool.ncommitted; + for (pn = 0; pn < ncommitted; pn++, bbase += PAGESIZE / (32 * 16)) + { + Bins bin = cast(Bins)pool.pagetable[pn]; + + if (bin < B_PAGE) + { byte* p; + byte* ptop; + size_t biti; + size_t bitstride; + auto size = binsize[bin]; + + p = pool.baseAddr + pn * PAGESIZE; + ptop = p + PAGESIZE; + biti = pn * (PAGESIZE/16); + 
bitstride = size / 16; + + version(none) // BUG: doesn't work because freebits() must also be cleared + { + // If free'd entire page + if (bbase[0] == 0 && bbase[1] == 0 && bbase[2] == 0 && bbase[3] == 0 && + bbase[4] == 0 && bbase[5] == 0 && bbase[6] == 0 && bbase[7] == 0) + { + for (; p < ptop; p += size, biti += bitstride) + { + if (pool.finals.nbits && pool.finals.testClear(biti)) + rt_finalize(cast(List *)sentinel_add(p), false/*noStack > 0*/); + gcx.clrBits(pool, biti, BlkAttr.ALL_BITS); + + List *list = cast(List *)p; + //debug(PRINTF) printf("\tcollecting %x\n", list); + log_free(sentinel_add(list)); + + debug (MEMSTOMP) memset(p, 0xF3, size); + } + pool.pagetable[pn] = B_FREE; + freed += PAGESIZE; + //debug(PRINTF) printf("freeing entire page %d\n", pn); + continue; + } + } + for (; p < ptop; p += size, biti += bitstride) + { + if (!pool.mark.test(biti)) + { + sentinel_Invariant(sentinel_add(p)); + + pool.freebits.set(biti); + if (pool.finals.nbits && pool.finals.testClear(biti)) + rt_finalize(cast(List *)sentinel_add(p), false/*noStack > 0*/); + clrBits(pool, biti, BlkAttr.ALL_BITS); + + List *list = cast(List *)p; + debug(PRINTF) printf("\tcollecting %x\n", list); + log_free(sentinel_add(list)); + + debug (MEMSTOMP) memset(p, 0xF3, size); + + freed += size; + } + } + } + else if (bin == B_PAGE) + { size_t biti = pn * (PAGESIZE / 16); + + if (!pool.mark.test(biti)) + { byte *p = pool.baseAddr + pn * PAGESIZE; + + sentinel_Invariant(sentinel_add(p)); + if (pool.finals.nbits && pool.finals.testClear(biti)) + rt_finalize(sentinel_add(p), false/*noStack > 0*/); + clrBits(pool, biti, BlkAttr.ALL_BITS); + + debug(COLLECT_PRINTF) printf("\tcollecting big %x\n", p); + log_free(sentinel_add(p)); + pool.pagetable[pn] = B_FREE; + freedpages++; + debug (MEMSTOMP) memset(p, 0xF3, PAGESIZE); + while (pn + 1 < ncommitted && pool.pagetable[pn + 1] == B_PAGEPLUS) + { + pn++; + pool.pagetable[pn] = B_FREE; + freedpages++; + + debug (MEMSTOMP) + { p += PAGESIZE; + memset(p, 
0xF3, PAGESIZE); + } + } + } + } + } + } + + // Zero buckets + bucket[] = null; + + // Free complete pages, rebuild free list + debug(COLLECT_PRINTF) printf("\tfree complete pages\n"); + size_t recoveredpages = 0; + for (n = 0; n < npools; n++) + { size_t pn; + size_t ncommitted; + + pool = pooltable[n]; + ncommitted = pool.ncommitted; + for (pn = 0; pn < ncommitted; pn++) + { + Bins bin = cast(Bins)pool.pagetable[pn]; + size_t biti; + size_t u; + + if (bin < B_PAGE) + { + size_t size = binsize[bin]; + size_t bitstride = size / 16; + size_t bitbase = pn * (PAGESIZE / 16); + size_t bittop = bitbase + (PAGESIZE / 16); + byte* p; + + biti = bitbase; + for (biti = bitbase; biti < bittop; biti += bitstride) + { if (!pool.freebits.test(biti)) + goto Lnotfree; + } + pool.pagetable[pn] = B_FREE; + recoveredpages++; + continue; + + Lnotfree: + p = pool.baseAddr + pn * PAGESIZE; + for (u = 0; u < PAGESIZE; u += size) + { biti = bitbase + u / 16; + if (pool.freebits.test(biti)) + { List *list; + + list = cast(List *)(p + u); + if (list.next != bucket[bin]) // avoid unnecessary writes + list.next = bucket[bin]; + bucket[bin] = list; + } + } + } + } + } + + debug(COLLECT_PRINTF) printf("recovered pages = %d\n", recoveredpages); + debug(COLLECT_PRINTF) printf("\tfree'd %u bytes, %u pages from %u pools\n", freed, freedpages, npools); + + return freedpages + recoveredpages; + } + + + /** + * + */ + uint getBits(Pool* pool, size_t biti) + in + { + assert( pool ); + } + body + { + uint bits; + + if (pool.finals.nbits && + pool.finals.test(biti)) + bits |= BlkAttr.FINALIZE; + if (pool.noscan.test(biti)) + bits |= BlkAttr.NO_SCAN; +// if (pool.nomove.nbits && +// pool.nomove.test(biti)) +// bits |= BlkAttr.NO_MOVE; + return bits; + } + + + /** + * + */ + void setBits(Pool* pool, size_t biti, uint mask) + in + { + assert( pool ); + } + body + { + if (mask & BlkAttr.FINALIZE) + { + if (!pool.finals.nbits) + pool.finals.alloc(pool.mark.nbits); + pool.finals.set(biti); + } + if (mask & 
BlkAttr.NO_SCAN) + { + pool.noscan.set(biti); + } +// if (mask & BlkAttr.NO_MOVE) +// { +// if (!pool.nomove.nbits) +// pool.nomove.alloc(pool.mark.nbits); +// pool.nomove.set(biti); +// } + } + + + /** + * + */ + void clrBits(Pool* pool, size_t biti, uint mask) + in + { + assert( pool ); + } + body + { + if (mask & BlkAttr.FINALIZE && pool.finals.nbits) + pool.finals.clear(biti); + if (mask & BlkAttr.NO_SCAN) + pool.noscan.clear(biti); +// if (mask & BlkAttr.NO_MOVE && pool.nomove.nbits) +// pool.nomove.clear(biti); + } + + + /***** Leak Detector ******/ + + + debug (LOGGING) + { + LogArray current; + LogArray prev; + + + void log_init() + { + //debug(PRINTF) printf("+log_init()\n"); + current.reserve(1000); + prev.reserve(1000); + //debug(PRINTF) printf("-log_init()\n"); + } + + + void log_malloc(void *p, size_t size) + { + //debug(PRINTF) printf("+log_malloc(p = %x, size = %d)\n", p, size); + Log log; + + log.p = p; + log.size = size; + log.line = GC.line; + log.file = GC.file; + log.parent = null; + + GC.line = 0; + GC.file = null; + + current.push(log); + //debug(PRINTF) printf("-log_malloc()\n"); + } + + + void log_free(void *p) + { + //debug(PRINTF) printf("+log_free(%x)\n", p); + size_t i; + + i = current.find(p); + if (i == OPFAIL) + { + debug(PRINTF) printf("free'ing unallocated memory %x\n", p); + } + else + current.remove(i); + //debug(PRINTF) printf("-log_free()\n"); + } + + + void log_collect() + { + //debug(PRINTF) printf("+log_collect()\n"); + // Print everything in current that is not in prev + + debug(PRINTF) printf("New pointers this cycle: --------------------------------\n"); + size_t used = 0; + for (size_t i = 0; i < current.dim; i++) + { + size_t j; + + j = prev.find(current.data[i].p); + if (j == OPFAIL) + current.data[i].print(); + else + used++; + } + + debug(PRINTF) printf("All roots this cycle: --------------------------------\n"); + for (size_t i = 0; i < current.dim; i++) + { + void *p; + size_t j; + + p = current.data[i].p; + if 
(!findPool(current.data[i].parent)) + { + j = prev.find(current.data[i].p); + if (j == OPFAIL) + debug(PRINTF) printf("N"); + else + debug(PRINTF) printf(" ");; + current.data[i].print(); + } + } + + debug(PRINTF) printf("Used = %d-------------------------------------------------\n", used); + prev.copy(&current); + + debug(PRINTF) printf("-log_collect()\n"); + } + + + void log_parent(void *p, void *parent) + { + //debug(PRINTF) printf("+log_parent()\n"); + size_t i; + + i = current.find(p); + if (i == OPFAIL) + { + debug(PRINTF) printf("parent'ing unallocated memory %x, parent = %x\n", p, parent); + Pool *pool; + pool = findPool(p); + assert(pool); + size_t offset = cast(size_t)(p - pool.baseAddr); + size_t biti; + size_t pn = offset / PAGESIZE; + Bins bin = cast(Bins)pool.pagetable[pn]; + biti = (offset & notbinsize[bin]); + debug(PRINTF) printf("\tbin = %d, offset = x%x, biti = x%x\n", bin, offset, biti); + } + else + { + current.data[i].parent = parent; + } + //debug(PRINTF) printf("-log_parent()\n"); + } + + } + else + { + void log_init() { } + void log_malloc(void *p, size_t size) { } + void log_free(void *p) { } + void log_collect() { } + void log_parent(void *p, void *parent) { } + } +} + + +/* ============================ Pool =============================== */ + + +struct Pool +{ + byte* baseAddr; + byte* topAddr; + GCBits mark; // entries already scanned, or should not be scanned + GCBits scan; // entries that need to be scanned + GCBits freebits; // entries that are on the free list + GCBits finals; // entries that need finalizer run on them + GCBits noscan; // entries that should not be scanned + + size_t npages; + size_t ncommitted; // ncommitted <= npages + ubyte* pagetable; + + + void initialize(size_t npages) + { + size_t poolsize; + + //debug(PRINTF) printf("Pool::Pool(%u)\n", npages); + poolsize = npages * PAGESIZE; + assert(poolsize >= POOLSIZE); + baseAddr = cast(byte *)os_mem_map(poolsize); + + // Some of the code depends on page alignment of memory 
pools + assert((cast(size_t)baseAddr & (PAGESIZE - 1)) == 0); + + if (!baseAddr) + { + //debug(PRINTF) printf("GC fail: poolsize = x%x, errno = %d\n", poolsize, errno); + //debug(PRINTF) printf("message = '%s'\n", sys_errlist[errno]); + + npages = 0; + poolsize = 0; + } + //assert(baseAddr); + topAddr = baseAddr + poolsize; + + mark.alloc(cast(size_t)poolsize / 16); + scan.alloc(cast(size_t)poolsize / 16); + freebits.alloc(cast(size_t)poolsize / 16); + noscan.alloc(cast(size_t)poolsize / 16); + + pagetable = cast(ubyte*)cstdlib.malloc(npages); + if (!pagetable) + onOutOfMemoryError(); + memset(pagetable, B_UNCOMMITTED, npages); + + this.npages = npages; + ncommitted = 0; + } + + + void Dtor() + { + if (baseAddr) + { + int result; + + if (ncommitted) + { + result = os_mem_decommit(baseAddr, 0, ncommitted * PAGESIZE); + assert(result == 0); + ncommitted = 0; + } + + if (npages) + { + result = os_mem_unmap(baseAddr, npages * PAGESIZE); + assert(result == 0); + npages = 0; + } + + baseAddr = null; + topAddr = null; + } + if (pagetable) + cstdlib.free(pagetable); + + mark.Dtor(); + scan.Dtor(); + freebits.Dtor(); + finals.Dtor(); + noscan.Dtor(); + } + + + void Invariant() { } + + + invariant() + { + //mark.Invariant(); + //scan.Invariant(); + //freebits.Invariant(); + //finals.Invariant(); + //noscan.Invariant(); + + if (baseAddr) + { + //if (baseAddr + npages * PAGESIZE != topAddr) + //printf("baseAddr = %p, npages = %d, topAddr = %p\n", baseAddr, npages, topAddr); + assert(baseAddr + npages * PAGESIZE == topAddr); + assert(ncommitted <= npages); + } + + for (size_t i = 0; i < npages; i++) + { Bins bin = cast(Bins)pagetable[i]; + + assert(bin < B_MAX); + } + } + + + /** + * Allocate n pages from Pool. + * Returns OPFAIL on failure. 
+ */ + size_t allocPages(size_t n) + { + size_t i; + size_t n2; + + //debug(PRINTF) printf("Pool::allocPages(n = %d)\n", n); + n2 = n; + for (i = 0; i < ncommitted; i++) + { + if (pagetable[i] == B_FREE) + { + if (--n2 == 0) + { //debug(PRINTF) printf("\texisting pn = %d\n", i - n + 1); + return i - n + 1; + } + } + else + n2 = n; + } + return extendPages(n); + } + + /** + * Extend Pool by n pages. + * Returns OPFAIL on failure. + */ + size_t extendPages(size_t n) + { + //debug(PRINTF) printf("Pool::extendPages(n = %d)\n", n); + if (ncommitted + n <= npages) + { + size_t tocommit; + + tocommit = (n + (COMMITSIZE/PAGESIZE) - 1) & ~(COMMITSIZE/PAGESIZE - 1); + if (ncommitted + tocommit > npages) + tocommit = npages - ncommitted; + //debug(PRINTF) printf("\tlooking to commit %d more pages\n", tocommit); + //fflush(stdout); + if (os_mem_commit(baseAddr, ncommitted * PAGESIZE, tocommit * PAGESIZE) == 0) + { + memset(pagetable + ncommitted, B_FREE, tocommit); + auto i = ncommitted; + ncommitted += tocommit; + + while (i && pagetable[i - 1] == B_FREE) + i--; + + return i; + } + //debug(PRINTF) printf("\tfailed to commit %d pages\n", tocommit); + } + + return OPFAIL; + } + + + /** + * Free npages pages starting with pagenum. 
+ */ + void freePages(size_t pagenum, size_t npages) + { + memset(&pagetable[pagenum], B_FREE, npages); + } + + + /** + * Used for sorting pooltable[] + */ + int opCmp(Pool *p2) + { + if (baseAddr < p2.baseAddr) + return -1; + else + return cast(int)(baseAddr > p2.baseAddr); + } +} + + +/* ============================ SENTINEL =============================== */ + + +version (SENTINEL) +{ + const size_t SENTINEL_PRE = cast(size_t) 0xF4F4F4F4F4F4F4F4UL; // 32 or 64 bits + const ubyte SENTINEL_POST = 0xF5; // 8 bits + const uint SENTINEL_EXTRA = 2 * size_t.sizeof + 1; + + + size_t* sentinel_size(void *p) { return &(cast(size_t *)p)[-2]; } + size_t* sentinel_pre(void *p) { return &(cast(size_t *)p)[-1]; } + ubyte* sentinel_post(void *p) { return &(cast(ubyte *)p)[*sentinel_size(p)]; } + + + void sentinel_init(void *p, size_t size) + { + *sentinel_size(p) = size; + *sentinel_pre(p) = SENTINEL_PRE; + *sentinel_post(p) = SENTINEL_POST; + } + + + void sentinel_Invariant(void *p) + { + assert(*sentinel_pre(p) == SENTINEL_PRE); + assert(*sentinel_post(p) == SENTINEL_POST); + } + + + void *sentinel_add(void *p) + { + return p + 2 * size_t.sizeof; + } + + + void *sentinel_sub(void *p) + { + return p - 2 * size_t.sizeof; + } +} +else +{ + const uint SENTINEL_EXTRA = 0; + + + void sentinel_init(void *p, size_t size) + { + } + + + void sentinel_Invariant(void *p) + { + } + + + void *sentinel_add(void *p) + { + return p; + } + + + void *sentinel_sub(void *p) + { + return p; + } +} diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/gc/basic/posix.mak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/gc/basic/posix.mak Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,63 @@ +# Makefile to build the garbage collector D library for Posix +# Designed to work with GNU make +# Targets: +# make +# Same as make all +# make debug +# Build the debug version of the library +# make release +# Build the release version of the library +# make doc +# Generate documentation +# make clean +# 
Delete all files created by build process + +# Essentials + +LIBDIR=../../../lib +DOCDIR=../../../doc +IMPDIR=../../../import +LIBBASENAME=libdruntime-gc-basic.a +MODULES=gc gcalloc gcbits gcstats gcx +BUILDS=debug release unittest + +# Symbols + +DMD=dmd +DOCFLAGS=-version=DDoc +DFLAGS_release=-d -release -O -inline -w -nofloat +DFLAGS_debug=-d -g -w -nofloat +DFLAGS_unittest=$(DFLAGS_release) -unittest +CFLAGS_release=-m32 -O +CFLAGS_debug=-m32 -g +CFLAGS_unittest=$(CFLAGS_release) + +# Derived symbols + +SRCS=$(addsuffix .d,$(MODULES)) +DOCS= +IMPORTS= +ALLLIBS=$(addsuffix /$(LIBBASENAME),$(addprefix $(LIBDIR)/,$(BUILDS))) + +# Patterns + +$(LIBDIR)/%/$(LIBBASENAME) : $(SRCS) + $(DMD) $(DFLAGS_$*) -lib -of$@ $^ + +$(DOCDIR)/%.html : %.d + $(DMD) -c -d -o- -Df$@ $< + +$(IMPDIR)/%.di : %.d + $(DMD) -c -d -o- -Hf$@ $< + +# Rulez + +all : $(BUILDS) doc + +debug : $(LIBDIR)/debug/$(LIBBASENAME) $(IMPORTS) +release : $(LIBDIR)/release/$(LIBBASENAME) $(IMPORTS) +unittest : $(LIBDIR)/unittest/$(LIBBASENAME) $(IMPORTS) +#doc : $(DOCS) + +clean : + rm -f $(IMPORTS) $(DOCS) $(ALLLIBS) diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/gc/basic/win32.mak --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/gc/basic/win32.mak Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,82 @@ +# Makefile to build the garbage collector D library for Win32 +# Designed to work with DigitalMars make +# Targets: +# make +# Same as make all +# make debug +# Build the debug version of the library +# make release +# Build the release version of the library +# make doc +# Generate documentation +# make clean +# Delete all files created by build process + +# Essentials + +LIBDIR=..\..\..\lib +DOCDIR=..\..\..\doc +IMPDIR=..\..\..\import +LIBBASENAME=druntime_gc_basic.lib +#MODULES=gc gcalloc gcbits gcstats gcx +BUILDS=debug release unittest + +# Symbols + +CC=dmc +DMD=dmd +DOCFLAGS=-version=DDoc +DFLAGS_release=-d -release -O -inline -w -nofloat +DFLAGS_debug=-d -g -w -nofloat 
+DFLAGS_unittest=$(DFLAGS_release) -unittest +CFLAGS_release=-mn -6 -r +CFLAGS_debug=-g -mn -6 -r +CFLAGS_unittest=$(CFLAGS_release) + +# Derived symbols + +SRCS=gc.d gcalloc.d gcbits.d gcstats.d gcx.d +DOCS= +IMPORTS= +ALLLIBS=\ + $(LIBDIR)\debug\$(LIBBASENAME) \ + $(LIBDIR)\release\$(LIBBASENAME) \ + $(LIBDIR)\unittest\$(LIBBASENAME) + +# Patterns + +#$(LIBDIR)\%\$(LIBBASENAME) : $(SRCS) +# $(DMD) $(DFLAGS_$*) -lib -of$@ $^ + +#$(DOCDIR)\%.html : %.d +# $(DMD) -c -d -o- -Df$@ $< + +#$(IMPDIR)\%.di : %.d +# $(DMD) -c -d -o- -Hf$@ $< + +# Patterns - debug + +$(LIBDIR)\debug\$(LIBBASENAME) : $(SRCS) + $(DMD) $(DFLAGS_debug) -lib -of$@ $** + +# Patterns - release + +$(LIBDIR)\release\$(LIBBASENAME) : $(SRCS) + $(DMD) $(DFLAGS_release) -lib -of$@ $** + +# Patterns - unittest + +$(LIBDIR)\unittest\$(LIBBASENAME) : $(SRCS) + $(DMD) $(DFLAGS_unittest) -lib -of$@ $** + +# Rulez + +all : $(BUILDS) doc + +debug : $(LIBDIR)\debug\$(LIBBASENAME) $(IMPORTS) +release : $(LIBDIR)\release\$(LIBBASENAME) $(IMPORTS) +unittest : $(LIBDIR)\unittest\$(LIBBASENAME) $(IMPORTS) +#doc : $(DOCS) + +clean : + del $(IMPORTS) $(DOCS) $(ALLLIBS) diff -r 7b218ec1044f -r e0b2d67cfe7c druntime/src/gc/stub/gc.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/druntime/src/gc/stub/gc.d Tue Jun 02 17:43:06 2009 +0100 @@ -0,0 +1,359 @@ +/** + * This module contains a minimal garbage collector implementation according to + * published requirements. This library is mostly intended to serve as an + * example, but it is usable in applications which do not rely on a garbage + * collector to clean up memory (ie. when dynamic array resizing is not used, + * and all memory allocated with 'new' is freed deterministically with + * 'delete'). + * + * Please note that block attribute data must be tracked, or at a minimum, the + * FINALIZE bit must be tracked for any allocated memory block because calling + * rt_finalize on a non-object block can result in an access violation. 
In the + * allocator below, this tracking is done via a leading uint bitmask. A real + * allocator may do better to store this data separately, similar to the basic + * GC. + * + * Copyright: Copyright Sean Kelly 2005 - 2009. + * License: