# HG changeset patch
# User Frank Benoit
# Date 1204477992 -3600
# Node ID e4e5dea57644980a2a0d1c4518360d3b3c17e0e7
# Parent 285aa0c31277401c3d12c7ef4bf3eeadff29cc73
Add several functions to support utf8 stuff

diff -r 285aa0c31277 -r e4e5dea57644 dwt/dwthelper/utils.d
--- a/dwt/dwthelper/utils.d	Sun Mar 02 18:12:27 2008 +0100
+++ b/dwt/dwthelper/utils.d	Sun Mar 02 18:13:12 2008 +0100
@@ -14,6 +14,8 @@
 import tango.core.Exception;
 import tango.stdc.stdlib : exit;
 
+import tango.util.log.Trace;
+
 void implMissing( char[] file, uint line ){
     Stderr.formatln( "implementation missing in file {} line {}", file, line );
     Stderr.formatln( "exiting ..." );
@@ -46,11 +48,193 @@
 alias ArrayWrapperT!(char) ArrayWrapperString;
 alias ArrayWrapperT!(char[]) ArrayWrapperString2;
 
-dchar getFirstCodepoint( char[] str ){
-    foreach( dchar d; str ){
-        return d;
+/// Returns the byte index in str at which the codepoint with index cpIndex starts.
+/// Only lead bytes are inspected; str is assumed to be well-formed UTF-8.
+int codepointIndexToIndex( char[] str, int cpIndex ){
+    int cps = cpIndex;
+    int res = 0;
+    while( cps > 0 ){
+        cps--;
+        if( str[res] < 0x80 ){
+            res+=1;
+        }
+        else if( str[res] < 0xE0 ){
+            res+=2;
+        }
+        // 0xE0..0xEF lead a 3 byte sequence; a bit test against 0xF0 would also match 0xF0..0xF7
+        else if( str[res] < 0xF0 ){
+            res+=3;
+        }
+        else{
+            res+=4;
+        }
+    }
+    return res;
+}
+/// Returns the number of codepoints that start before byte position index in str.
+/// Only lead bytes are inspected; str is assumed to be well-formed UTF-8.
+int indexToCodepointIndex( char[] str, int index ){
+    int i = 0;
+    int res = 0;
+    while( i < index ){
+        if( str[i] < 0x80 ){
+            i+=1;
+        }
+        else if( str[i] < 0xE0 ){
+            i+=2;
+        }
+        // 0xE0..0xEF lead a 3 byte sequence; a bit test against 0xF0 would also match 0xF0..0xF7
+        else if( str[i] < 0xF0 ){
+            i+=3;
+        }
+        else{
+            i+=4;
+        }
+        res++;
+    }
+    return res;
+}
+
+/// Returns the leading slice of str that toString32 consumed to fill a one element buffer.
+/// consumed receives the length of that slice in bytes.
+char[] firstCodePointStr( char[] str, out int consumed ){
+    dchar[1] buf;
+    uint ate;
+    dchar[] res = str.toString32( buf, &ate );
+    consumed = ate;
+    return str[ 0 .. ate ];
+}
+
+/// Returns the first codepoint of str.
+dchar firstCodePoint( char[] str ){
+    int dummy;
+    return firstCodePoint( str, dummy );
+}
+/// Decodes and returns the first codepoint of str; consumed receives the number of bytes eaten.
+/// Traces the raw input and then asserts when nothing could be decoded.
+dchar firstCodePoint( char[] str, out int consumed ){
+    dchar[1] buf;
+    uint ate;
+    dchar[] res = str.toString32( buf, &ate );
+    consumed = ate;
+    if( ate is 0 || res.length is 0 ){
+        Trace.formatln( "dwthelper.utils {}: str.length={} str={:X2}", __LINE__, str.length, cast(ubyte[])str );
     }
+    assert( ate > 0 );
+    assert( res.length is 1 );
+    return res[0];
 }
+
+/// Returns the UTF-8 encoding of the single codepoint key.
+char[] dcharToString( dchar key ){
+    dchar[1] buf;
+    buf[0] = key;
+    return tango.text.convert.Utf.toString( buf );
+}
+
+/// Returns the number of codepoints in str by converting it to UTF-32; traces the result.
+int codepointCount( char[] str ){
+    scope dchar[] buf = new dchar[]( str.length );
+    uint ate;
+    dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
+    Trace.formatln( "dwthelper.utils codepointCount {}: res.length={}", __LINE__, res.length );
+    assert( ate is str.length );
+    return res.length;
+}
+
+alias tango.text.convert.Utf.toString16 toString16;
+alias tango.text.convert.Utf.toString toString;
+
+/// Returns the byte offset, relative to startIndex, of the codepoint that lies
+/// searchRelCp codepoints after (positive) or before (negative) startIndex.
+/// Forward steps check the shape of each sequence; malformed input, or stepping
+/// outside of str, is reported through tango.text.convert.Utf.onUnicodeError.
+int getRelativeCodePointOffset( char[] str, int startIndex, int searchRelCp ){
+    int i = startIndex;
+    if( searchRelCp > 0 ){
+        while( searchRelCp !is 0 ){
+
+            if( ( i < str.length )
+                && ( str[i] & 0x80 ) is 0x00 )
+            {
+                i+=1;
+            }
+            else if( ( i+1 < str.length )
+                && (( str[i+1] & 0xC0 ) is 0x80 )
+                && (( str[i  ] & 0xE0 ) is 0xC0 ))
+            {
+                i+=2;
+            }
+            else if( ( i+2 < str.length )
+                && (( str[i+2] & 0xC0 ) is 0x80 )
+                && (( str[i+1] & 0xC0 ) is 0x80 )
+                && (( str[i  ] & 0xF0 ) is 0xE0 ))
+            {
+                i+=3;
+            }
+            else if(( i+3 < str.length )
+                && (( str[i+3] & 0xC0 ) is 0x80 )
+                && (( str[i+2] & 0xC0 ) is 0x80 )
+                && (( str[i+1] & 0xC0 ) is 0x80 )
+                && (( str[i  ] & 0xF8 ) is 0xF0 ))
+            {
+                i+=4;
+            }
+            else{
+                tango.text.convert.Utf.onUnicodeError( "invalid utf8 input", i );
+            }
+            searchRelCp--;
+        }
+    }
+    else if( searchRelCp < 0 ){
+        while( searchRelCp !is 0 ){
+            do{
+                i--;
+                if( i < 0 ){
+                    Trace.formatln( "dwthelper.utils getRelativeCodePointOffset {}: str={}, startIndex={}, searchRelCp={}", __LINE__, str, startIndex, searchRelCp );
+                    tango.text.convert.Utf.onUnicodeError( "invalid utf8 input", i );
+                }
+            } while(( str[i] & 0xC0 ) is 0x80 );
+            searchRelCp++;
+        }
+    }
+    return i - startIndex;
+}
+/// Returns the codepoint located searchRelCp codepoints away from startIndex.
+/// relIndex receives the byte offset of that codepoint relative to startIndex.
+dchar getRelativeCodePoint( char[] str, int startIndex, int searchRelCp, out int relIndex ){
+    relIndex = getRelativeCodePointOffset( str, startIndex, searchRelCp );
+    int ignore;
+    return firstCodePoint( str[ startIndex+relIndex .. $ ], ignore );
+}
+
+/// Moves offset back to the first byte of the UTF-8 sequence it points into.
+/// Offsets that are <= 0 or >= str.length are returned unchanged.
+int utf8AdjustOffset( char[] str, int offset ){
+    if( str.length <= offset || offset <= 0 ){
+        return offset;
+    }
+    // stop at index 0 so input that starts with trailing bytes cannot drive the index negative
+    while( offset > 0 && ( str[offset] & 0xC0 ) is 0x80 ){
+        offset--;
+    }
+    return offset;
+}
+
+/// Returns the lower case form of the first codepoint of str.
+dchar CharacterFirstToLower( char[] str ){
+    int consumed;
+    return CharacterFirstToLower( str, consumed );
+}
+/// Returns the lower case form of the first codepoint of str.
+/// consumed receives the number of bytes that codepoint occupies in str.
+dchar CharacterFirstToLower( char[] str, out int consumed ){
+    dchar[1] buf;
+    buf[0] = firstCodePoint( str, consumed );
+    dchar[] r = tango.text.Unicode.toLower( buf );
+    return r[0];
+}
+
 dchar CharacterToLower( dchar c ){
     dchar[] r = tango.text.Unicode.toLower( [c] );
     return r[0];