comparison dwt/dwthelper/XmlTranscode.d @ 198:be4ce760802a

Add: missing files
author Frank Benoit <benoit@tionex.de>
date Thu, 10 Apr 2008 11:20:26 +0200
parents
children ab60f3309436
comparison
equal deleted inserted replaced
197:184ab53b7785 198:be4ce760802a
1 module dwt.dwthelper.XmlTranscode;
2
3 import dwt.dwthelper.utils;
4 import tango.core.Exception;
5
/++
 + Decode XML entities into UTF8 string.
 + Eg. "&amp;" -> "&", "&#38;" -> "&", "&#x26;" -> "&"
 +
 + Recognized are the five predefined entities (lt, gt, amp, quot, apos),
 + decimal character references (&#NNN;) and hexadecimal character
 + references (&#xHHH;, upper or lower case digits).
 +
 + The given string is modified: decoding happens in place and the returned
 + array is a slice of str starting at index 0. This is safe because every
 + entity is at least as long as the UTF-8 encoding of the character it
 + stands for.
 +
 + Throws TextException on failure: unterminated or empty entity, unknown
 + entity name, malformed digits, or a value that is not a valid Unicode
 + code point (above U+10FFFF or a surrogate).
 +/
char[] xmlUnescape( char[] str ){

    void error(){
        throw new TextException( "xmlUnescape" );
    }

    // &lt; ...
    // &#1234;
    // &#x12AF;
    size_t rd = 0; // next char to read
    size_t wr = 0; // next char to write, never ahead of rd
    while( rd < str.length ){
        if( str[rd] != '&' ){
            str[wr] = str[rd];
            wr++;
            rd++;
            continue;
        }

        rd++; // go past '&'
        size_t remaining = str.length - rd;
        if( remaining < 2 ) error();

        // search semi, limited to the longest possible entity name
        // ("#x10FFFF" has 8 chars) so that a stray '&' fails fast
        size_t limit = rd + ( remaining < 10 ? remaining : 10 );
        size_t semi = rd;
        while( semi < limit && str[semi] != ';' ){
            semi++;
        }
        if( semi == limit ) error(); // no semi found

        char[] entityName = str[ rd .. semi ]; // name without '&' and semi
        if( entityName.length == 0 ) error();  // "&;"

        uint value = 0;
        switch( entityName ){
            case "lt":   value = '<';  break;
            case "gt":   value = '>';  break;
            case "amp":  value = '&';  break;
            case "quot": value = '\"'; break;
            case "apos": value = '\''; break;
            default:
                // everything else must be a numeric character reference
                if( entityName[0] != '#' ) error();
                char[] digits = entityName[ 1 .. $ ];
                if( digits.length > 0 && digits[0] == 'x' ){
                    digits = digits[ 1 .. $ ];
                    if( digits.length == 0 ) error();
                    // at most 7 hex digits fit in the name, no overflow possible
                    foreach( char hex; digits ){
                        uint nibble = 0;
                        if( hex >= '0' && hex <= '9' ){
                            nibble = hex - '0';
                        }
                        else if( hex >= 'a' && hex <= 'f' ){
                            nibble = hex - 'a' + 10;
                        }
                        else if( hex >= 'A' && hex <= 'F' ){
                            nibble = hex - 'A' + 10;
                        }
                        else{
                            error();
                        }
                        value = ( value << 4 ) | nibble;
                    }
                }
                else{
                    if( digits.length == 0 ) error();
                    // at most 8 decimal digits fit in the name, no overflow possible
                    foreach( char dec; digits ){
                        if( dec < '0' || dec > '9' ) error();
                        value = value * 10 + ( dec - '0' );
                    }
                }
                break;
        }

        // reject values that cannot be encoded as UTF-8
        if( value > 0x10FFFF ) error();
        if( value >= 0xD800 && value <= 0xDFFF ) error();

        // Encode the code point as UTF-8 directly into the output region.
        // The entity text has been fully parsed, so overwriting it is fine.
        if( value < 0x80 ){
            str[wr++] = cast(char) value;
        }
        else if( value < 0x800 ){
            str[wr++] = cast(char)( 0xC0 | ( value >> 6 ));
            str[wr++] = cast(char)( 0x80 | ( value & 0x3F ));
        }
        else if( value < 0x10000 ){
            str[wr++] = cast(char)( 0xE0 | ( value >> 12 ));
            str[wr++] = cast(char)( 0x80 | (( value >> 6 ) & 0x3F ));
            str[wr++] = cast(char)( 0x80 | ( value & 0x3F ));
        }
        else{
            str[wr++] = cast(char)( 0xF0 | ( value >> 18 ));
            str[wr++] = cast(char)( 0x80 | (( value >> 12 ) & 0x3F ));
            str[wr++] = cast(char)( 0x80 | (( value >> 6 ) & 0x3F ));
            str[wr++] = cast(char)( 0x80 | ( value & 0x3F ));
        }

        rd = semi + 1; // go past semi
    }
    return str[ 0 .. wr ];
}
89
90
/++
 + Encode XML entities into UTF8 string.
 + First checks if processing is needed.
 + If not, the original string is returned.
 + If processing is needed, a new string is allocated.
 +
 + '<', '>', '&', '"' and '\'' become the predefined named entities.
 + CR, LF, TAB and every character above 0x7F become a hexadecimal
 + character reference (&#xHH;) with 2, 4, 6 or 8 upper case hex digits,
 + depending on the magnitude of the code point.
 +/
char[] xmlEscape( char[] xml ){

    bool needsReplacement( dchar c ){
        switch( c ){
            case '<':
            case '>':
            case '&':
            case '\"':
            case '\'':
            case '\r':
            case '\n':
            case '\t':
                return true;
            default:
                return c > 0x7F;
        }
    }

    char toHexDigit( uint i ){
        return cast(char)( i < 10 ? '0' + i : 'A' + i - 10 );
    }

    // Check if processing is needed. Scanning UTF-8 code units is enough
    // here, every unit of a multi byte sequence is above 0x7F.
    bool dirty = false;
    foreach( char c; xml ){
        if( needsReplacement( c )){
            dirty = true;
            break;
        }
    }
    if( !dirty ){
        return xml;
    }

    // yes, do a new string, start with +20 chars
    // (allocate, then reset the length so appending reuses the buffer)
    char[] res = new char[ xml.length + 20 ];
    res.length = 0;

    foreach( dchar c; xml ){

        if( !needsReplacement( c )){
            // only plain ASCII gets here, so one code unit is enough
            res ~= cast(char) c;
            continue;
        }

        res ~= '&';
        switch( c ){
            case '<':  res ~= "lt";   break;
            case '>':  res ~= "gt";   break;
            case '&':  res ~= "amp";  break;
            case '\"': res ~= "quot"; break;
            case '\'': res ~= "apos"; break;
            default:
                // CR, LF, TAB and all non ASCII characters
                uint code = c;
                int digits = 8;
                if( code <= 0xFF ){
                    digits = 2;
                }
                else if( code <= 0xFFFF ){
                    digits = 4;
                }
                else if( code <= 0xFFFFFF ){
                    digits = 6;
                }
                res ~= "#x";
                for( int shift = ( digits - 1 ) * 4; shift >= 0; shift -= 4 ){
                    res ~= toHexDigit(( code >> shift ) & 0x0F );
                }
                break;
        }
        res ~= ';';
    }
    return res;
}
181