223
|
1 module dwt.dwthelper.XmlTranscode;
|
|
2
|
224
|
3 import dwt.dwthelper.utils;
|
|
4 import tango.core.Exception;
|
|
5
|
223
|
/++
 + Decode XML entities into UTF8 string.
 + Eg. "&amp;" -> "&", "&#38;" -> "&", "&#x26;" -> "&"
 +
 + Recognised are the five predefined entities (lt, gt, amp, quot, apos),
 + decimal character references "&#NNN;" and hexadecimal character
 + references "&#xHHH;". For compatibility with earlier versions of this
 + function the non-standard form "&xHHH;" (hex without '#') is accepted too.
 +
 + Throws TextException on failure
 + The given string is modified: decoding happens in place and the returned
 + value is a slice of str starting at its first character.
 +/
char[] xmlUnescape( char[] str ){

    void error(){
        throw new TextException( "xmlUnescape" );
    }

    // Numeric value (0..15) of one hexadecimal digit, error() for anything else.
    uint hexDigitValue( char hex ){
        if( hex >= '0' && hex <= '9' ) return hex - '0';
        if( hex >= 'a' && hex <= 'f' ) return hex - 'a' + 10;
        if( hex >= 'A' && hex <= 'F' ) return hex - 'A' + 10;
        error();
        return 0; // not reached, error() always throws
    }

    // &lt; ...
    // &#1234;
    // &#x12AF;
    char[] src = str; // remaining, not yet decoded input
    char[] trg = str; // write position inside the same buffer
    while( src.length ){
        if( src[0] !is '&' ){
            // plain character, copy through
            trg[0] = src[0];
            trg = trg[1..$];
            src = src[1..$];
        }
        else{
            src = src[1..$]; // go past '&'
            if( src.length < 2 ) error();

            // search semi
            int len = Math.min( src.length, 10 ); // limit semi search to possible longest entityname
            int semi = tango.text.Util.locate( src[0 .. len ], ';' );
            if( semi is len ) error(); // no semi found
            if( semi is 0 ) error();   // "&;" carries no entity name at all

            char[] entityName = src[ 0 .. semi ]; // name without semi
            dchar entityValue = 0;
            switch( entityName ){
            case "lt":   entityValue = '<';  break;
            case "gt":   entityValue = '>';  break;
            case "amp":  entityValue = '&';  break;
            case "quot": entityValue = '\"'; break;
            case "apos": entityValue = '\''; break;
            default:
                // Numeric character reference: strip the prefix, then parse the digits.
                char[] digits = entityName;
                bool isHex = false;
                if( digits[0] is '#' ){
                    digits = digits[1..$];
                    if( digits.length && digits[0] is 'x' ){
                        isHex = true;
                        digits = digits[1..$];
                    }
                }
                else if( digits[0] is 'x' ){
                    // legacy hex form without '#', kept for backward compatibility
                    isHex = true;
                    digits = digits[1..$];
                }
                else{
                    error(); // neither a predefined entity nor a character reference
                }
                if( digits.length is 0 ) error(); // prefix without any digit

                // The name is at most 9 chars long (see len above), so at most
                // 8 digits are accumulated and the value fits into 32 bit.
                foreach( digit; digits ){
                    if( isHex ){
                        entityValue <<= 4;
                        entityValue |= hexDigitValue( digit );
                    }
                    else if( digit >= '0' && digit <= '9' ){
                        entityValue *= 10;
                        entityValue += ( digit - '0' );
                    }
                    else{
                        error();
                    }
                }
                break;
            }

            // Encode the code point as UTF8 directly into the target position.
            // Every accepted reference is longer than the UTF8 encoding it
            // expands to, so the write position stays behind the read position.
            dchar[1] arr;
            arr[0] = entityValue;
            uint ate = 0;
            char[] res = tango.text.convert.Utf.toString( arr, trg, &ate );
            trg = trg[ res.length .. $ ];
            src = src[ semi +1 .. $ ]; // go past semi
        }
    }
    // everything before the write position is the decoded text
    return str[ 0 .. trg.ptr-str.ptr ];
}
|
|
89
|
|
90
|
|
/++
 + Encode XML entities into UTF8 string.
 + First checks if processing is needed.
 + If not, the original string is returned.
 + If processing is needed, a new string is allocated.
 +
 + '<', '>', '&', '"' and '\'' are written as the named entities
 + lt, gt, amp, quot and apos. CR, LF, TAB and every code point above 0x7F
 + are written as an upper case hexadecimal character reference "&#x..;"
 + with 2, 4, 6 or 8 digits, depending on the magnitude of the code point.
 +/
char[] xmlEscape( char[] xml ){
    // true if c cannot be copied to the output as is
    bool needsReplacement( dchar c ){
        switch( c ){
        case '<':
        case '>':
        case '&':
        case '\"':
        case '\'':
        case '\r':
        case '\n':
        case '\u0009':
            return true;
        default:
            return c > 0x7F;
        }
    }

    // upper case hex digit for a value in 0..15
    char toHexDigit( int i ){
        if( i < 10 ) return cast(char)( '0'+i );
        return cast(char)( 'A'+i-10 );
    }

    // Check if processing is needed. Scanning the raw UTF8 code units is
    // enough: every unit of a multi byte sequence is > 0x7F.
    bool needsProcessing = false;
    foreach( char c; xml ){
        if( needsReplacement( c )){
            needsProcessing = true;
            break;
        }
    }
    if( !needsProcessing ){
        return xml;
    }

    // yes, do a new string, start with +20 chars
    char[] res = new char[ xml.length + 20 ];
    res.length = 0;

    foreach( dchar c; xml ){

        if( !needsReplacement( c )){
            res ~= c;
        }
        else{
            res ~= '&';
            switch( c ){
            case '<':  res ~= "lt";   break;
            case '>':  res ~= "gt";   break;
            case '&':  res ~= "amp";  break;
            case '\"': res ~= "quot"; break;
            case '\'': res ~= "apos"; break;
            case '\r': case '\n': case '\u0009':
            default:
                // pick the digit count from the magnitude of the code point
                int digits = 8;
                if( c <= 0xFF ){
                    digits = 2;
                }
                else if( c <= 0xFFFF ){
                    digits = 4;
                }
                else if( c <= 0xFFFFFF ){
                    digits = 6;
                }
                res ~= "#x";
                // emit the nibbles, most significant first
                for( int shift = ( digits - 1 ) * 4; shift >= 0; shift -= 4 ){
                    res ~= toHexDigit(( c >> shift ) & 0x0F );
                }
                break;
            }
            res ~= ';';
        }
    }
    // hand the escaped copy back to the caller
    return res;
}
|
|
181
|