comparison base/src/java/nonstandard/XmlTranscode.d @ 27:1bf55a6eb092

Renamed java tree to base
author Frank Benoit <benoit@tionex.de>
date Sat, 21 Mar 2009 11:33:57 +0100
parents java/src/java/nonstandard/XmlTranscode.d@9b96950f2c3c
children 9f4c18c268b2
comparison
equal deleted inserted replaced
26:f589fc20a5f9 27:1bf55a6eb092
1 module java.nonstandard.XmlTranscode;
2
3 import java.lang.util;
4 import java.lang.exceptions;
5 import java.lang.Math;
6
/++
 + Decode XML entities into a UTF-8 string.
 + Eg. "&amp;" -> "&", "&#38;" -> "&", "&#x26;" -> "&"
 +
 + Recognised are the five predefined entities (lt, gt, amp, quot, apos)
 + and decimal ("&#NNN;") / hexadecimal ("&#xHHH;") character references.
 +
 + Params:
 +     str = the text to decode. It is never modified.
 + Returns:
 +     str itself when it contains no '&'; otherwise a newly allocated
 +     string in which every entity is replaced by the character it names.
 + Throws:
 +     RuntimeException if an entity is unterminated, empty, unknown,
 +     contains a non-digit, or names a value that is not a valid code point.
 +/
String xmlUnescape( String str ){

    void error(){
        throw new RuntimeException( "xmlUnescape" );
    }

    // Fast path: without any '&' there is nothing to decode.
    if( java.lang.util.indexOf( str, '&' ) is -1 ){
        return str;
    }

    // The input cannot be patched in place, so the decoded text is
    // collected in a separate buffer.
    char[] res;
    String src = str;
    while( src.length ){
        // Copy the plain text in front of the next entity in one piece.
        int amp = java.lang.util.indexOf( src, '&' );
        if( amp is -1 ){
            res ~= src;
            break;
        }
        res ~= src[ 0 .. amp ];
        src = src[ amp +1 .. $ ]; // go past '&'
        if( src.length < 2 ) error();

        // search semi
        // Limit the search to the longest possible entity name. Because of
        // this limit semi is at most 9, so the digit loops below see no more
        // than 8 digits and cannot overflow entityValue.
        int len = Math.min( cast(int)src.length, 10 );
        int semi = java.lang.util.indexOf( src[ 0 .. len ], ';' );
        if( semi < 1 ) error(); // no semi found, or empty name as in "&;"

        String entityName = src[ 0 .. semi ]; // name without semi
        dchar entityValue = 0;
        switch( entityName ){
            case "lt":   entityValue = '<';  break;
            case "gt":   entityValue = '>';  break;
            case "amp":  entityValue = '&';  break;
            case "quot": entityValue = '\"'; break;
            case "apos": entityValue = '\''; break;
            default:
                // Everything else must be a character reference: "#NNN" or "#xHHH"
                if( entityName[0] !is '#' ) error(); // unknown named entity
                String digits = entityName[ 1 .. $ ];
                if( digits.length > 0 && digits[0] is 'x' ){
                    digits = digits[ 1 .. $ ];
                    if( digits.length is 0 ) error(); // "&#x;"
                    foreach( char hex; digits ){
                        entityValue <<= 4;
                        if( hex >= '0' && hex <= '9' ){
                            entityValue |= ( hex - '0' );
                        }
                        else if( hex >= 'a' && hex <= 'f' ){
                            entityValue |= ( hex - 'a' + 10 );
                        }
                        else if( hex >= 'A' && hex <= 'F' ){
                            entityValue |= ( hex - 'A' + 10 );
                        }
                        else{
                            error();
                        }
                    }
                }
                else{
                    if( digits.length is 0 ) error(); // "&#;"
                    foreach( char dec; digits ){
                        if( dec >= '0' && dec <= '9' ){
                            entityValue *= 10;
                            entityValue += ( dec - '0' );
                        }
                        else{
                            error();
                        }
                    }
                }
                // Values beyond the Unicode range and UTF-16 surrogates
                // cannot be encoded as UTF-8.
                if( entityValue > 0x10FFFF ) error();
                if( entityValue >= 0xD800 && entityValue <= 0xDFFF ) error();
                break;
        }
        res ~= String_valueOf( entityValue );
        src = src[ semi +1 .. $ ]; // go past semi
    }
    // res is referenced from nowhere else, so handing it out as String is safe.
    return cast(String) res;
}
87
88
/++
 + Encode a string so that it can be embedded in XML.
 +
 + '<', '>', '&', '"' and '\'' are replaced by their predefined entities
 + (&lt; &gt; &amp; &quot; &apos;). Carriage return, line feed, tab and
 + every character above 0x7F are written as hexadecimal character
 + references ("&#xHH;") with 2, 4, 6 or 8 upper-case hex digits,
 + depending on the magnitude of the code point.
 +
 + First checks if processing is needed.
 + If not, the original string is returned.
 + If processing is needed, a new string is allocated.
 +
 + Params:
 +     xml = the text to encode. It is never modified.
 + Returns:
 +     xml itself when nothing needs escaping, otherwise the escaped copy.
 +/
String xmlEscape( String xml ){
    bool needsReplacement( dchar c ){
        switch( c ){
            case '<':
            case '>':
            case '&':
            case '\"':
            case '\'':
            case '\r':
            case '\n':
            case '\u0009':
                return true;
            default:
                return c > 0x7F;
        }
    }

    // Check if processing is needed. Scanning UTF-8 code units is enough:
    // every unit of a multi-byte sequence is above 0x7F.
    bool dirty = false;
    foreach( char c; xml ){
        if( needsReplacement( c )){
            dirty = true;
            break;
        }
    }
    if( !dirty ){
        return xml;
    }

    // yes, do a new string, start with +20 chars
    char[] res = new char[ xml.length + 20 ];
    res.length = 0;

    String hexDigits = "0123456789ABCDEF";

    foreach( dchar c; xml ){

        if( !needsReplacement( c )){
            res ~= c;
            continue;
        }

        res ~= '&';
        switch( c ){
            case '<':  res ~= "lt";   break;
            case '>':  res ~= "gt";   break;
            case '&':  res ~= "amp";  break;
            case '\"': res ~= "quot"; break;
            case '\'': res ~= "apos"; break;
            default:
                // '\r', '\n', tab and everything non-ASCII:
                // hexadecimal character reference, padded to whole bytes.
                int count = 8;
                if( c <= 0xFF ){
                    count = 2;
                }
                else if( c <= 0xFFFF ){
                    count = 4;
                }
                else if( c <= 0xFFFFFF ){
                    count = 6;
                }
                res ~= "#x";
                for( int shift = ( count - 1 ) * 4; shift >= 0; shift -= 4 ){
                    res ~= hexDigits[ ( c >> shift ) & 0x0F ];
                }
                break;
        }
        res ~= ';';
    }
    // The original fell off the end of the function here without returning
    // the result. res is referenced from nowhere else, so the cast is safe.
    return cast(String) res;
}
179