comparison trunk/src/dil/HtmlEntities.d @ 609:0c10255d8009

Wrote custom look-up tables for HTML entities.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sun, 06 Jan 2008 21:06:20 +0100
parents f203c5248d0b
children 3b34f6a95a27
comparison
equal deleted inserted replaced
608:fac9e8b258fc 609:0c10255d8009
2 Author: Aziz Köksal 2 Author: Aziz Köksal
3 License: GPL3 3 License: GPL3
4 +/ 4 +/
5 module dil.HtmlEntities; 5 module dil.HtmlEntities;
6 6
7 private const dchar[char[]] entities_table; 7 import common;
8 8
9 static this() 9 struct Entity
10 { 10 {
11 entities_table = [ 11 char[] name;
12 "Aacute"[] : '\u00C1', 12 uint value;
13 "aacute" : '\u00E1', 13 }
14 "Acirc" : '\u00C2', 14
15 "acirc" : '\u00E2', 15 static const Entity[] namedEntities = [
16 "acute" : '\u00B4', 16 {"Aacute", '\u00C1'},
17 "AElig" : '\u00C6', 17 {"aacute", '\u00E1'},
18 "aelig" : '\u00E6', 18 {"Acirc", '\u00C2'},
19 "Agrave" : '\u00C0', 19 {"acirc", '\u00E2'},
20 "agrave" : '\u00E0', 20 {"acute", '\u00B4'},
21 "alefsym" : '\u2135', 21 {"AElig", '\u00C6'},
22 "Alpha" : '\u0391', 22 {"aelig", '\u00E6'},
23 "alpha" : '\u03B1', 23 {"Agrave", '\u00C0'},
24 "amp" : '\u0026', 24 {"agrave", '\u00E0'},
25 "and" : '\u2227', 25 {"alefsym", '\u2135'},
26 "ang" : '\u2220', 26 {"Alpha", '\u0391'},
27 "Aring" : '\u00C5', 27 {"alpha", '\u03B1'},
28 "aring" : '\u00E5', 28 {"amp", '\u0026'},
29 "asymp" : '\u2248', 29 {"and", '\u2227'},
30 "Atilde" : '\u00C3', 30 {"ang", '\u2220'},
31 "atilde" : '\u00E3', 31 {"Aring", '\u00C5'},
32 "Auml" : '\u00C4', 32 {"aring", '\u00E5'},
33 "auml" : '\u00E4', 33 {"asymp", '\u2248'},
34 "bdquo" : '\u201E', 34 {"Atilde", '\u00C3'},
35 "Beta" : '\u0392', 35 {"atilde", '\u00E3'},
36 "beta" : '\u03B2', 36 {"Auml", '\u00C4'},
37 "brvbar" : '\u00A6', 37 {"auml", '\u00E4'},
38 "bull" : '\u2022', 38 {"bdquo", '\u201E'},
39 "cap" : '\u2229', 39 {"Beta", '\u0392'},
40 "Ccedil" : '\u00C7', 40 {"beta", '\u03B2'},
41 "ccedil" : '\u00E7', 41 {"brvbar", '\u00A6'},
42 "cedil" : '\u00B8', 42 {"bull", '\u2022'},
43 "cent" : '\u00A2', 43 {"cap", '\u2229'},
44 "Chi" : '\u03A7', 44 {"Ccedil", '\u00C7'},
45 "chi" : '\u03C7', 45 {"ccedil", '\u00E7'},
46 "circ" : '\u02C6', 46 {"cedil", '\u00B8'},
47 "clubs" : '\u2663', 47 {"cent", '\u00A2'},
48 "cong" : '\u2245', 48 {"Chi", '\u03A7'},
49 "copy" : '\u00A9', 49 {"chi", '\u03C7'},
50 "crarr" : '\u21B5', 50 {"circ", '\u02C6'},
51 "cup" : '\u222A', 51 {"clubs", '\u2663'},
52 "curren" : '\u00A4', 52 {"cong", '\u2245'},
53 "Dagger" : '\u2021', 53 {"copy", '\u00A9'},
54 "dagger" : '\u2020', 54 {"crarr", '\u21B5'},
55 "dArr" : '\u21D3', 55 {"cup", '\u222A'},
56 "darr" : '\u2193', 56 {"curren", '\u00A4'},
57 "deg" : '\u00B0', 57 {"Dagger", '\u2021'},
58 "Delta" : '\u0394', 58 {"dagger", '\u2020'},
59 "delta" : '\u03B4', 59 {"dArr", '\u21D3'},
60 "diams" : '\u2666', 60 {"darr", '\u2193'},
61 "divide" : '\u00F7', 61 {"deg", '\u00B0'},
62 "Eacute" : '\u00C9', 62 {"Delta", '\u0394'},
63 "eacute" : '\u00E9', 63 {"delta", '\u03B4'},
64 "Ecirc" : '\u00CA', 64 {"diams", '\u2666'},
65 "ecirc" : '\u00EA', 65 {"divide", '\u00F7'},
66 "Egrave" : '\u00C8', 66 {"Eacute", '\u00C9'},
67 "egrave" : '\u00E8', 67 {"eacute", '\u00E9'},
68 "empty" : '\u2205', 68 {"Ecirc", '\u00CA'},
69 "emsp" : '\u2003', 69 {"ecirc", '\u00EA'},
70 "ensp" : '\u2002', 70 {"Egrave", '\u00C8'},
71 "Epsilon" : '\u0395', 71 {"egrave", '\u00E8'},
72 "epsilon" : '\u03B5', 72 {"empty", '\u2205'},
73 "equiv" : '\u2261', 73 {"emsp", '\u2003'},
74 "Eta" : '\u0397', 74 {"ensp", '\u2002'},
75 "eta" : '\u03B7', 75 {"Epsilon", '\u0395'},
76 "ETH" : '\u00D0', 76 {"epsilon", '\u03B5'},
77 "eth" : '\u00F0', 77 {"equiv", '\u2261'},
78 "Euml" : '\u00CB', 78 {"Eta", '\u0397'},
79 "euml" : '\u00EB', 79 {"eta", '\u03B7'},
80 "euro" : '\u20AC', 80 {"ETH", '\u00D0'},
81 "exist" : '\u2203', 81 {"eth", '\u00F0'},
82 "fnof" : '\u0192', 82 {"Euml", '\u00CB'},
83 "forall" : '\u2200', 83 {"euml", '\u00EB'},
84 "frac12" : '\u00BD', 84 {"euro", '\u20AC'},
85 "frac14" : '\u00BC', 85 {"exist", '\u2203'},
86 "frac34" : '\u00BE', 86 {"fnof", '\u0192'},
87 "frasl" : '\u2044', 87 {"forall", '\u2200'},
88 "Gamma" : '\u0393', 88 {"frac12", '\u00BD'},
89 "gamma" : '\u03B3', 89 {"frac14", '\u00BC'},
90 "ge" : '\u2265', 90 {"frac34", '\u00BE'},
91 "gt" : '\u003E', 91 {"frasl", '\u2044'},
92 "hArr" : '\u21D4', 92 {"Gamma", '\u0393'},
93 "harr" : '\u2194', 93 {"gamma", '\u03B3'},
94 "hearts" : '\u2665', 94 {"ge", '\u2265'},
95 "hellip" : '\u2026', 95 {"gt", '\u003E'},
96 "Iacute" : '\u00CD', 96 {"hArr", '\u21D4'},
97 "iacute" : '\u00ED', 97 {"harr", '\u2194'},
98 "Icirc" : '\u00CE', 98 {"hearts", '\u2665'},
99 "icirc" : '\u00EE', 99 {"hellip", '\u2026'},
100 "iexcl" : '\u00A1', 100 {"Iacute", '\u00CD'},
101 "Igrave" : '\u00CC', 101 {"iacute", '\u00ED'},
102 "igrave" : '\u00EC', 102 {"Icirc", '\u00CE'},
103 "image" : '\u2111', 103 {"icirc", '\u00EE'},
104 "infin" : '\u221E', 104 {"iexcl", '\u00A1'},
105 "int" : '\u222B', 105 {"Igrave", '\u00CC'},
106 "Iota" : '\u0399', 106 {"igrave", '\u00EC'},
107 "iota" : '\u03B9', 107 {"image", '\u2111'},
108 "iquest" : '\u00BF', 108 {"infin", '\u221E'},
109 "isin" : '\u2208', 109 {"int", '\u222B'},
110 "Iuml" : '\u00CF', 110 {"Iota", '\u0399'},
111 "iuml" : '\u00EF', 111 {"iota", '\u03B9'},
112 "Kappa" : '\u039A', 112 {"iquest", '\u00BF'},
113 "kappa" : '\u03BA', 113 {"isin", '\u2208'},
114 "Lambda" : '\u039B', 114 {"Iuml", '\u00CF'},
115 "lambda" : '\u03BB', 115 {"iuml", '\u00EF'},
116 "lang" : '\u2329', 116 {"Kappa", '\u039A'},
117 "laquo" : '\u00AB', 117 {"kappa", '\u03BA'},
118 "lArr" : '\u21D0', 118 {"Lambda", '\u039B'},
119 "larr" : '\u2190', 119 {"lambda", '\u03BB'},
120 "lceil" : '\u2308', 120 {"lang", '\u2329'},
121 "ldquo" : '\u201C', 121 {"laquo", '\u00AB'},
122 "le" : '\u2264', 122 {"lArr", '\u21D0'},
123 "lfloor" : '\u230A', 123 {"larr", '\u2190'},
124 "lowast" : '\u2217', 124 {"lceil", '\u2308'},
125 "loz" : '\u25CA', 125 {"ldquo", '\u201C'},
126 "lrm" : '\u200E', 126 {"le", '\u2264'},
127 "lsaquo" : '\u2039', 127 {"lfloor", '\u230A'},
128 "lsquo" : '\u2018', 128 {"lowast", '\u2217'},
129 "lt" : '\u003C', 129 {"loz", '\u25CA'},
130 "macr" : '\u00AF', 130 {"lrm", '\u200E'},
131 "mdash" : '\u2014', 131 {"lsaquo", '\u2039'},
132 "micro" : '\u00B5', 132 {"lsquo", '\u2018'},
133 "middot" : '\u00B7', 133 {"lt", '\u003C'},
134 "minus" : '\u2212', 134 {"macr", '\u00AF'},
135 "Mu" : '\u039C', 135 {"mdash", '\u2014'},
136 "mu" : '\u03BC', 136 {"micro", '\u00B5'},
137 "nabla" : '\u2207', 137 {"middot", '\u00B7'},
138 "nbsp" : '\u00A0', 138 {"minus", '\u2212'},
139 "ndash" : '\u2013', 139 {"Mu", '\u039C'},
140 "ne" : '\u2260', 140 {"mu", '\u03BC'},
141 "ni" : '\u220B', 141 {"nabla", '\u2207'},
142 "not" : '\u00AC', 142 {"nbsp", '\u00A0'},
143 "notin" : '\u2209', 143 {"ndash", '\u2013'},
144 "nsub" : '\u2284', 144 {"ne", '\u2260'},
145 "Ntilde" : '\u00D1', 145 {"ni", '\u220B'},
146 "ntilde" : '\u00F1', 146 {"not", '\u00AC'},
147 "Nu" : '\u039D', 147 {"notin", '\u2209'},
148 "nu" : '\u03BD', 148 {"nsub", '\u2284'},
149 "Oacute" : '\u00D3', 149 {"Ntilde", '\u00D1'},
150 "oacute" : '\u00F3', 150 {"ntilde", '\u00F1'},
151 "Ocirc" : '\u00D4', 151 {"Nu", '\u039D'},
152 "ocirc" : '\u00F4', 152 {"nu", '\u03BD'},
153 "OElig" : '\u0152', 153 {"Oacute", '\u00D3'},
154 "oelig" : '\u0153', 154 {"oacute", '\u00F3'},
155 "Ograve" : '\u00D2', 155 {"Ocirc", '\u00D4'},
156 "ograve" : '\u00F2', 156 {"ocirc", '\u00F4'},
157 "oline" : '\u203E', 157 {"OElig", '\u0152'},
158 "Omega" : '\u03A9', 158 {"oelig", '\u0153'},
159 "omega" : '\u03C9', 159 {"Ograve", '\u00D2'},
160 "Omicron" : '\u039F', 160 {"ograve", '\u00F2'},
161 "omicron" : '\u03BF', 161 {"oline", '\u203E'},
162 "oplus" : '\u2295', 162 {"Omega", '\u03A9'},
163 "or" : '\u2228', 163 {"omega", '\u03C9'},
164 "ordf" : '\u00AA', 164 {"Omicron", '\u039F'},
165 "ordm" : '\u00BA', 165 {"omicron", '\u03BF'},
166 "Oslash" : '\u00D8', 166 {"oplus", '\u2295'},
167 "oslash" : '\u00F8', 167 {"or", '\u2228'},
168 "Otilde" : '\u00D5', 168 {"ordf", '\u00AA'},
169 "otilde" : '\u00F5', 169 {"ordm", '\u00BA'},
170 "otimes" : '\u2297', 170 {"Oslash", '\u00D8'},
171 "Ouml" : '\u00D6', 171 {"oslash", '\u00F8'},
172 "ouml" : '\u00F6', 172 {"Otilde", '\u00D5'},
173 "para" : '\u00B6', 173 {"otilde", '\u00F5'},
174 "part" : '\u2202', 174 {"otimes", '\u2297'},
175 "permil" : '\u2030', 175 {"Ouml", '\u00D6'},
176 "perp" : '\u22A5', 176 {"ouml", '\u00F6'},
177 "Phi" : '\u03A6', 177 {"para", '\u00B6'},
178 "phi" : '\u03C6', 178 {"part", '\u2202'},
179 "Pi" : '\u03A0', 179 {"permil", '\u2030'},
180 "pi" : '\u03C0', 180 {"perp", '\u22A5'},
181 "piv" : '\u03D6', 181 {"Phi", '\u03A6'},
182 "plusmn" : '\u00B1', 182 {"phi", '\u03C6'},
183 "pound" : '\u00A3', 183 {"Pi", '\u03A0'},
184 "Prime" : '\u2033', 184 {"pi", '\u03C0'},
185 "prime" : '\u2032', 185 {"piv", '\u03D6'},
186 "prod" : '\u220F', 186 {"plusmn", '\u00B1'},
187 "prop" : '\u221D', 187 {"pound", '\u00A3'},
188 "Psi" : '\u03A8', 188 {"Prime", '\u2033'},
189 "psi" : '\u03C8', 189 {"prime", '\u2032'},
190 "quot" : '\u0022', 190 {"prod", '\u220F'},
191 "radic" : '\u221A', 191 {"prop", '\u221D'},
192 "rang" : '\u232A', 192 {"Psi", '\u03A8'},
193 "raquo" : '\u00BB', 193 {"psi", '\u03C8'},
194 "rArr" : '\u21D2', 194 {"quot", '\u0022'},
195 "rarr" : '\u2192', 195 {"radic", '\u221A'},
196 "rceil" : '\u2309', 196 {"rang", '\u232A'},
197 "rdquo" : '\u201D', 197 {"raquo", '\u00BB'},
198 "real" : '\u211C', 198 {"rArr", '\u21D2'},
199 "reg" : '\u00AE', 199 {"rarr", '\u2192'},
200 "rfloor" : '\u230B', 200 {"rceil", '\u2309'},
201 "Rho" : '\u03A1', 201 {"rdquo", '\u201D'},
202 "rho" : '\u03C1', 202 {"real", '\u211C'},
203 "rlm" : '\u200F', 203 {"reg", '\u00AE'},
204 "rsaquo" : '\u203A', 204 {"rfloor", '\u230B'},
205 "rsquo" : '\u2019', 205 {"Rho", '\u03A1'},
206 "sbquo" : '\u201A', 206 {"rho", '\u03C1'},
207 "Scaron" : '\u0160', 207 {"rlm", '\u200F'},
208 "scaron" : '\u0161', 208 {"rsaquo", '\u203A'},
209 "sdot" : '\u22C5', 209 {"rsquo", '\u2019'},
210 "sect" : '\u00A7', 210 {"sbquo", '\u201A'},
211 "shy" : '\u00AD', 211 {"Scaron", '\u0160'},
212 "Sigma" : '\u03A3', 212 {"scaron", '\u0161'},
213 "sigma" : '\u03C3', 213 {"sdot", '\u22C5'},
214 "sigmaf" : '\u03C2', 214 {"sect", '\u00A7'},
215 "sim" : '\u223C', 215 {"shy", '\u00AD'},
216 "spades" : '\u2660', 216 {"Sigma", '\u03A3'},
217 "sub" : '\u2282', 217 {"sigma", '\u03C3'},
218 "sube" : '\u2286', 218 {"sigmaf", '\u03C2'},
219 "sum" : '\u2211', 219 {"sim", '\u223C'},
220 "sup" : '\u2283', 220 {"spades", '\u2660'},
221 "sup1" : '\u00B9', 221 {"sub", '\u2282'},
222 "sup2" : '\u00B2', 222 {"sube", '\u2286'},
223 "sup3" : '\u00B3', 223 {"sum", '\u2211'},
224 "supe" : '\u2287', 224 {"sup", '\u2283'},
225 "szlig" : '\u00DF', 225 {"sup1", '\u00B9'},
226 "Tau" : '\u03A4', 226 {"sup2", '\u00B2'},
227 "tau" : '\u03C4', 227 {"sup3", '\u00B3'},
228 "there4" : '\u2234', 228 {"supe", '\u2287'},
229 "Theta" : '\u0398', 229 {"szlig", '\u00DF'},
230 "theta" : '\u03B8', 230 {"Tau", '\u03A4'},
231 "thetasym" : '\u03D1', 231 {"tau", '\u03C4'},
232 "thinsp" : '\u2009', 232 {"there4", '\u2234'},
233 "THORN" : '\u00DE', 233 {"Theta", '\u0398'},
234 "thorn" : '\u00FE', 234 {"theta", '\u03B8'},
235 "tilde" : '\u02DC', 235 {"thetasym", '\u03D1'},
236 "times" : '\u00D7', 236 {"thinsp", '\u2009'},
237 "trade" : '\u2122', 237 {"THORN", '\u00DE'},
238 "Uacute" : '\u00DA', 238 {"thorn", '\u00FE'},
239 "uacute" : '\u00FA', 239 {"tilde", '\u02DC'},
240 "uArr" : '\u21D1', 240 {"times", '\u00D7'},
241 "uarr" : '\u2191', 241 {"trade", '\u2122'},
242 "Ucirc" : '\u00DB', 242 {"Uacute", '\u00DA'},
243 "ucirc" : '\u00FB', 243 {"uacute", '\u00FA'},
244 "Ugrave" : '\u00D9', 244 {"uArr", '\u21D1'},
245 "ugrave" : '\u00F9', 245 {"uarr", '\u2191'},
246 "uml" : '\u00A8', 246 {"Ucirc", '\u00DB'},
247 "upsih" : '\u03D2', 247 {"ucirc", '\u00FB'},
248 "Upsilon" : '\u03A5', 248 {"Ugrave", '\u00D9'},
249 "upsilon" : '\u03C5', 249 {"ugrave", '\u00F9'},
250 "Uuml" : '\u00DC', 250 {"uml", '\u00A8'},
251 "uuml" : '\u00FC', 251 {"upsih", '\u03D2'},
252 "weierp" : '\u2118', 252 {"Upsilon", '\u03A5'},
253 "Xi" : '\u039E', 253 {"upsilon", '\u03C5'},
254 "xi" : '\u03BE', 254 {"Uuml", '\u00DC'},
255 "Yacute" : '\u00DD', 255 {"uuml", '\u00FC'},
256 "yacute" : '\u00FD', 256 {"weierp", '\u2118'},
257 "yen" : '\u00A5', 257 {"Xi", '\u039E'},
258 "Yuml" : '\u0178', 258 {"xi", '\u03BE'},
259 "yuml" : '\u00FF', 259 {"Yacute", '\u00DD'},
260 "Zeta" : '\u0396', 260 {"yacute", '\u00FD'},
261 "zeta" : '\u03B6', 261 {"yen", '\u00A5'},
262 "zwj" : '\u200D', 262 {"Yuml", '\u0178'},
263 "zwnj" : '\u200C' 263 {"yuml", '\u00FF'},
264 ]; 264 {"Zeta", '\u0396'},
265 } 265 {"zeta", '\u03B6'},
266 {"zwj", '\u200D'},
267 {"zwnj", '\u200C'}
268 ];
269
/++
  Computes a simple multiplicative hash of a string.

  Starting from 0, the running value is multiplied by 11 and the next
  character's code unit is added, for every character in order.
  Arithmetic is done in uint, so the result wraps around on overflow.
  An empty string hashes to 0.
+/
uint stringToHash(char[] str)
{
  uint result = 0;
  for (size_t pos = 0; pos < str.length; ++pos)
    result = result * 11 + str[pos];
  return result;
}
279
/++
  Converts an unsigned integer into its decimal string representation.

  Digits are produced from least to most significant and prepended to
  the result, so the returned string reads most significant digit first.
  Zero yields "0".
+/
char[] toString(uint x)
{
  char[] digits;
  do
  {
    char digit = cast(char)('0' + (x % 10));
    digits = digit ~ digits;
    x /= 10;
  } while (x != 0);
  return digits;
}
288
/++
  Generates D source text declaring two parallel look-up tables:
  ---
  private static const uint[] hashes = [...];
  private static const dchar[] values = [...];
  ---
  'hashes' holds the stringToHash() value of every name in namedEntities,
  in ascending order; 'values' holds the matching codepoints at the same
  indices. Both are written as decimal integer literals.

  Asserts that no entity name hashes to 0 and that no two entity names
  share a hash.

  Returns: the two declarations, separated by a newline.
+/
char[] generateHashAndValueArrays()
{
  uint[] sortedHashes; // String hashes, kept in ascending order.
  uint[] codepoints;   // Unicode codepoints, parallel to sortedHashes.

  // Phase 1: insert every entity at its sorted position.
  foreach (entity; namedEntities)
  {
    auto hash = stringToHash(entity.name);
    auto value = entity.value;
    assert(hash != 0);

    // Walk forward to the first stored hash greater than the new one.
    uint pos = 0;
    while (pos < sortedHashes.length)
    {
      assert(hash != sortedHashes[pos], "bad hash function: conflicting hashes");
      if (hash < sortedHashes[pos])
        break;
      ++pos;
    }

    if (pos < sortedHashes.length)
    { // Splice the new pair in front of the element at 'pos'.
      sortedHashes = sortedHashes[0..pos] ~ hash ~ sortedHashes[pos..$];
      codepoints = codepoints[0..pos] ~ value ~ codepoints[pos..$];
    }
    else
    { // Larger than everything stored so far: append.
      sortedHashes ~= hash;
      codepoints ~= value;
    }
    assert(sortedHashes[pos] == hash && codepoints[pos] == value);
  }

  // Phase 2: render both arrays as array-literal declarations.
  char[] hashesText = "private static const uint[] hashes = [";
  char[] valuesText = "private static const dchar[] values = [";
  for (size_t k = 0; k < sortedHashes.length; ++k)
  {
    hashesText ~= toString(sortedHashes[k]) ~ ",";
    valuesText ~= toString(codepoints[k]) ~ ",";
  }
  hashesText ~= "];";
  valuesText ~= "];";
  return hashesText ~"\n"~ valuesText;
}
332
// Evaluate generateHashAndValueArrays() and compile the returned text
// into this module. The text declares:
//   private static const uint[] hashes;
//   private static const dchar[] values;
mixin(generateHashAndValueArrays());
// To inspect the generated text, enable:
// pragma(msg, generateHashAndValueArrays());
266 338
/++
  Converts a named HTML entity into its equivalent Unicode codepoint.

  The name is hashed with stringToHash() and the hash is looked up in
  the sorted 'hashes' table with a binary search; the codepoint stored
  at the same index of 'values' is returned.

  Note: only hashes are compared, not the names themselves. An unknown
  name whose hash happens to equal that of a known entity is therefore
  reported as that entity.

  Params:
    entity = the entity name, without the leading '&' and trailing ';'.

  Returns 0xFFFF if entity doesn't exist.
+/
dchar entity2Unicode(char[] entity)
{
  auto hash = stringToHash(entity);
  // Binary search over the half-open range [lower, upper).
  // An inclusive upper bound must not be used here: with unsigned
  // indices 'index - 1' wraps around when index is 0 (i.e. the hash is
  // smaller than every table entry), and 'length - 1' wraps for an
  // empty table, both of which lead to out-of-bounds reads.
  size_t lower = 0;
  size_t upper = hashes.length;
  while (lower < upper)
  {
    // Written this way so the midpoint cannot overflow.
    size_t index = lower + (upper - lower) / 2;
    if (hash < hashes[index])
      upper = index;
    else if (hash > hashes[index])
      lower = index + 1;
    else
      return values[index]; // Return the Unicode codepoint.
  }
  return 0xFFFF; // Return error value.
}
362
// Verifies that every name in namedEntities maps back to its own codepoint.
unittest
{
  Stdout("Testing entity2Unicode().").newline;
  foreach (entity; namedEntities)
  {
    auto actual = entity2Unicode(entity.name);
    assert(actual == entity.value,
      Format("'&{};' == \\u{:X4}, not \\u{:X4}", entity.name, entity.value, cast(uint)actual)
    );
  }
}