# HG changeset patch # User Aziz K?ksal # Date 1199649980 -3600 # Node ID 0c10255d80090a9dfca095d8adc270cb00f8169c # Parent fac9e8b258fce452759e1ffb8f62338828087472 Wrote custom look-up tables for HTML entities. diff -r fac9e8b258fc -r 0c10255d8009 trunk/src/dil/HtmlEntities.d --- a/trunk/src/dil/HtmlEntities.d Sun Jan 06 15:26:03 2008 +0100 +++ b/trunk/src/dil/HtmlEntities.d Sun Jan 06 21:06:20 2008 +0100 @@ -4,274 +4,368 @@ +/ module dil.HtmlEntities; -private const dchar[char[]] entities_table; +import common; -static this() +struct Entity { - entities_table = [ - "Aacute"[] : '\u00C1', - "aacute" : '\u00E1', - "Acirc" : '\u00C2', - "acirc" : '\u00E2', - "acute" : '\u00B4', - "AElig" : '\u00C6', - "aelig" : '\u00E6', - "Agrave" : '\u00C0', - "agrave" : '\u00E0', - "alefsym" : '\u2135', - "Alpha" : '\u0391', - "alpha" : '\u03B1', - "amp" : '\u0026', - "and" : '\u2227', - "ang" : '\u2220', - "Aring" : '\u00C5', - "aring" : '\u00E5', - "asymp" : '\u2248', - "Atilde" : '\u00C3', - "atilde" : '\u00E3', - "Auml" : '\u00C4', - "auml" : '\u00E4', - "bdquo" : '\u201E', - "Beta" : '\u0392', - "beta" : '\u03B2', - "brvbar" : '\u00A6', - "bull" : '\u2022', - "cap" : '\u2229', - "Ccedil" : '\u00C7', - "ccedil" : '\u00E7', - "cedil" : '\u00B8', - "cent" : '\u00A2', - "Chi" : '\u03A7', - "chi" : '\u03C7', - "circ" : '\u02C6', - "clubs" : '\u2663', - "cong" : '\u2245', - "copy" : '\u00A9', - "crarr" : '\u21B5', - "cup" : '\u222A', - "curren" : '\u00A4', - "Dagger" : '\u2021', - "dagger" : '\u2020', - "dArr" : '\u21D3', - "darr" : '\u2193', - "deg" : '\u00B0', - "Delta" : '\u0394', - "delta" : '\u03B4', - "diams" : '\u2666', - "divide" : '\u00F7', - "Eacute" : '\u00C9', - "eacute" : '\u00E9', - "Ecirc" : '\u00CA', - "ecirc" : '\u00EA', - "Egrave" : '\u00C8', - "egrave" : '\u00E8', - "empty" : '\u2205', - "emsp" : '\u2003', - "ensp" : '\u2002', - "Epsilon" : '\u0395', - "epsilon" : '\u03B5', - "equiv" : '\u2261', - "Eta" : '\u0397', - "eta" : '\u03B7', - "ETH" : '\u00D0', - "eth" : '\u00F0', - "Euml" : '\u00CB', - "euml" : '\u00EB', - "euro" : '\u20AC', - "exist" : '\u2203', - "fnof" : '\u0192', - "forall" : '\u2200', - "frac12" : '\u00BD', - "frac14" : '\u00BC', - "frac34" : '\u00BE', - "frasl" : '\u2044', - "Gamma" : '\u0393', - "gamma" : '\u03B3', - "ge" : '\u2265', - "gt" : '\u003E', - "hArr" : '\u21D4', - "harr" : '\u2194', - "hearts" : '\u2665', - "hellip" : '\u2026', - "Iacute" : '\u00CD', - "iacute" : '\u00ED', - "Icirc" : '\u00CE', - "icirc" : '\u00EE', - "iexcl" : '\u00A1', - "Igrave" : '\u00CC', - "igrave" : '\u00EC', - "image" : '\u2111', - "infin" : '\u221E', - "int" : '\u222B', - "Iota" : '\u0399', - "iota" : '\u03B9', - "iquest" : '\u00BF', - "isin" : '\u2208', - "Iuml" : '\u00CF', - "iuml" : '\u00EF', - "Kappa" : '\u039A', - "kappa" : '\u03BA', - "Lambda" : '\u039B', - "lambda" : '\u03BB', - "lang" : '\u2329', - "laquo" : '\u00AB', - "lArr" : '\u21D0', - "larr" : '\u2190', - "lceil" : '\u2308', - "ldquo" : '\u201C', - "le" : '\u2264', - "lfloor" : '\u230A', - "lowast" : '\u2217', - "loz" : '\u25CA', - "lrm" : '\u200E', - "lsaquo" : '\u2039', - "lsquo" : '\u2018', - "lt" : '\u003C', - "macr" : '\u00AF', - "mdash" : '\u2014', - "micro" : '\u00B5', - "middot" : '\u00B7', - "minus" : '\u2212', - "Mu" : '\u039C', - "mu" : '\u03BC', - "nabla" : '\u2207', - "nbsp" : '\u00A0', - "ndash" : '\u2013', - "ne" : '\u2260', - "ni" : '\u220B', - "not" : '\u00AC', - "notin" : '\u2209', - "nsub" : '\u2284', - "Ntilde" : '\u00D1', - "ntilde" : '\u00F1', - "Nu" : '\u039D', - "nu" : '\u03BD', - "Oacute" : '\u00D3', - "oacute" : '\u00F3', - "Ocirc" : '\u00D4', - "ocirc" : '\u00F4', - "OElig" : '\u0152', - "oelig" : '\u0153', - "Ograve" : '\u00D2', - "ograve" : '\u00F2', - "oline" : '\u203E', - "Omega" : '\u03A9', - "omega" : '\u03C9', - "Omicron" : '\u039F', - "omicron" : '\u03BF', - "oplus" : '\u2295', - "or" : '\u2228', - "ordf" : '\u00AA', - "ordm" : '\u00BA', - "Oslash" : '\u00D8', - "oslash" : '\u00F8', - "Otilde" : '\u00D5', - "otilde" : '\u00F5', - "otimes" : '\u2297', - "Ouml" : '\u00D6', - "ouml" : '\u00F6', - "para" : '\u00B6', - "part" : '\u2202', - "permil" : '\u2030', - "perp" : '\u22A5', - "Phi" : '\u03A6', - "phi" : '\u03C6', - "Pi" : '\u03A0', - "pi" : '\u03C0', - "piv" : '\u03D6', - "plusmn" : '\u00B1', - "pound" : '\u00A3', - "Prime" : '\u2033', - "prime" : '\u2032', - "prod" : '\u220F', - "prop" : '\u221D', - "Psi" : '\u03A8', - "psi" : '\u03C8', - "quot" : '\u0022', - "radic" : '\u221A', - "rang" : '\u232A', - "raquo" : '\u00BB', - "rArr" : '\u21D2', - "rarr" : '\u2192', - "rceil" : '\u2309', - "rdquo" : '\u201D', - "real" : '\u211C', - "reg" : '\u00AE', - "rfloor" : '\u230B', - "Rho" : '\u03A1', - "rho" : '\u03C1', - "rlm" : '\u200F', - "rsaquo" : '\u203A', - "rsquo" : '\u2019', - "sbquo" : '\u201A', - "Scaron" : '\u0160', - "scaron" : '\u0161', - "sdot" : '\u22C5', - "sect" : '\u00A7', - "shy" : '\u00AD', - "Sigma" : '\u03A3', - "sigma" : '\u03C3', - "sigmaf" : '\u03C2', - "sim" : '\u223C', - "spades" : '\u2660', - "sub" : '\u2282', - "sube" : '\u2286', - "sum" : '\u2211', - "sup" : '\u2283', - "sup1" : '\u00B9', - "sup2" : '\u00B2', - "sup3" : '\u00B3', - "supe" : '\u2287', - "szlig" : '\u00DF', - "Tau" : '\u03A4', - "tau" : '\u03C4', - "there4" : '\u2234', - "Theta" : '\u0398', - "theta" : '\u03B8', - "thetasym" : '\u03D1', - "thinsp" : '\u2009', - "THORN" : '\u00DE', - "thorn" : '\u00FE', - "tilde" : '\u02DC', - "times" : '\u00D7', - "trade" : '\u2122', - "Uacute" : '\u00DA', - "uacute" : '\u00FA', - "uArr" : '\u21D1', - "uarr" : '\u2191', - "Ucirc" : '\u00DB', - "ucirc" : '\u00FB', - "Ugrave" : '\u00D9', - "ugrave" : '\u00F9', - "uml" : '\u00A8', - "upsih" : '\u03D2', - "Upsilon" : '\u03A5', - "upsilon" : '\u03C5', - "Uuml" : '\u00DC', - "uuml" : '\u00FC', - "weierp" : '\u2118', - "Xi" : '\u039E', - "xi" : '\u03BE', - "Yacute" : '\u00DD', - "yacute" : '\u00FD', - "yen" : '\u00A5', - "Yuml" : '\u0178', - "yuml" : '\u00FF', - "Zeta" : '\u0396', - "zeta" : '\u03B6', - "zwj" : '\u200D', - "zwnj" : '\u200C' - ]; + char[] name; + uint value; } +static const Entity[] namedEntities = [ + {"Aacute", '\u00C1'}, + {"aacute", '\u00E1'}, + {"Acirc", '\u00C2'}, + {"acirc", '\u00E2'}, + {"acute", '\u00B4'}, + {"AElig", '\u00C6'}, + {"aelig", '\u00E6'}, + {"Agrave", '\u00C0'}, + {"agrave", '\u00E0'}, + {"alefsym", '\u2135'}, + {"Alpha", '\u0391'}, + {"alpha", '\u03B1'}, + {"amp", '\u0026'}, + {"and", '\u2227'}, + {"ang", '\u2220'}, + {"Aring", '\u00C5'}, + {"aring", '\u00E5'}, + {"asymp", '\u2248'}, + {"Atilde", '\u00C3'}, + {"atilde", '\u00E3'}, + {"Auml", '\u00C4'}, + {"auml", '\u00E4'}, + {"bdquo", '\u201E'}, + {"Beta", '\u0392'}, + {"beta", '\u03B2'}, + {"brvbar", '\u00A6'}, + {"bull", '\u2022'}, + {"cap", '\u2229'}, + {"Ccedil", '\u00C7'}, + {"ccedil", '\u00E7'}, + {"cedil", '\u00B8'}, + {"cent", '\u00A2'}, + {"Chi", '\u03A7'}, + {"chi", '\u03C7'}, + {"circ", '\u02C6'}, + {"clubs", '\u2663'}, + {"cong", '\u2245'}, + {"copy", '\u00A9'}, + {"crarr", '\u21B5'}, + {"cup", '\u222A'}, + {"curren", '\u00A4'}, + {"Dagger", '\u2021'}, + {"dagger", '\u2020'}, + {"dArr", '\u21D3'}, + {"darr", '\u2193'}, + {"deg", '\u00B0'}, + {"Delta", '\u0394'}, + {"delta", '\u03B4'}, + {"diams", '\u2666'}, + {"divide", '\u00F7'}, + {"Eacute", '\u00C9'}, + {"eacute", '\u00E9'}, + {"Ecirc", '\u00CA'}, + {"ecirc", '\u00EA'}, + {"Egrave", '\u00C8'}, + {"egrave", '\u00E8'}, + {"empty", '\u2205'}, + {"emsp", '\u2003'}, + {"ensp", '\u2002'}, + {"Epsilon", '\u0395'}, + {"epsilon", '\u03B5'}, + {"equiv", '\u2261'}, + {"Eta", '\u0397'}, + {"eta", '\u03B7'}, + {"ETH", '\u00D0'}, + {"eth", '\u00F0'}, + {"Euml", '\u00CB'}, + {"euml", '\u00EB'}, + {"euro", '\u20AC'}, + {"exist", '\u2203'}, + {"fnof", '\u0192'}, + {"forall", '\u2200'}, + {"frac12", '\u00BD'}, + {"frac14", '\u00BC'}, + {"frac34", '\u00BE'}, + {"frasl", '\u2044'}, + {"Gamma", '\u0393'}, + {"gamma", '\u03B3'}, + {"ge", '\u2265'}, + {"gt", '\u003E'}, + {"hArr", '\u21D4'}, + {"harr", '\u2194'}, + {"hearts", '\u2665'}, + {"hellip", '\u2026'}, + {"Iacute", '\u00CD'}, + {"iacute", '\u00ED'}, + {"Icirc", '\u00CE'}, + {"icirc", '\u00EE'}, + {"iexcl", '\u00A1'}, + {"Igrave", '\u00CC'}, + {"igrave", '\u00EC'}, + {"image", '\u2111'}, + {"infin", '\u221E'}, + {"int", '\u222B'}, + {"Iota", '\u0399'}, + {"iota", '\u03B9'}, + {"iquest", '\u00BF'}, + {"isin", '\u2208'}, + {"Iuml", '\u00CF'}, + {"iuml", '\u00EF'}, + {"Kappa", '\u039A'}, + {"kappa", '\u03BA'}, + {"Lambda", '\u039B'}, + {"lambda", '\u03BB'}, + {"lang", '\u2329'}, + {"laquo", '\u00AB'}, + {"lArr", '\u21D0'}, + {"larr", '\u2190'}, + {"lceil", '\u2308'}, + {"ldquo", '\u201C'}, + {"le", '\u2264'}, + {"lfloor", '\u230A'}, + {"lowast", '\u2217'}, + {"loz", '\u25CA'}, + {"lrm", '\u200E'}, + {"lsaquo", '\u2039'}, + {"lsquo", '\u2018'}, + {"lt", '\u003C'}, + {"macr", '\u00AF'}, + {"mdash", '\u2014'}, + {"micro", '\u00B5'}, + {"middot", '\u00B7'}, + {"minus", '\u2212'}, + {"Mu", '\u039C'}, + {"mu", '\u03BC'}, + {"nabla", '\u2207'}, + {"nbsp", '\u00A0'}, + {"ndash", '\u2013'}, + {"ne", '\u2260'}, + {"ni", '\u220B'}, + {"not", '\u00AC'}, + {"notin", '\u2209'}, + {"nsub", '\u2284'}, + {"Ntilde", '\u00D1'}, + {"ntilde", '\u00F1'}, + {"Nu", '\u039D'}, + {"nu", '\u03BD'}, + {"Oacute", '\u00D3'}, + {"oacute", '\u00F3'}, + {"Ocirc", '\u00D4'}, + {"ocirc", '\u00F4'}, + {"OElig", '\u0152'}, + {"oelig", '\u0153'}, + {"Ograve", '\u00D2'}, + {"ograve", '\u00F2'}, + {"oline", '\u203E'}, + {"Omega", '\u03A9'}, + {"omega", '\u03C9'}, + {"Omicron", '\u039F'}, + {"omicron", '\u03BF'}, + {"oplus", '\u2295'}, + {"or", '\u2228'}, + {"ordf", '\u00AA'}, + {"ordm", '\u00BA'}, + {"Oslash", '\u00D8'}, + {"oslash", '\u00F8'}, + {"Otilde", '\u00D5'}, + {"otilde", '\u00F5'}, + {"otimes", '\u2297'}, + {"Ouml", '\u00D6'}, + {"ouml", '\u00F6'}, + {"para", '\u00B6'}, + {"part", '\u2202'}, + {"permil", '\u2030'}, + {"perp", '\u22A5'}, + {"Phi", '\u03A6'}, + {"phi", '\u03C6'}, + {"Pi", '\u03A0'}, + {"pi", '\u03C0'}, + {"piv", '\u03D6'}, + {"plusmn", '\u00B1'}, + {"pound", '\u00A3'}, + {"Prime", '\u2033'}, + {"prime", '\u2032'}, + {"prod", '\u220F'}, + {"prop", '\u221D'}, + {"Psi", '\u03A8'}, + {"psi", '\u03C8'}, + {"quot", '\u0022'}, + {"radic", '\u221A'}, + {"rang", '\u232A'}, + {"raquo", '\u00BB'}, + {"rArr", '\u21D2'}, + {"rarr", '\u2192'}, + {"rceil", '\u2309'}, + {"rdquo", '\u201D'}, + {"real", '\u211C'}, + {"reg", '\u00AE'}, + {"rfloor", '\u230B'}, + {"Rho", '\u03A1'}, + {"rho", '\u03C1'}, + {"rlm", '\u200F'}, + {"rsaquo", '\u203A'}, + {"rsquo", '\u2019'}, + {"sbquo", '\u201A'}, + {"Scaron", '\u0160'}, + {"scaron", '\u0161'}, + {"sdot", '\u22C5'}, + {"sect", '\u00A7'}, + {"shy", '\u00AD'}, + {"Sigma", '\u03A3'}, + {"sigma", '\u03C3'}, + {"sigmaf", '\u03C2'}, + {"sim", '\u223C'}, + {"spades", '\u2660'}, + {"sub", '\u2282'}, + {"sube", '\u2286'}, + {"sum", '\u2211'}, + {"sup", '\u2283'}, + {"sup1", '\u00B9'}, + {"sup2", '\u00B2'}, + {"sup3", '\u00B3'}, + {"supe", '\u2287'}, + {"szlig", '\u00DF'}, + {"Tau", '\u03A4'}, + {"tau", '\u03C4'}, + {"there4", '\u2234'}, + {"Theta", '\u0398'}, + {"theta", '\u03B8'}, + {"thetasym", '\u03D1'}, + {"thinsp", '\u2009'}, + {"THORN", '\u00DE'}, + {"thorn", '\u00FE'}, + {"tilde", '\u02DC'}, + {"times", '\u00D7'}, + {"trade", '\u2122'}, + {"Uacute", '\u00DA'}, + {"uacute", '\u00FA'}, + {"uArr", '\u21D1'}, + {"uarr", '\u2191'}, + {"Ucirc", '\u00DB'}, + {"ucirc", '\u00FB'}, + {"Ugrave", '\u00D9'}, + {"ugrave", '\u00F9'}, + {"uml", '\u00A8'}, + {"upsih", '\u03D2'}, + {"Upsilon", '\u03A5'}, + {"upsilon", '\u03C5'}, + {"Uuml", '\u00DC'}, + {"uuml", '\u00FC'}, + {"weierp", '\u2118'}, + {"Xi", '\u039E'}, + {"xi", '\u03BE'}, + {"Yacute", '\u00DD'}, + {"yacute", '\u00FD'}, + {"yen", '\u00A5'}, + {"Yuml", '\u0178'}, + {"yuml", '\u00FF'}, + {"Zeta", '\u0396'}, + {"zeta", '\u03B6'}, + {"zwj", '\u200D'}, + {"zwnj", '\u200C'} +]; + +uint stringToHash(char[] str) +{ + uint hash; + foreach(c; str) { + hash *= 11; + hash += c; + } + return hash; +} + +char[] toString(uint x) +{ + char[] str; + do + str = cast(char)('0' + (x % 10)) ~ str; + while (x /= 10) + return str; +} + +char[] generateHashAndValueArrays() +{ + uint[] hashes; // String hashes. + uint[] values; // Unicode codepoints. + // Build arrays: + foreach (entity; namedEntities) + { + auto hash = stringToHash(entity.name); + auto value = entity.value; + assert(hash != 0); + // Find insertion place. + uint i; + for (; i < hashes.length; ++i) + { + assert(hash != hashes[i], "bad hash function: conflicting hashes"); + if (hash < hashes[i]) + break; + } + // Insert hash and value into tables. + if (i == hashes.length) + { + hashes ~= hash; + values ~= value; + } + else + { + hashes = hashes[0..i] ~ hash ~ hashes[i..$]; // Insert before index. + values = values[0..i] ~ value ~ values[i..$]; // Insert before index. + } + assert(hashes[i] == hash && values[i] == value); + } + // Build source text: + char[] hashesText = "private static const uint[] hashes = [", + valuesText = "private static const dchar[] values = ["; + foreach (i, hash; hashes) + { + hashesText ~= toString(hash) ~ ","; + valuesText ~= toString(values[i]) ~ ","; + } + hashesText ~= "];"; + valuesText ~= "];"; + return hashesText ~"\n"~ valuesText; +} + +// Mixin: +// private static const uint[] hashes; +// private static const dchar[] values; +mixin(generateHashAndValueArrays); +// pragma(msg, generateHashAndValueArrays()); + /++ Converts a named HTML entity into its equivalent Unicode codepoint. Returns 0xFFFF if entity doesn't exist. +/ dchar entity2Unicode(char[] entity) { - auto d = entity in entities_table; - if (d) - return *d; - return 0xFFFF; + auto hash = stringToHash(entity); + // Binary search: + size_t lower = void, index = void, upper = void; + lower = 0; + upper = hashes.length -1; + while (lower <= upper) + { + index = (lower + upper) / 2; + if (hash < hashes[index]) + upper = index - 1; + else if (hash > hashes[index]) + lower = index + 1; + else + return values[index]; // Return the Unicode codepoint. + } + return 0xFFFF; // Return error value. } + +unittest +{ + Stdout("Testing entity2Unicode().").newline; + alias entity2Unicode f; + foreach (entity; namedEntities) + assert(f(entity.name) == entity.value, + Format("'&{};' == \\u{:X4}, not \\u{:X4}", entity.name, entity.value, cast(uint)f(entity.name)) + ); +}