Mercurial > projects > dil
diff src/dil/HtmlEntities.d @ 806:bcb74c9b895c
Moved out files in the trunk folder to the root.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sun, 09 Mar 2008 00:12:19 +0100 |
parents | trunk/src/dil/HtmlEntities.d@3b34f6a95a27 |
children |
line wrap: on
line diff
/++
  Author: Aziz Köksal
  License: GPL3
+/
module dil.HtmlEntities;

import common;

/// Pairs the name of an HTML entity with the codepoint it stands for.
struct Entity
{
  char[] name; /// Entity name as written between '&' and ';'.
  dchar value; /// The Unicode codepoint the name denotes.
}

/// All named HTML entities known to this module.
/// The lookup tables used by entity2Unicode() are generated from this list.
static const Entity[] namedEntities = [
  {"Aacute", '\u00C1'},
  {"aacute", '\u00E1'},
  {"Acirc", '\u00C2'},
  {"acirc", '\u00E2'},
  {"acute", '\u00B4'},
  {"AElig", '\u00C6'},
  {"aelig", '\u00E6'},
  {"Agrave", '\u00C0'},
  {"agrave", '\u00E0'},
  {"alefsym", '\u2135'},
  {"Alpha", '\u0391'},
  {"alpha", '\u03B1'},
  {"amp", '\u0026'},
  {"and", '\u2227'},
  {"ang", '\u2220'},
  {"Aring", '\u00C5'},
  {"aring", '\u00E5'},
  {"asymp", '\u2248'},
  {"Atilde", '\u00C3'},
  {"atilde", '\u00E3'},
  {"Auml", '\u00C4'},
  {"auml", '\u00E4'},
  {"bdquo", '\u201E'},
  {"Beta", '\u0392'},
  {"beta", '\u03B2'},
  {"brvbar", '\u00A6'},
  {"bull", '\u2022'},
  {"cap", '\u2229'},
  {"Ccedil", '\u00C7'},
  {"ccedil", '\u00E7'},
  {"cedil", '\u00B8'},
  {"cent", '\u00A2'},
  {"Chi", '\u03A7'},
  {"chi", '\u03C7'},
  {"circ", '\u02C6'},
  {"clubs", '\u2663'},
  {"cong", '\u2245'},
  {"copy", '\u00A9'},
  {"crarr", '\u21B5'},
  {"cup", '\u222A'},
  {"curren", '\u00A4'},
  {"Dagger", '\u2021'},
  {"dagger", '\u2020'},
  {"dArr", '\u21D3'},
  {"darr", '\u2193'},
  {"deg", '\u00B0'},
  {"Delta", '\u0394'},
  {"delta", '\u03B4'},
  {"diams", '\u2666'},
  {"divide", '\u00F7'},
  {"Eacute", '\u00C9'},
  {"eacute", '\u00E9'},
  {"Ecirc", '\u00CA'},
  {"ecirc", '\u00EA'},
  {"Egrave", '\u00C8'},
  {"egrave", '\u00E8'},
  {"empty", '\u2205'},
  {"emsp", '\u2003'},
  {"ensp", '\u2002'},
  {"Epsilon", '\u0395'},
  {"epsilon", '\u03B5'},
  {"equiv", '\u2261'},
  {"Eta", '\u0397'},
  {"eta", '\u03B7'},
  {"ETH", '\u00D0'},
  {"eth", '\u00F0'},
  {"Euml", '\u00CB'},
  {"euml", '\u00EB'},
  {"euro", '\u20AC'},
  {"exist", '\u2203'},
  {"fnof", '\u0192'},
  {"forall", '\u2200'},
  {"frac12", '\u00BD'},
  {"frac14", '\u00BC'},
  {"frac34", '\u00BE'},
  {"frasl", '\u2044'},
  {"Gamma", '\u0393'},
  {"gamma", '\u03B3'},
  {"ge", '\u2265'},
  {"gt", '\u003E'},
  {"hArr", '\u21D4'},
  {"harr", '\u2194'},
  {"hearts", '\u2665'},
  {"hellip", '\u2026'},
  {"Iacute", '\u00CD'},
  {"iacute", '\u00ED'},
  {"Icirc", '\u00CE'},
  {"icirc", '\u00EE'},
  {"iexcl", '\u00A1'},
  {"Igrave", '\u00CC'},
  {"igrave", '\u00EC'},
  {"image", '\u2111'},
  {"infin", '\u221E'},
  {"int", '\u222B'},
  {"Iota", '\u0399'},
  {"iota", '\u03B9'},
  {"iquest", '\u00BF'},
  {"isin", '\u2208'},
  {"Iuml", '\u00CF'},
  {"iuml", '\u00EF'},
  {"Kappa", '\u039A'},
  {"kappa", '\u03BA'},
  {"Lambda", '\u039B'},
  {"lambda", '\u03BB'},
  {"lang", '\u2329'},
  {"laquo", '\u00AB'},
  {"lArr", '\u21D0'},
  {"larr", '\u2190'},
  {"lceil", '\u2308'},
  {"ldquo", '\u201C'},
  {"le", '\u2264'},
  {"lfloor", '\u230A'},
  {"lowast", '\u2217'},
  {"loz", '\u25CA'},
  {"lrm", '\u200E'},
  {"lsaquo", '\u2039'},
  {"lsquo", '\u2018'},
  {"lt", '\u003C'},
  {"macr", '\u00AF'},
  {"mdash", '\u2014'},
  {"micro", '\u00B5'},
  {"middot", '\u00B7'},
  {"minus", '\u2212'},
  {"Mu", '\u039C'},
  {"mu", '\u03BC'},
  {"nabla", '\u2207'},
  {"nbsp", '\u00A0'},
  {"ndash", '\u2013'},
  {"ne", '\u2260'},
  {"ni", '\u220B'},
  {"not", '\u00AC'},
  {"notin", '\u2209'},
  {"nsub", '\u2284'},
  {"Ntilde", '\u00D1'},
  {"ntilde", '\u00F1'},
  {"Nu", '\u039D'},
  {"nu", '\u03BD'},
  {"Oacute", '\u00D3'},
  {"oacute", '\u00F3'},
  {"Ocirc", '\u00D4'},
  {"ocirc", '\u00F4'},
  {"OElig", '\u0152'},
  {"oelig", '\u0153'},
  {"Ograve", '\u00D2'},
  {"ograve", '\u00F2'},
  {"oline", '\u203E'},
  {"Omega", '\u03A9'},
  {"omega", '\u03C9'},
  {"Omicron", '\u039F'},
  {"omicron", '\u03BF'},
  {"oplus", '\u2295'},
  {"or", '\u2228'},
  {"ordf", '\u00AA'},
  {"ordm", '\u00BA'},
  {"Oslash", '\u00D8'},
  {"oslash", '\u00F8'},
  {"Otilde", '\u00D5'},
  {"otilde", '\u00F5'},
  {"otimes", '\u2297'},
  {"Ouml", '\u00D6'},
  {"ouml", '\u00F6'},
  {"para", '\u00B6'},
  {"part", '\u2202'},
  {"permil", '\u2030'},
  {"perp", '\u22A5'},
  {"Phi", '\u03A6'},
  {"phi", '\u03C6'},
  {"Pi", '\u03A0'},
  {"pi", '\u03C0'},
  {"piv", '\u03D6'},
  {"plusmn", '\u00B1'},
  {"pound", '\u00A3'},
  {"Prime", '\u2033'},
  {"prime", '\u2032'},
  {"prod", '\u220F'},
  {"prop", '\u221D'},
  {"Psi", '\u03A8'},
  {"psi", '\u03C8'},
  {"quot", '\u0022'},
  {"radic", '\u221A'},
  {"rang", '\u232A'},
  {"raquo", '\u00BB'},
  {"rArr", '\u21D2'},
  {"rarr", '\u2192'},
  {"rceil", '\u2309'},
  {"rdquo", '\u201D'},
  {"real", '\u211C'},
  {"reg", '\u00AE'},
  {"rfloor", '\u230B'},
  {"Rho", '\u03A1'},
  {"rho", '\u03C1'},
  {"rlm", '\u200F'},
  {"rsaquo", '\u203A'},
  {"rsquo", '\u2019'},
  {"sbquo", '\u201A'},
  {"Scaron", '\u0160'},
  {"scaron", '\u0161'},
  {"sdot", '\u22C5'},
  {"sect", '\u00A7'},
  {"shy", '\u00AD'},
  {"Sigma", '\u03A3'},
  {"sigma", '\u03C3'},
  {"sigmaf", '\u03C2'},
  {"sim", '\u223C'},
  {"spades", '\u2660'},
  {"sub", '\u2282'},
  {"sube", '\u2286'},
  {"sum", '\u2211'},
  {"sup", '\u2283'},
  {"sup1", '\u00B9'},
  {"sup2", '\u00B2'},
  {"sup3", '\u00B3'},
  {"supe", '\u2287'},
  {"szlig", '\u00DF'},
  {"Tau", '\u03A4'},
  {"tau", '\u03C4'},
  {"there4", '\u2234'},
  {"Theta", '\u0398'},
  {"theta", '\u03B8'},
  {"thetasym", '\u03D1'},
  {"thinsp", '\u2009'},
  {"THORN", '\u00DE'},
  {"thorn", '\u00FE'},
  {"tilde", '\u02DC'},
  {"times", '\u00D7'},
  {"trade", '\u2122'},
  {"Uacute", '\u00DA'},
  {"uacute", '\u00FA'},
  {"uArr", '\u21D1'},
  {"uarr", '\u2191'},
  {"Ucirc", '\u00DB'},
  {"ucirc", '\u00FB'},
  {"Ugrave", '\u00D9'},
  {"ugrave", '\u00F9'},
  {"uml", '\u00A8'},
  {"upsih", '\u03D2'},
  {"Upsilon", '\u03A5'},
  {"upsilon", '\u03C5'},
  {"Uuml", '\u00DC'},
  {"uuml", '\u00FC'},
  {"weierp", '\u2118'},
  {"Xi", '\u039E'},
  {"xi", '\u03BE'},
  {"Yacute", '\u00DD'},
  {"yacute", '\u00FD'},
  {"yen", '\u00A5'},
  {"Yuml", '\u0178'},
  {"yuml", '\u00FF'},
  {"Zeta", '\u0396'},
  {"zeta", '\u03B6'},
  {"zwj", '\u200D'},
  {"zwnj", '\u200C'}
];

/// Computes the hash of a string: starting from 0, each code unit is
/// folded in as hash * 11 + unit, using wrapping uint arithmetic.
uint stringToHash(char[] str)
{
  uint result = 0;
  for (size_t pos = 0; pos < str.length; ++pos)
    result = result * 11 + str[pos];
  return result;
}

/// Returns the decimal representation of x.
char[] toString(uint x)
{
  char[] digits;
  do
  {
    // Peel off the least significant digit and put it in front.
    digits = cast(char)('0' + (x % 10)) ~ digits;
    x /= 10;
  } while (x != 0);
  return digits;
}

/// Produces D source text that declares two parallel arrays:
/// "hashes" (the hashes of all entity names, in ascending order) and
/// "values" (the codepoint belonging to the hash at the same index).
///
/// Asserts that no name hashes to 0 and that no two names share a hash.
/// The result is meant to be mixed into this module.
char[] generateHashAndValueArrays()
{
  uint[] sortedHashes;  // Name hashes, kept in ascending order.
  dchar[] sortedValues; // Codepoints, parallel to sortedHashes.

  // Insert every entity at its sorted position.
  foreach (entity; namedEntities)
  {
    auto hash = stringToHash(entity.name);
    auto value = entity.value;
    assert(hash != 0);

    // Advance to the first element that is greater than the new hash.
    uint pos = 0;
    while (pos < sortedHashes.length)
    {
      assert(hash != sortedHashes[pos], "bad hash function: conflicting hashes");
      if (hash < sortedHashes[pos])
        break;
      ++pos;
    }

    if (pos < sortedHashes.length)
    { // Somewhere in the middle: splice in before pos.
      sortedHashes = sortedHashes[0..pos] ~ hash ~ sortedHashes[pos..$];
      sortedValues = sortedValues[0..pos] ~ value ~ sortedValues[pos..$];
    }
    else
    { // Greater than everything seen so far: append.
      sortedHashes ~= hash;
      sortedValues ~= value;
    }
    assert(sortedHashes[pos] == hash && sortedValues[pos] == value);
  }

  // Render both arrays as array-literal declarations.
  char[] hashesText = "private static const uint[] hashes = [";
  char[] valuesText = "private static const dchar[] values = [";
  for (size_t n = 0; n < sortedHashes.length; ++n)
  {
    hashesText ~= toString(sortedHashes[n]) ~ ",";
    valuesText ~= toString(sortedValues[n]) ~ ",";
  }
  hashesText ~= "];";
  valuesText ~= "];";
  return hashesText ~ "\n" ~ valuesText;
}

version(DDoc)
{
  /// Table of hash values of the entities' names.
  private static const uint[] hashes;
  /// Table of Unicode codepoints.
  private static const dchar[] values;
}
else
  mixin(generateHashAndValueArrays());
// Uncomment to print the generated declarations while compiling:
// pragma(msg, generateHashAndValueArrays());

/// Converts a named HTML entity into its equivalent Unicode codepoint.
/// Returns: the entity's value or 0xFFFF if it doesn't exist.
dchar entity2Unicode(char[] entity)
{
  auto hash = stringToHash(entity);
  // Binary search over the half-open range [lower, upper).
  // An exclusive upper bound is used on purpose: with an inclusive bound,
  // "upper = index - 1" wraps around to size_t.max when index is 0 (i.e. when
  // the hash is smaller than every table entry), and the next probe would
  // index far outside of the table. It also makes an empty table safe.
  // NOTE: a match is decided on the hash alone; the name itself is never
  // compared, so a non-entity string whose hash collides with a table entry
  // yields that entry's codepoint.
  size_t lower = 0;
  size_t upper = hashes.length;
  while (lower < upper)
  {
    size_t index = lower + (upper - lower) / 2;
    if (hash < hashes[index])
      upper = index;     // Continue in the left half, excluding index.
    else if (hash > hashes[index])
      lower = index + 1; // Continue in the right half, excluding index.
    else
      return values[index]; // Return the Unicode codepoint.
  }
  return 0xFFFF; // Return error value.
}

unittest
{
  Stdout("Testing entity2Unicode().").newline;
  alias entity2Unicode f;
  foreach (entity; namedEntities)
    assert(f(entity.name) == entity.value,
      Format("'&{};' == \\u{:X4}, not \\u{:X4}", entity.name, entity.value, cast(uint)f(entity.name))
    );
  // Regression: names hashing below the smallest table entry must yield the
  // error value instead of running the search out of bounds.
  // "" hashes to 0 (never in the table, see generateHashAndValueArrays());
  // "a" hashes to 97, less than the hash of any name with two or more
  // characters that starts with a letter.
  assert(f("") == 0xFFFF);
  assert(f("a") == 0xFFFF);
}