Mercurial > projects > dil
changeset 272:0bde32503976
- Added module HtmlEntities. It contains a table for converting HTML entities to Unicode characters.
- The lexer now converts HTML entities to Unicode characters.
- Added UndefinedHTMLEntity to the messages table.
author | aziz |
---|---|
date | Sun, 05 Aug 2007 10:19:00 +0000 |
parents | 68987c1c59b6 |
children | e095ec570c31 |
files | trunk/src/HtmlEntities.d trunk/src/Lexer.d trunk/src/Messages.d trunk/src/Parser.d |
diffstat | 4 files changed, 296 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trunk/src/HtmlEntities.d Sun Aug 05 10:19:00 2007 +0000 @@ -0,0 +1,277 @@ +/++ + Author: Aziz Köksal + License: GPL3 ++/ +module HtmlEntities; + +private const dchar[char[]] entities_table; + +static this() +{ + entities_table = [ + "Aacute"[] : '\u00C1', + "aacute" : '\u00E1', + "Acirc" : '\u00C2', + "acirc" : '\u00E2', + "acute" : '\u00B4', + "AElig" : '\u00C6', + "aelig" : '\u00E6', + "Agrave" : '\u00C0', + "agrave" : '\u00E0', + "alefsym" : '\u2135', + "Alpha" : '\u0391', + "alpha" : '\u03B1', + "amp" : '\u0026', + "and" : '\u2227', + "ang" : '\u2220', + "Aring" : '\u00C5', + "aring" : '\u00E5', + "asymp" : '\u2248', + "Atilde" : '\u00C3', + "atilde" : '\u00E3', + "Auml" : '\u00C4', + "auml" : '\u00E4', + "bdquo" : '\u201E', + "Beta" : '\u0392', + "beta" : '\u03B2', + "brvbar" : '\u00A6', + "bull" : '\u2022', + "cap" : '\u2229', + "Ccedil" : '\u00C7', + "ccedil" : '\u00E7', + "cedil" : '\u00B8', + "cent" : '\u00A2', + "Chi" : '\u03A7', + "chi" : '\u03C7', + "circ" : '\u02C6', + "clubs" : '\u2663', + "cong" : '\u2245', + "copy" : '\u00A9', + "crarr" : '\u21B5', + "cup" : '\u222A', + "curren" : '\u00A4', + "Dagger" : '\u2021', + "dagger" : '\u2020', + "dArr" : '\u21D3', + "darr" : '\u2193', + "deg" : '\u00B0', + "Delta" : '\u0394', + "delta" : '\u03B4', + "diams" : '\u2666', + "divide" : '\u00F7', + "Eacute" : '\u00C9', + "eacute" : '\u00E9', + "Ecirc" : '\u00CA', + "ecirc" : '\u00EA', + "Egrave" : '\u00C8', + "egrave" : '\u00E8', + "empty" : '\u2205', + "emsp" : '\u2003', + "ensp" : '\u2002', + "Epsilon" : '\u0395', + "epsilon" : '\u03B5', + "equiv" : '\u2261', + "Eta" : '\u0397', + "eta" : '\u03B7', + "ETH" : '\u00D0', + "eth" : '\u00F0', + "Euml" : '\u00CB', + "euml" : '\u00EB', + "euro" : '\u20AC', + "exist" : '\u2203', + "fnof" : '\u0192', + "forall" : '\u2200', + "frac12" : '\u00BD', + "frac14" : '\u00BC', + "frac34" : '\u00BE', + "frasl" : '\u2044', + "Gamma" : '\u0393', + "gamma" : '\u03B3', + "ge" : 
'\u2265', + "gt" : '\u003E', + "hArr" : '\u21D4', + "harr" : '\u2194', + "hearts" : '\u2665', + "hellip" : '\u2026', + "Iacute" : '\u00CD', + "iacute" : '\u00ED', + "Icirc" : '\u00CE', + "icirc" : '\u00EE', + "iexcl" : '\u00A1', + "Igrave" : '\u00CC', + "igrave" : '\u00EC', + "image" : '\u2111', + "infin" : '\u221E', + "int" : '\u222B', + "Iota" : '\u0399', + "iota" : '\u03B9', + "iquest" : '\u00BF', + "isin" : '\u2208', + "Iuml" : '\u00CF', + "iuml" : '\u00EF', + "Kappa" : '\u039A', + "kappa" : '\u03BA', + "Lambda" : '\u039B', + "lambda" : '\u03BB', + "lang" : '\u2329', + "laquo" : '\u00AB', + "lArr" : '\u21D0', + "larr" : '\u2190', + "lceil" : '\u2308', + "ldquo" : '\u201C', + "le" : '\u2264', + "lfloor" : '\u230A', + "lowast" : '\u2217', + "loz" : '\u25CA', + "lrm" : '\u200E', + "lsaquo" : '\u2039', + "lsquo" : '\u2018', + "lt" : '\u003C', + "macr" : '\u00AF', + "mdash" : '\u2014', + "micro" : '\u00B5', + "middot" : '\u00B7', + "minus" : '\u2212', + "Mu" : '\u039C', + "mu" : '\u03BC', + "nabla" : '\u2207', + "nbsp" : '\u00A0', + "ndash" : '\u2013', + "ne" : '\u2260', + "ni" : '\u220B', + "not" : '\u00AC', + "notin" : '\u2209', + "nsub" : '\u2284', + "Ntilde" : '\u00D1', + "ntilde" : '\u00F1', + "Nu" : '\u039D', + "nu" : '\u03BD', + "Oacute" : '\u00D3', + "oacute" : '\u00F3', + "Ocirc" : '\u00D4', + "ocirc" : '\u00F4', + "OElig" : '\u0152', + "oelig" : '\u0153', + "Ograve" : '\u00D2', + "ograve" : '\u00F2', + "oline" : '\u203E', + "Omega" : '\u03A9', + "omega" : '\u03C9', + "Omicron" : '\u039F', + "omicron" : '\u03BF', + "oplus" : '\u2295', + "or" : '\u2228', + "ordf" : '\u00AA', + "ordm" : '\u00BA', + "Oslash" : '\u00D8', + "oslash" : '\u00F8', + "Otilde" : '\u00D5', + "otilde" : '\u00F5', + "otimes" : '\u2297', + "Ouml" : '\u00D6', + "ouml" : '\u00F6', + "para" : '\u00B6', + "part" : '\u2202', + "permil" : '\u2030', + "perp" : '\u22A5', + "Phi" : '\u03A6', + "phi" : '\u03C6', + "Pi" : '\u03A0', + "pi" : '\u03C0', + "piv" : '\u03D6', + "plusmn" : '\u00B1', + 
"pound" : '\u00A3', + "Prime" : '\u2033', + "prime" : '\u2032', + "prod" : '\u220F', + "prop" : '\u221D', + "Psi" : '\u03A8', + "psi" : '\u03C8', + "quot" : '\u0022', + "radic" : '\u221A', + "rang" : '\u232A', + "raquo" : '\u00BB', + "rArr" : '\u21D2', + "rarr" : '\u2192', + "rceil" : '\u2309', + "rdquo" : '\u201D', + "real" : '\u211C', + "reg" : '\u00AE', + "rfloor" : '\u230B', + "Rho" : '\u03A1', + "rho" : '\u03C1', + "rlm" : '\u200F', + "rsaquo" : '\u203A', + "rsquo" : '\u2019', + "sbquo" : '\u201A', + "Scaron" : '\u0160', + "scaron" : '\u0161', + "sdot" : '\u22C5', + "sect" : '\u00A7', + "shy" : '\u00AD', + "Sigma" : '\u03A3', + "sigma" : '\u03C3', + "sigmaf" : '\u03C2', + "sim" : '\u223C', + "spades" : '\u2660', + "sub" : '\u2282', + "sube" : '\u2286', + "sum" : '\u2211', + "sup" : '\u2283', + "sup1" : '\u00B9', + "sup2" : '\u00B2', + "sup3" : '\u00B3', + "supe" : '\u2287', + "szlig" : '\u00DF', + "Tau" : '\u03A4', + "tau" : '\u03C4', + "there4" : '\u2234', + "Theta" : '\u0398', + "theta" : '\u03B8', + "thetasym" : '\u03D1', + "thinsp" : '\u2009', + "THORN" : '\u00DE', + "thorn" : '\u00FE', + "tilde" : '\u02DC', + "times" : '\u00D7', + "trade" : '\u2122', + "Uacute" : '\u00DA', + "uacute" : '\u00FA', + "uArr" : '\u21D1', + "uarr" : '\u2191', + "Ucirc" : '\u00DB', + "ucirc" : '\u00FB', + "Ugrave" : '\u00D9', + "ugrave" : '\u00F9', + "uml" : '\u00A8', + "upsih" : '\u03D2', + "Upsilon" : '\u03A5', + "upsilon" : '\u03C5', + "Uuml" : '\u00DC', + "uuml" : '\u00FC', + "weierp" : '\u2118', + "Xi" : '\u039E', + "xi" : '\u03BE', + "Yacute" : '\u00DD', + "yacute" : '\u00FD', + "yen" : '\u00A5', + "Yuml" : '\u0178', + "yuml" : '\u00FF', + "Zeta" : '\u0396', + "zeta" : '\u03B6', + "zwj" : '\u200D', + "zwnj" : '\u200C' + ]; +} + +/++ + Converts a named HTML entity into its equivalent Unicode codepoint. + Returns 0xFFFF if entity doesn't exist. 
++/ +dchar entity2Unicode(char[] entity) +{ + auto d = entity in entities_table; + if (d) + return *d; + return 0xFFFF; +} \ No newline at end of file
--- a/trunk/src/Lexer.d Sat Aug 04 19:49:01 2007 +0000 +++ b/trunk/src/Lexer.d Sun Aug 05 10:19:00 2007 +0000 @@ -8,6 +8,7 @@ import Keywords; import Identifier; import Messages; +import HtmlEntities; import std.stdio; import std.utf; import std.uni; @@ -868,19 +869,22 @@ { if (isalpha(*++p)) { - while (1) + auto begin = p; + while (isalnum(*++p)) + {} + + if (*p == ';') { - if (isalnum(*++p)) - continue; - if (*p == ';') { - // TODO: convert entity to unicode codepoint. - ++p; - break; - } - else { - error(MID.UnterminatedHTMLEntity); - break; - } + c = entity2Unicode(begin[0..p - begin]); + ++p; + if (c == 0xFFFF) + error(MID.UndefinedHTMLEntity, (begin-1)[0..p-(begin-1)]); + break; + } + else + { + error(MID.UnterminatedHTMLEntity); + break; } } else
--- a/trunk/src/Messages.d Sat Aug 04 19:49:01 2007 +0000 +++ b/trunk/src/Messages.d Sun Aug 05 10:19:00 2007 +0000 @@ -34,6 +34,7 @@ UndefinedEscapeSequence, InsufficientHexDigits, // \&[a-zA-Z][a-zA-Z0-9]+; + UndefinedHTMLEntity, UnterminatedHTMLEntity, InvalidBeginHTMLEntity, // integer overflows @@ -83,7 +84,8 @@ "found undefined escape sequence.", "insufficient number of hex digits in escape sequence.", // \&[a-zA-Z][a-zA-Z0-9]+; - "unterminated html entity.", + "undefined HTML entity '{1}'", + "unterminated HTML entity.", "html entities must begin with a letter.", // integer overflows "decimal number overflows sign bit.",
--- a/trunk/src/Parser.d Sat Aug 04 19:49:01 2007 +0000 +++ b/trunk/src/Parser.d Sun Aug 05 10:19:00 2007 +0000 @@ -45,10 +45,7 @@ if (!trying) { writef("\33[32m%s\33[0m", token.type); -try writef("%s", prev[0 .. token.end - prev]); -catch -{writef("\33[30mø\33[0m");} prev = token.end; } } while (token.type == T.Comment) // Skip comments