comparison trunk/src/dil/Token.d @ 485:ea8c7459f1c4

Changed a lot of things in the Lexer. Newlines are tokenized now, instead of being treated as whitespace. Newline tokens store location info as well, which makes quite a few functions unnecessary. Added a static method getLocation() which returns a Location instance for any given token (a hedged sketch follows below). This will also be very useful for finding the location of AST nodes (through Node.begin), which is needed for reporting parser and semantic errors and for emitting documentation. Removed rescanNewlines(), LocState, getState(), restoreState(), evaluateHashLine() and updateErrorLoc(). Added isUnicodeNewlineChar(), isUnicodeNewline(), isNewline(), isNewlineEnd(), isEndOfLine(), scanNewline(), getLocation() and error_(). Replaced some clunky expressions with isascii(), isNewlineEnd(), isEndOfLine(), isUnicodeNewline() and isUnicodeNewlineChar(). Fix in scanNormalStringLiteral(): scanPostfix() must come before the label Lreturn. Fixed the Lexer unittest. Fix in parseDeclarationDefinitionsBlock(): 'token' should be 'begin'. Added the method isMultiline() to Token and added documentation comments.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Fri, 30 Nov 2007 20:17:29 +0100
parents 9c69615a4876
children bccca748d745
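
For orientation, here is a minimal hedged sketch (in D) of how the getLocation() described in the commit message could exploit the new Newline tokens, which now carry filePath and lineNum. This is not the changeset's actual code: the Location constructor signature, the "main.d" fallback and the lineNum + 1 adjustment are assumptions for illustration only.

  /// Hedged sketch, not the committed implementation.
  static Location getLocation(Token* token)
  {
    // Walk back to the nearest preceding Newline token; it stores
    // the file path and the number of the line it terminates.
    auto t = token.prev;
    while (t !is null && t.type != TOK.Newline)
      t = t.prev;
    if (t is null)
      // Assumption: no preceding newline means the token is on line 1.
      return new Location("main.d", 1); // "main.d" is a hypothetical fallback
    // A token following a newline sits on the next line (assuming
    // lineNum names the line the newline ends, not the one it begins.)
    return new Location(t.filePath, t.lineNum + 1);
  }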
--- trunk/src/dil/Token.d	(parent 484:265c0b655f18)
+++ trunk/src/dil/Token.d	(485:ea8c7459f1c4)
@@ -5,16 +5,10 @@
 module dil.Token;
 import common;
 import tango.stdc.stdlib : malloc, free;
 import tango.core.Exception;
 
-struct Position
-{
-  size_t loc;
-  size_t col;
-}
-
 enum TOK : ushort
 {
   Invalid,
 
   /// Flag for whitespace tokens that must be ignored in the parsing phase.
@@ -22,13 +16,14 @@
   Illegal = 1 | Whitespace,
   Comment = 2 | Whitespace,
   Shebang = 3 | Whitespace,
   HashLine = 4 | Whitespace,
   Filespec = 5 | Whitespace,
-  Empty = 6,
-
-  Identifier = 7,
+  Newline = 6 | Whitespace,
+  Empty = 7,
+
+  Identifier = 8,
   String,
   CharLiteral, WCharLiteral, DCharLiteral,
 
   // Special tokens
   FILE,
@@ -117,30 +112,38 @@
 alias TOK.Abstract KeywordsBegin;
 alias TOK.With KeywordsEnd;
 
 struct Token
 {
-  TOK type;
-  // Position pos;
-
+  TOK type; /// The type of the token.
+  /// Pointers to the next and previous tokens (doubly-linked list.)
   Token* next, prev;
 
   char* ws;    /// Start of whitespace characters before token. Null if no WS.
   char* start; /// Start of token in source text.
   char* end;   /// Points one past the end of token in source text.
 
   union
   {
+    /// For newline tokens.
     struct
     {
-      Token* line_num; // #line number
-      Token* line_filespec; // #line number filespec
+      char[] filePath;
+      uint lineNum;
+      uint lineNum_hline;
     }
+    /// For #line tokens.
     struct
     {
+      Token* tokLineNum;      /// #line number
+      Token* tokLineFilespec; /// #line number filespec
+    }
+    /// For string tokens.
+    struct
+    {
       string str;
-      char pf; /// Postfix 'c', 'w' or 'd'
+      char pf; /// Postfix 'c', 'w', 'd' or 0 for none.
       version(D2)
       Token* tok_str; /// Points to the contents of a token string stored as a
                       /// doubly linked list. The last token is always '}' or
                       /// EOF in case end of source text is "q{" EOF.
     }
@@ -197,27 +200,42 @@
   static string toString(TOK tok)
   {
     return tokToString[tok];
   }
 
+  /++
+    Returns true if this is a token which can have newlines in it.
+    These can be any string literal except for escape string literals,
+    as well as block and nested comments.
+  +/
+  bool isMultiline()
+  {
+    return type == TOK.String && start[0] != '\\' ||
+           type == TOK.Comment && start[1] != '/';
+  }
+
+  /// Returns true if this is a keyword token.
   bool isKeyword()
   {
     return KeywordsBegin <= type && type <= KeywordsEnd;
   }
 
+  /// Returns true if this is a whitespace token.
   bool isWhitespace()
   {
     return !!(type & TOK.Whitespace);
   }
 
+  /// Returns true if this is a special token.
   bool isSpecialToken()
   {
     return *start == '_' && type != TOK.Identifier;
   }
 
   version(D2)
   {
+    /// Returns true if this is a token string literal.
     bool isTokenStringLiteral()
     {
       return type == TOK.String && tok_str !is null;
     }
   }
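
A short hedged usage sketch of the predicates added in this hunk; skipWhitespace() is a hypothetical helper for illustration, not part of the changeset.

  // Hypothetical helper: advance past whitespace tokens, which since
  // this changeset include Newline alongside Illegal, Comment,
  // Shebang, HashLine and Filespec.
  Token* skipWhitespace(Token* t)
  {
    while (t.isWhitespace())
      t = t.next; // terminates at EOF, which is not flagged Whitespace
    return t;
  }

Similarly, isMultiline() lets error reporting know whether a token's text may span several source lines.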
@@ -254,12 +272,12 @@
   }
 
   void destructHashLineToken()
   {
     assert(type == TOK.HashLine);
-    delete line_num;
-    delete line_filespec;
+    delete tokLineNum;
+    delete tokLineFilespec;
   }
 
   version(D2)
   {
     void destructTokenStringLiteral()
@@ -278,18 +296,20 @@
       }
     }
   }
 }
 
-const string[] tokToString = [
+/// A table mapping each TOK to a string.
+private const string[] tokToString = [
   "Invalid",
 
   "Illegal",
   "Comment",
   "#! /shebang/",
   "#line",
   `"filespec"`,
+  "Newline",
   "Empty",
 
   "Identifier",
   "String",
   "CharLiteral", "WCharLiteral", "DCharLiteral",
313 "{", 333 "{",
314 "}", 334 "}",
315 335
316 ".", "..", "...", 336 ".", "..", "...",
317 337
318 "Unordered", 338 "!<>=", // Unordered
319 "UorE", 339 "!<>", // UorE
320 "UorG", 340 "!<=", // UorG
321 "UorGorE", 341 "!<", // UorGorE
322 "UorL", 342 "!>=", // UorL
323 "UorLorE", 343 "!>", // UorLorE
324 "LorEorG", 344 "<>=", // LorEorG
325 "LorG", 345 "<>", // LorG
326 346
327 "=", "==", "!=", "!", 347 "=", "==", "!=", "!",
328 "<=", "<", 348 "<=", "<",
329 ">=", ">", 349 ">=", ">",
330 "<<=", "<<", 350 "<<=", "<<",