comparison trunk/src/dil/Lexer.d @ 485:ea8c7459f1c4

Changed a lot of things in the Lexer. Newlines are tokenized now, instead of being treated as whitespace. Newline tokens store location info as well, which makes quite a few functions unnecessary. Added a static method getLocation() which returns a Location instance for any given token. This will also be very useful for finding the location of AST nodes (through Node.begin), which is needed for reporting parser and semantic errors and emitting documentation. Removed rescanNewlines(), LocState, getState(), restoreState(), evaluateHashLine() and updateErrorLoc(). Added isUnicodeNewlineChar(), isUnicodeNewline(), isNewline(), isNewlineEnd(), isEndOfLine(), scanNewline(), getLocation() and error_(). Replaced some clunky expressions with isascii(), isNewlineEnd(), isEndOfLine(), isUnicodeNewline(), isUnicodeNewlineChar().
Fix in scanNormalStringLiteral(): scanPostfix() must be before label Lreturn. Fixed Lexer unittest. Fix in parseDeclarationDefinitionsBlock(): 'token' should be 'begin'. Added method isMultiline() to Token and added documentation comments.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Fri, 30 Nov 2007 20:17:29 +0100
parents 325714d8aa6c
children bccca748d745
comparison
equal deleted inserted replaced
484:265c0b655f18 485:ea8c7459f1c4
18 import std.uni; 18 import std.uni;
19 import common; 19 import common;
20 20
21 const char[3] LS = \u2028; /// Line separator. 21 const char[3] LS = \u2028; /// Line separator.
22 const char[3] PS = \u2029; /// Paragraph separator. 22 const char[3] PS = \u2029; /// Paragraph separator.
23
24 const dchar LSd = 0x2028; 23 const dchar LSd = 0x2028;
25 const dchar PSd = 0x2029; 24 const dchar PSd = 0x2029;
25 static assert(LS[0] == PS[0] && LS[1] == PS[1]);
26 26
27 /// U+FFFD = �. Used to replace invalid Unicode characters. 27 /// U+FFFD = �. Used to replace invalid Unicode characters.
28 const dchar REPLACEMENT_CHAR = '\uFFFD'; 28 const dchar REPLACEMENT_CHAR = '\uFFFD';
29 29
30 const uint _Z_ = 26; /// Control+Z 30 const uint _Z_ = 26; /// Control+Z
31 31
32 class Lexer 32 class Lexer
33 { 33 {
34 Token* head; /// The head of the doubly linked token list. 34 Token* head; /// The head of the doubly linked token list.
35 Token* tail; /// The tail of the linked list. Set in scan(). 35 Token* tail; /// The tail of the linked list. Set in scan().
36 Token* token; /// Points to the current token in the token list. 36 Token* token; /// Points to the current token in the token list.
37 string text; /// The source text. 37 string text; /// The source text.
38 char[] filePath; /// Path to the source file. 38 char[] filePath; /// Path to the source text.
39 char* p; /// Points to the current character in the source text. 39 char* p; /// Points to the current character in the source text.
40 char* end; /// Points one character past the end of the source text. 40 char* end; /// Points one character past the end of the source text.
41 41
42 // Members used for error messages: 42 // Members used for error messages:
43 Information[] errors; 43 Information[] errors;
44 char* lineBegin; /// Always points to the beginning of the current line. 44 /// Always points to the beginning of the current line.
45 uint loc = 1; /// Actual line of code. 45 char* lineBegin;
46 uint loc_hline; /// Line number set by #line. 46 // Token* newline; /// Current newline token.
47 uint lineNum = 1; /// Current, actual source text line number.
48 uint lineNum_hline; /// Line number set by #line.
47 uint inTokenString; /// > 0 if inside q{ } 49 uint inTokenString; /// > 0 if inside q{ }
48 Location errorLoc; 50 char[] errorPath; /// The path displayed in error messages.
49 51
50 Identifier[string] idtable; 52 Identifier[string] idtable;
51 53
52 version(token2LocTable) 54 /++
53 /// Maps every token that starts a new line to a Location. 55 Construct a Lexer object.
54 Location[Token*] token2LocTable; 56 Params:
55 57 text = the UTF-8 source code.
58 filePath = the path to the source code; used for error messages.
59 +/
56 this(string text, string filePath) 60 this(string text, string filePath)
57 { 61 {
58 this.filePath = filePath; 62 this.filePath = this.errorPath = filePath;
59 63
60 this.text = text; 64 this.text = text;
61 if (text.length == 0 || text[$-1] != 0) 65 if (text.length == 0 || text[$-1] != 0)
62 { 66 {
63 this.text.length = this.text.length + 1; 67 this.text.length = this.text.length + 1;
65 } 69 }
66 70
67 this.p = this.text.ptr; 71 this.p = this.text.ptr;
68 this.end = this.p + this.text.length; 72 this.end = this.p + this.text.length;
69 this.lineBegin = this.p; 73 this.lineBegin = this.p;
70 this.errorLoc = new Location(filePath, 1, this.lineBegin, this.lineBegin);
71 loadKeywords(this.idtable); 74 loadKeywords(this.idtable);
72 75
73 this.head = new Token; 76 this.head = new Token;
74 this.head.type = TOK.HEAD; 77 this.head.type = TOK.HEAD;
78 this.head.start = this.head.end = this.p;
75 this.token = this.head; 79 this.token = this.head;
80 // Add a newline as the first token after the head.
81 auto newline = new Token;
82 newline.type = TOK.Newline;
83 newline.start = newline.end = this.p;
84 newline.filePath = this.errorPath;
85 newline.lineNum = 1;
86 newline.lineNum_hline = 0;
87 // Link in.
88 this.token.next = newline;
89 newline.prev = this.token;
90 this.token = newline;
91 // this.newline = newline;
76 scanShebang(); 92 scanShebang();
77 version(token2LocTable)
78 {
79 // Add first token to table.
80 auto firstToken = this.head;
81 peek(firstToken);
82 token2LocTable[firstToken] = new Location(1, null);
83 }
84 } 93 }
85 94
86 ~this() 95 ~this()
87 { 96 {
88 auto token = head.next; 97 auto token = head.next;
93 token = token.next; 102 token = token.next;
94 } 103 }
95 delete tail; 104 delete tail;
96 } 105 }
97 106
107 /++
108 The "shebang" may optionally appear once at the beginning of a file.
109 Regexp: #![^\EndOfLine]*
110 +/
98 void scanShebang() 111 void scanShebang()
99 { 112 {
100 if (*p == '#' && p[1] == '!') 113 if (*p == '#' && p[1] == '!')
101 { 114 {
102 Token* t = new Token; 115 auto t = new Token;
116 t.type = TOK.Shebang;
103 t.start = p; 117 t.start = p;
104 t.type = TOK.Shebang;
105 ++p; 118 ++p;
106 assert(*p == '!'); 119 while (!isEndOfLine(++p))
107 while (1) 120 isascii(*p) || decodeUTF8();
108 { 121 t.end = p;
109 t.end = ++p; 122 this.token.next = t;
110 switch (*p) 123 t.prev = this.token;
111 {
112 case '\r', '\n', 0, _Z_:
113 break;
114 case LS[0]:
115 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
116 break;
117 default:
118 if (*p & 128)
119 decodeUTF8();
120 continue;
121 }
122 break; // Exit loop.
123 }
124 // Reset p. The newline will be scanned as whitespace in scan().
125 p = t.end;
126 this.head.next = t;
127 t.prev = this.head;
128 } 124 }
129 } 125 }
130 126
131 void finalizeSpecialToken(ref Token t) 127 void finalizeSpecialToken(ref Token t)
132 { 128 {
133 assert(t.srcText[0..2] == "__"); 129 assert(t.srcText[0..2] == "__");
134 switch (t.type) 130 switch (t.type)
135 { 131 {
136 case TOK.FILE: 132 case TOK.FILE:
137 t.str = this.errorLoc.filePath; 133 t.str = this.errorPath;
138 break; 134 break;
139 case TOK.LINE: 135 case TOK.LINE:
140 t.uint_ = this.errorLineNum(this.loc); 136 t.uint_ = this.errorLineNumber(this.lineNum);
141 break; 137 break;
142 case TOK.DATE, 138 case TOK.DATE,
143 TOK.TIME, 139 TOK.TIME,
144 TOK.TIMESTAMP: 140 TOK.TIMESTAMP:
145 time_t time_val; 141 time_t time_val;
167 default: 163 default:
168 assert(0); 164 assert(0);
169 } 165 }
170 } 166 }
171 167
172 void setLineBegin(char* p) 168 private void setLineBegin(char* p)
173 { 169 {
174 // Check that we can look behind one character. 170 // Check that we can look behind one character.
175 assert((p-1) >= text.ptr && p < end); 171 assert((p-1) >= text.ptr && p < end);
176 // Check that previous character is a newline. 172 // Check that previous character is a newline.
177 assert(p[-1] == '\n' || p[-1] == '\r' || 173 assert(isNewlineEnd(p - 1));
178 p[-1] == LS[2] || p[-1] == PS[2]);
179 this.lineBegin = p; 174 this.lineBegin = p;
180 } 175 }
181 176
182 private void scanNext(bool rescan)(ref Token* t) 177 private void scanNext(ref Token* t)
183 { 178 {
184 assert(t !is null); 179 assert(t !is null);
185 if (t.next) 180 if (t.next)
186 { 181 {
187 t = t.next; 182 t = t.next;
188 static if (rescan == true) 183 // if (t.type == TOK.Newline)
189 rescanNewlines(*t); 184 // this.newline = t;
190 } 185 }
191 else if (t != this.tail) 186 else if (t != this.tail)
192 { 187 {
193 Token* new_t = new Token; 188 Token* new_t = new Token;
194 scan(*new_t); 189 scan(*new_t);
196 t.next = new_t; 191 t.next = new_t;
197 t = new_t; 192 t = new_t;
198 } 193 }
199 } 194 }
200 195
196 /// Advance t one token forward.
201 void peek(ref Token* t) 197 void peek(ref Token* t)
202 { 198 {
203 scanNext!(false)(t); 199 scanNext(t);
204 } 200 }
205 201
202 /// Advance to the next token in the source text.
206 TOK nextToken() 203 TOK nextToken()
207 { 204 {
208 scanNext!(true)(this.token); 205 scanNext(this.token);
209 return this.token.type; 206 return this.token.type;
210 } 207 }
211 208
212 void rescanNewlines(ref Token t) 209 /// Returns true if d is a Unicode line or paragraph separator.
213 { 210 static bool isUnicodeNewlineChar(dchar d)
214 auto p = t.ws; 211 {
215 auto end = t.start; 212 return d == LSd || d == PSd;
216 213 }
217 if (p !is null) 214
218 { 215 /// Returns true if p points to a line or paragraph separator.
219 assert(end !is null); 216 static bool isUnicodeNewline(char* p)
220 // Scan preceding whitespace for newlines. 217 {
221 do 218 return *p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]);
222 { 219 }
223 switch (*p) 220
224 { 221 /++
225 case '\r': 222 Returns true if p points to the start of a Newline.
226 if (p[1] == '\n') 223 Newline: \n | \r | \r\n | LS | PS
227 ++p; 224 +/
228 case '\n': 225 static bool isNewline(char* p)
229 ++loc; 226 {
230 setLineBegin(p + 1); 227 return *p == '\n' || *p == '\r' || isUnicodeNewline(p);
231 break; 228 }
232 case LS[0]: 229
233 assert(p+2 < end && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])); 230 /// Returns true if p points to the last character of a Newline.
234 ++p; ++p; 231 bool isNewlineEnd(char* p)
235 ++loc; 232 {
236 setLineBegin(p + 1); 233 if (*p == '\n' || *p == '\r')
237 break; 234 return true;
238 default: 235 if (*p == LS[2] || *p == PS[2])
239 assert(isspace(*p)); 236 if ((p-2) >= text.ptr)
240 } 237 if (p[-1] == LS[1] && p[-2] == LS[0])
238 return true;
239 return false;
240 }
241
242 /++
243 Returns true if p points to the first character of an EndOfLine.
244 EndOfLine: Newline | 0 | _Z_
245 +/
246 static bool isEndOfLine(char* p)
247 {
248 return isNewline(p) || *p == 0 || *p == _Z_;
249 }
250
251 /++
252 Scans a Newline and sets p one character past it.
253 Returns '\n' if scanned or 0 otherwise.
254 +/
255 static dchar scanNewline(ref char* p)
256 {
257 switch (*p)
258 {
259 case '\r':
260 if (p[1] == '\n')
241 ++p; 261 ++p;
242 } while (p < end) 262 case '\n':
243 } 263 ++p;
244 264 return '\n';
245 if (t.type == TOK.String && t.start[0] != '\\' || 265 default:
246 t.type == TOK.Comment && t.start[1] != '/') 266 if (isUnicodeNewline(p))
247 { 267 {
248 // String literals and comments are the only tokens that can have 268 ++p; ++p; ++p;
249 // newlines. 269 return '\n';
250 p = t.start; 270 }
251 end = t.end; 271 }
252 assert(p !is null && end !is null); 272 return 0;
253 do 273 }
254 { 274
255 switch (*p) 275 /// Returns a Location for the given token.
256 { 276 static Location getLocation(Token* token)
257 case '\r': 277 {
258 if (p[1] == '\n') 278 auto search_t = token.prev;
259 ++p; 279 // Find previous newline token.
260 case '\n': 280 while (search_t.type != TOK.Newline)
261 ++loc; 281 search_t = search_t.prev;
262 setLineBegin(p + 1); 282 auto filePath = search_t.filePath;
263 break; 283 auto lineNum = search_t.lineNum - search_t.lineNum_hline;
264 case LS[0]: 284 auto lineBegin = search_t.end;
265 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])) 285 // Determine actual line begin and line number.
286 while (1)
287 {
288 search_t = search_t.next;
289 if (search_t == token)
290 break;
291 // Multiline tokens must be rescanned for newlines.
292 if (search_t.isMultiline)
293 {
294 auto p = search_t.start, end = search_t.end;
295 while (p != end)
296 {
297 if (Lexer.scanNewline(p) == '\n')
266 { 298 {
267 ++p; ++p; 299 lineBegin = p;
268 ++loc; 300 ++lineNum;
269 setLineBegin(p + 1);
270 break;
271 } 301 }
272 default: 302 else
273 } 303 ++p;
274 ++p; 304 }
275 } while (p < end) 305 }
276 } 306 }
277 else 307 return new Location(filePath, lineNum, lineBegin, token.start);
278 { 308 }
279 if (t.type == TOK.HashLine) 309
280 evaluateHashLine(t); 310 /++
281 311 This is the old scan method.
282 assert(delegate() { 312 TODO: profile old and new to see which one is faster.
283 p = t.start; 313 +/
284 end = t.end;
285 while (p < end)
286 {
287 if (*p == '\n' || *p == '\r' ||
288 (p+2) < end && *p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
289 return false;
290 ++p;
291 }
292 return true;
293 }() == true, "Token '" ~ t.srcText ~ "' has unexpected newline."
294 );
295 }
296 }
297
298 struct LocState
299 {
300 char[] filePath;
301 uint loc;
302 uint loc_hline;
303 char* lineBegin;
304 }
305
306 LocState getState()
307 {
308 LocState s;
309 s.filePath = this.errorLoc.filePath;
310 s.lineBegin = this.lineBegin;
311 s.loc_hline = this.loc_hline;
312 s.loc = this.loc;
313 return s;
314 }
315
316 void restoreState(LocState s)
317 {
318 if (s.lineBegin == this.lineBegin)
319 return;
320 assert(s.loc != this.loc);
321 this.errorLoc.setFilePath(s.filePath);
322 this.lineBegin = s.lineBegin;
323 this.loc = s.loc;
324 this.loc_hline = s.loc_hline;
325 }
326
327 public void scan_(out Token t) 314 public void scan_(out Token t)
328 in 315 in
329 { 316 {
330 assert(text.ptr <= p && p < end); 317 assert(text.ptr <= p && p < end);
331 } 318 }
335 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.type)); 322 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.type));
336 } 323 }
337 body 324 body
338 { 325 {
339 // Scan whitespace. 326 // Scan whitespace.
340 auto pws = p; 327 if (isspace(*p))
341 auto old_loc = this.loc; 328 {
342 while (1) 329 t.ws = p;
343 { 330 while (isspace(*++p))
331 {}
332 }
333
334 // Scan a token.
335 uint c = *p;
336 {
337 t.start = p;
338 // Newline.
344 switch (*p) 339 switch (*p)
345 { 340 {
346 case '\r': 341 case '\r':
347 if (p[1] == '\n') 342 if (p[1] == '\n')
348 ++p; 343 ++p;
349 case '\n': 344 case '\n':
350 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 345 assert(isNewlineEnd(p));
351 ++p; 346 ++p;
352 ++loc; 347 ++lineNum;
353 setLineBegin(p); 348 setLineBegin(p);
354 continue; 349 // this.newline = &t;
355 case LS[0]: 350 t.type = TOK.Newline;
356 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])) 351 t.filePath = this.errorPath;
352 t.lineNum = lineNum;
353 t.lineNum_hline = lineNum_hline;
354 t.end = p;
355 return;
356 default:
357 if (isUnicodeNewline(p))
357 { 358 {
358 ++p; ++p; 359 ++p; ++p;
359 goto case '\n'; 360 goto case '\n';
360 } 361 }
361 // goto default; 362 }
362 default: 363 // Identifier or string literal.
363 if (!isspace(*p))
364 break;
365 ++p;
366 continue;
367 }
368 break; // Exit loop.
369 }
370
371 if (p != pws)
372 {
373 t.ws = pws;
374 if (old_loc != this.loc)
375 version(token2LocTable)
376 token2LocTable[&t] = new Location(loc, null);
377 }
378
379 // Scan token.
380 uint c = *p;
381 {
382 t.start = p;
383
384 if (isidbeg(c)) 364 if (isidbeg(c))
385 { 365 {
386 if (c == 'r' && p[1] == '"' && ++p) 366 if (c == 'r' && p[1] == '"' && ++p)
387 return scanRawStringLiteral(t); 367 return scanRawStringLiteral(t);
388 if (c == 'x' && p[1] == '"') 368 if (c == 'x' && p[1] == '"')
392 if (c == 'q' && p[1] == '"') 372 if (c == 'q' && p[1] == '"')
393 return scanDelimitedStringLiteral(t); 373 return scanDelimitedStringLiteral(t);
394 if (c == 'q' && p[1] == '{') 374 if (c == 'q' && p[1] == '{')
395 return scanTokenStringLiteral(t); 375 return scanTokenStringLiteral(t);
396 } 376 }
377 // Scan identifier.
397 Lidentifier: 378 Lidentifier:
398 do 379 do
399 { c = *++p; } 380 { c = *++p; }
400 while (isident(c) || c & 128 && isUniAlpha(decodeUTF8())) 381 while (isident(c) || !isascii(c) && isUniAlpha(decodeUTF8()))
401 382
402 t.end = p; 383 t.end = p;
403 384
404 string str = t.srcText; 385 string str = t.srcText;
405 Identifier* id = str in idtable; 386 Identifier* id = str in idtable;
441 case '+': 422 case '+':
442 return scanNestedComment(t); 423 return scanNestedComment(t);
443 case '*': 424 case '*':
444 return scanBlockComment(t); 425 return scanBlockComment(t);
445 case '/': 426 case '/':
446 while (1) 427 while (!isEndOfLine(++p))
447 { 428 isascii(*p) || decodeUTF8();
448 c = *++p;
449 switch (c)
450 {
451 case '\r', '\n', 0, _Z_:
452 break;
453 case LS[0]:
454 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
455 break;
456 default:
457 if (c & 128)
458 decodeUTF8();
459 continue;
460 }
461 break; // Exit loop.
462 }
463 t.type = TOK.Comment; 429 t.type = TOK.Comment;
464 t.end = p; 430 t.end = p;
465 return; 431 return;
466 default: 432 default:
467 t.type = TOK.Div; 433 t.type = TOK.Div;
481 case '\\': 447 case '\\':
482 char[] buffer; 448 char[] buffer;
483 do 449 do
484 { 450 {
485 c = scanEscapeSequence(); 451 c = scanEscapeSequence();
486 if (c < 128) 452 if (isascii(c))
487 buffer ~= c; 453 buffer ~= c;
488 else 454 else
489 encodeUTF8(buffer, c); 455 encodeUTF8(buffer, c);
490 } while (*p == '\\') 456 } while (*p == '\\')
491 buffer ~= 0; 457 buffer ~= 0;
747 tail = &t; 713 tail = &t;
748 assert(t.start == t.end); 714 assert(t.start == t.end);
749 return; 715 return;
750 } 716 }
751 717
752 if (c & 128) 718 if (!isascii(c))
753 { 719 {
754 c = decodeUTF8(); 720 c = decodeUTF8();
755 if (isUniAlpha(c)) 721 if (isUniAlpha(c))
756 goto Lidentifier; 722 goto Lidentifier;
757 } 723 }
816 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.type)); 782 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.type));
817 } 783 }
818 body 784 body
819 { 785 {
820 // Scan whitespace. 786 // Scan whitespace.
821 auto pws = p; 787 if (isspace(*p))
822 auto old_loc = this.loc; 788 {
823 while (1) 789 t.ws = p;
824 { 790 while (isspace(*++p))
825 switch (*p) 791 {}
826 { 792 }
827 case '\r': 793
828 if (p[1] == '\n') 794 // Scan a token.
829 ++p; 795 t.start = p;
830 case '\n': 796 // Newline.
831 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 797 switch (*p)
798 {
799 case '\r':
800 if (p[1] == '\n')
832 ++p; 801 ++p;
833 ++loc; 802 case '\n':
834 setLineBegin(p); 803 assert(isNewlineEnd(p));
835 continue; 804 ++p;
836 case LS[0]: 805 ++lineNum;
837 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])) 806 setLineBegin(p);
838 { 807 // this.newline = &t;
839 ++p; ++p; 808 t.type = TOK.Newline;
840 goto case '\n'; 809 t.filePath = this.errorPath;
841 } 810 t.lineNum = lineNum;
842 // goto default; 811 t.lineNum_hline = lineNum_hline;
843 default: 812 t.end = p;
844 if (!isspace(*p)) 813 return;
845 break; 814 default:
846 ++p; 815 if (isUnicodeNewline(p))
847 continue; 816 {
848 } 817 ++p; ++p;
849 break; // Exit loop. 818 goto case '\n';
850 } 819 }
851 820 }
852 if (p != pws)
853 {
854 t.ws = pws;
855 if (old_loc != this.loc)
856 version(token2LocTable)
857 token2LocTable[&t] = new Location(loc, null);
858 }
859
860 // Scan token.
861 t.start = p;
862 821
863 uint c = *p; 822 uint c = *p;
864 assert(end - p != 0); 823 assert(end - p != 0);
865 switch (end - p) 824 switch (end - p)
866 { 825 {
954 ++p; // Skip / 913 ++p; // Skip /
955 return scanBlockComment(t); 914 return scanBlockComment(t);
956 case toUint!("//"): 915 case toUint!("//"):
957 ++p; // Skip / 916 ++p; // Skip /
958 assert(*p == '/'); 917 assert(*p == '/');
959 while (1) 918 while (!isEndOfLine(++p))
960 { 919 isascii(*p) || decodeUTF8();
961 c = *++p;
962 switch (c)
963 {
964 case '\r', '\n', 0, _Z_:
965 break;
966 case LS[0]:
967 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
968 break;
969 default:
970 if (c & 128)
971 decodeUTF8();
972 continue;
973 }
974 break; // Exit loop.
975 }
976 t.type = TOK.Comment; 920 t.type = TOK.Comment;
977 t.end = p; 921 t.end = p;
978 return; 922 return;
979 case toUint!(">="): 923 case toUint!(">="):
980 t.type = TOK.GreaterEqual; 924 t.type = TOK.GreaterEqual;
1068 case '\\': 1012 case '\\':
1069 char[] buffer; 1013 char[] buffer;
1070 do 1014 do
1071 { 1015 {
1072 c = scanEscapeSequence(); 1016 c = scanEscapeSequence();
1073 if (c < 128) 1017 if (isascii(c))
1074 buffer ~= c; 1018 buffer ~= c;
1075 else 1019 else
1076 encodeUTF8(buffer, c); 1020 encodeUTF8(buffer, c);
1077 } while (*p == '\\') 1021 } while (*p == '\\')
1078 buffer ~= 0; 1022 buffer ~= 0;
1180 if (c == 'q' && p[1] == '"') 1124 if (c == 'q' && p[1] == '"')
1181 return scanDelimitedStringLiteral(t); 1125 return scanDelimitedStringLiteral(t);
1182 if (c == 'q' && p[1] == '{') 1126 if (c == 'q' && p[1] == '{')
1183 return scanTokenStringLiteral(t); 1127 return scanTokenStringLiteral(t);
1184 } 1128 }
1129 // Scan identifier.
1185 Lidentifier: 1130 Lidentifier:
1186 do 1131 do
1187 { c = *++p; } 1132 { c = *++p; }
1188 while (isident(c) || c & 128 && isUniAlpha(decodeUTF8())) 1133 while (isident(c) || !isascii(c) && isUniAlpha(decodeUTF8()))
1189 1134
1190 t.end = p; 1135 t.end = p;
1191 1136
1192 string str = t.srcText; 1137 string str = t.srcText;
1193 Identifier* id = str in idtable; 1138 Identifier* id = str in idtable;
1225 tail = &t; 1170 tail = &t;
1226 assert(t.start == t.end); 1171 assert(t.start == t.end);
1227 return; 1172 return;
1228 } 1173 }
1229 1174
1230 if (c & 128) 1175 if (!isascii(c))
1231 { 1176 {
1232 c = decodeUTF8(); 1177 c = decodeUTF8();
1233 if (isUniAlpha(c)) 1178 if (isUniAlpha(c))
1234 goto Lidentifier; 1179 goto Lidentifier;
1235 } 1180 }
1244 } 1189 }
1245 1190
1246 void scanBlockComment(ref Token t) 1191 void scanBlockComment(ref Token t)
1247 { 1192 {
1248 assert(p[-1] == '/' && *p == '*'); 1193 assert(p[-1] == '/' && *p == '*');
1249 auto tokenLineNum = loc; 1194 auto tokenLineNum = lineNum;
1250 auto tokenLineBegin = lineBegin; 1195 auto tokenLineBegin = lineBegin;
1251 uint c; 1196 uint c;
1252 while (1) 1197 while (1)
1253 { 1198 {
1254 c = *++p; 1199 c = *++p;
1257 { 1202 {
1258 case '\r': 1203 case '\r':
1259 if (p[1] == '\n') 1204 if (p[1] == '\n')
1260 ++p; 1205 ++p;
1261 case '\n': 1206 case '\n':
1262 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 1207 assert(isNewlineEnd(p));
1263 ++loc; 1208 ++lineNum;
1264 setLineBegin(p+1); 1209 setLineBegin(p+1);
1265 continue; 1210 continue;
1266 case 0, _Z_: 1211 case 0, _Z_:
1267 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedBlockComment); 1212 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedBlockComment);
1268 goto LreturnBC; 1213 goto LreturnBC;
1269 default: 1214 default:
1270 if (c & 128) 1215 if (!isascii(c))
1271 { 1216 {
1272 c = decodeUTF8(); 1217 c = decodeUTF8();
1273 if (c == LSd || c == PSd) 1218 if (isUnicodeNewlineChar(c))
1274 goto case '\n'; 1219 goto case '\n';
1275 continue; 1220 continue;
1276 } 1221 }
1277 } 1222 }
1278 1223
1295 } 1240 }
1296 1241
1297 void scanNestedComment(ref Token t) 1242 void scanNestedComment(ref Token t)
1298 { 1243 {
1299 assert(p[-1] == '/' && *p == '+'); 1244 assert(p[-1] == '/' && *p == '+');
1300 auto tokenLineNum = loc; 1245 auto tokenLineNum = lineNum;
1301 auto tokenLineBegin = lineBegin; 1246 auto tokenLineBegin = lineBegin;
1302 uint level = 1; 1247 uint level = 1;
1303 uint c; 1248 uint c;
1304 while (1) 1249 while (1)
1305 { 1250 {
1309 { 1254 {
1310 case '\r': 1255 case '\r':
1311 if (p[1] == '\n') 1256 if (p[1] == '\n')
1312 ++p; 1257 ++p;
1313 case '\n': 1258 case '\n':
1314 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 1259 assert(isNewlineEnd(p));
1315 ++loc; 1260 ++lineNum;
1316 setLineBegin(p+1); 1261 setLineBegin(p+1);
1317 continue; 1262 continue;
1318 case 0, _Z_: 1263 case 0, _Z_:
1319 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedNestedComment); 1264 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedNestedComment);
1320 goto LreturnNC; 1265 goto LreturnNC;
1321 default: 1266 default:
1322 if (c & 128) 1267 if (!isascii(c))
1323 { 1268 {
1324 c = decodeUTF8(); 1269 c = decodeUTF8();
1325 if (c == LSd || c == PSd) 1270 if (isUnicodeNewlineChar(c))
1326 goto case '\n'; 1271 goto case '\n';
1327 continue; 1272 continue;
1328 } 1273 }
1329 } 1274 }
1330 1275
1351 } 1296 }
1352 } 1297 }
1353 assert(0); 1298 assert(0);
1354 } 1299 }
1355 1300
1301 char scanPostfix()
1302 {
1303 assert(p[-1] == '"' || p[-1] == '`' ||
1304 { version(D2) return p[-1] == '}';
1305 else return 0; }()
1306 );
1307 switch (*p)
1308 {
1309 case 'c':
1310 case 'w':
1311 case 'd':
1312 return *p++;
1313 default:
1314 return 0;
1315 }
1316 assert(0);
1317 }
1318
1356 void scanNormalStringLiteral(ref Token t) 1319 void scanNormalStringLiteral(ref Token t)
1357 { 1320 {
1358 assert(*p == '"'); 1321 assert(*p == '"');
1359 auto tokenLineNum = loc; 1322 auto tokenLineNum = lineNum;
1360 auto tokenLineBegin = lineBegin; 1323 auto tokenLineBegin = lineBegin;
1324 t.type = TOK.String;
1361 char[] buffer; 1325 char[] buffer;
1362 t.type = TOK.String;
1363 uint c; 1326 uint c;
1364 while (1) 1327 while (1)
1365 { 1328 {
1366 c = *++p; 1329 c = *++p;
1367 switch (c) 1330 switch (c)
1368 { 1331 {
1369 case '"': 1332 case '"':
1370 ++p; 1333 ++p;
1334 t.pf = scanPostfix();
1371 Lreturn: 1335 Lreturn:
1372 buffer ~= 0; 1336 t.str = buffer ~ '\0';
1373 t.str = buffer;
1374 t.pf = scanPostfix();
1375 t.end = p; 1337 t.end = p;
1376 return; 1338 return;
1377 case '\\': 1339 case '\\':
1378 c = scanEscapeSequence(); 1340 c = scanEscapeSequence();
1379 --p; 1341 --p;
1380 if (c < 128) 1342 if (isascii(c))
1381 break; 1343 break;
1382 encodeUTF8(buffer, c); 1344 encodeUTF8(buffer, c);
1383 continue; 1345 continue;
1384 case '\r': 1346 case '\r':
1385 if (p[1] == '\n') 1347 if (p[1] == '\n')
1386 ++p; 1348 ++p;
1387 case '\n': 1349 case '\n':
1388 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 1350 assert(isNewlineEnd(p));
1389 ++loc; 1351 c = '\n'; // Convert Newline to \n.
1390 c = '\n'; // Convert EndOfLine to \n. 1352 ++lineNum;
1391 setLineBegin(p+1); 1353 setLineBegin(p+1);
1392 break; 1354 break;
1393 case 0, _Z_: 1355 case 0, _Z_:
1394 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedString); 1356 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedString);
1395 goto Lreturn; 1357 goto Lreturn;
1396 default: 1358 default:
1397 if (c & 128) 1359 if (!isascii(c))
1398 { 1360 {
1399 c = decodeUTF8(); 1361 c = decodeUTF8();
1400 if (c == LSd || c == PSd) 1362 if (isUnicodeNewlineChar(c))
1401 goto case '\n'; 1363 goto case '\n';
1402
1403 encodeUTF8(buffer, c); 1364 encodeUTF8(buffer, c);
1404 continue; 1365 continue;
1405 } 1366 }
1406 } 1367 }
1407 assert(isascii(c)); 1368 assert(isascii(c));
1411 } 1372 }
1412 1373
1413 void scanCharacterLiteral(ref Token t) 1374 void scanCharacterLiteral(ref Token t)
1414 { 1375 {
1415 assert(*p == '\''); 1376 assert(*p == '\'');
1416 MID id = MID.UnterminatedCharacterLiteral;
1417 ++p; 1377 ++p;
1418 TOK type = TOK.CharLiteral; 1378 t.type = TOK.CharLiteral;
1419 switch (*p) 1379 switch (*p)
1420 { 1380 {
1421 case '\\': 1381 case '\\':
1422 switch (p[1]) 1382 switch (p[1])
1423 { 1383 {
1424 case 'u': 1384 case 'u':
1425 type = TOK.WCharLiteral; break; 1385 t.type = TOK.WCharLiteral; break;
1426 case 'U': 1386 case 'U':
1427 type = TOK.DCharLiteral; break; 1387 t.type = TOK.DCharLiteral; break;
1428 default: 1388 default:
1429 } 1389 }
1430 t.dchar_ = scanEscapeSequence(); 1390 t.dchar_ = scanEscapeSequence();
1431 break; 1391 break;
1432 case '\'': 1392 case '\'':
1433 ++p; 1393 error(t.start, MID.EmptyCharacterLiteral);
1434 id = MID.EmptyCharacterLiteral; 1394 break;
1435 // fall through
1436 case '\n', '\r', 0, _Z_:
1437 goto Lerr;
1438 case LS[0]:
1439 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
1440 goto Lerr;
1441 // fall through
1442 default: 1395 default:
1396 if (isEndOfLine(p))
1397 break;
1443 uint c = *p; 1398 uint c = *p;
1444 if (c & 128) 1399 if (!isascii(c))
1445 { 1400 {
1446 c = decodeUTF8(); 1401 c = decodeUTF8();
1447 if (c <= 0xFFFF) 1402 t.type = c <= 0xFFFF ? TOK.WCharLiteral : TOK.DCharLiteral;
1448 type = TOK.WCharLiteral;
1449 else
1450 type = TOK.DCharLiteral;
1451 } 1403 }
1452 t.dchar_ = c; 1404 t.dchar_ = c;
1453 ++p; 1405 ++p;
1454 } 1406 }
1455 1407
1456 if (*p == '\'') 1408 if (*p == '\'')
1457 ++p; 1409 ++p;
1458 else 1410 else
1459 Lerr: 1411 error(t.start, MID.UnterminatedCharacterLiteral);
1460 error(t.start, id);
1461 t.type = type;
1462 t.end = p; 1412 t.end = p;
1463 } 1413 }
1464 1414
1465 char scanPostfix()
1466 {
1467 switch (*p)
1468 {
1469 case 'c':
1470 case 'w':
1471 case 'd':
1472 return *p++;
1473 default:
1474 return 0;
1475 }
1476 assert(0);
1477 }
1478
1479 void scanRawStringLiteral(ref Token t) 1415 void scanRawStringLiteral(ref Token t)
1480 { 1416 {
1481 auto tokenLineNum = loc; 1417 assert(*p == '`' || *p == '"' && p[-1] == 'r');
1418 auto tokenLineNum = lineNum;
1482 auto tokenLineBegin = lineBegin; 1419 auto tokenLineBegin = lineBegin;
1420 t.type = TOK.String;
1483 uint delim = *p; 1421 uint delim = *p;
1484 assert(delim == '`' || delim == '"' && p[-1] == 'r');
1485 t.type = TOK.String;
1486 char[] buffer; 1422 char[] buffer;
1487 uint c; 1423 uint c;
1488 while (1) 1424 while (1)
1489 { 1425 {
1490 c = *++p; 1426 c = *++p;
1492 { 1428 {
1493 case '\r': 1429 case '\r':
1494 if (p[1] == '\n') 1430 if (p[1] == '\n')
1495 ++p; 1431 ++p;
1496 case '\n': 1432 case '\n':
1497 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 1433 assert(isNewlineEnd(p));
1498 c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n' 1434 c = '\n'; // Convert Newline to '\n'.
1499 ++loc; 1435 ++lineNum;
1500 setLineBegin(p+1); 1436 setLineBegin(p+1);
1501 break; 1437 break;
1502 case '`': 1438 case '`':
1503 case '"': 1439 case '"':
1504 if (c == delim) 1440 if (c == delim)
1510 t.end = p; 1446 t.end = p;
1511 return; 1447 return;
1512 } 1448 }
1513 break; 1449 break;
1514 case 0, _Z_: 1450 case 0, _Z_:
1515 if (delim == 'r') 1451 error(tokenLineNum, tokenLineBegin, t.start,
1516 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedRawString); 1452 delim == 'r' ? MID.UnterminatedRawString : MID.UnterminatedBackQuoteString);
1517 else
1518 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedBackQuoteString);
1519 goto Lreturn; 1453 goto Lreturn;
1520 default: 1454 default:
1521 if (c & 128) 1455 if (!isascii(c))
1522 { 1456 {
1523 c = decodeUTF8(); 1457 c = decodeUTF8();
1524 if (c == LSd || c == PSd) 1458 if (isUnicodeNewlineChar(c))
1525 goto case '\n'; 1459 goto case '\n';
1526 encodeUTF8(buffer, c); 1460 encodeUTF8(buffer, c);
1527 continue; 1461 continue;
1528 } 1462 }
1529 } 1463 }
1536 void scanHexStringLiteral(ref Token t) 1470 void scanHexStringLiteral(ref Token t)
1537 { 1471 {
1538 assert(p[0] == 'x' && p[1] == '"'); 1472 assert(p[0] == 'x' && p[1] == '"');
1539 t.type = TOK.String; 1473 t.type = TOK.String;
1540 1474
1541 auto tokenLineNum = loc; 1475 auto tokenLineNum = lineNum;
1542 auto tokenLineBegin = lineBegin; 1476 auto tokenLineBegin = lineBegin;
1543 1477
1544 uint c; 1478 uint c;
1545 ubyte[] buffer; 1479 ubyte[] buffer;
1546 ubyte h; // hex number 1480 ubyte h; // hex number
1552 { 1486 {
1553 c = *++p; 1487 c = *++p;
1554 switch (c) 1488 switch (c)
1555 { 1489 {
1556 case '"': 1490 case '"':
1557 ++p;
1558 if (n & 1) 1491 if (n & 1)
1559 error(tokenLineNum, tokenLineBegin, t.start, MID.OddNumberOfDigitsInHexString); 1492 error(tokenLineNum, tokenLineBegin, t.start, MID.OddNumberOfDigitsInHexString);
1493 ++p;
1560 t.pf = scanPostfix(); 1494 t.pf = scanPostfix();
1561 Lreturn: 1495 Lreturn:
1562 buffer ~= 0; 1496 t.str = cast(string) (buffer ~= 0);
1563 t.str = cast(string) buffer;
1564 t.end = p; 1497 t.end = p;
1565 return; 1498 return;
1566 case '\r': 1499 case '\r':
1567 if (p[1] == '\n') 1500 if (p[1] == '\n')
1568 ++p; 1501 ++p;
1569 case '\n': 1502 case '\n':
1570 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 1503 assert(isNewlineEnd(p));
1571 ++loc; 1504 ++lineNum;
1572 setLineBegin(p+1); 1505 setLineBegin(p+1);
1573 continue; 1506 continue;
1574 default: 1507 default:
1575 if (ishexad(c)) 1508 if (ishexad(c))
1576 { 1509 {
1592 ++n; 1525 ++n;
1593 continue; 1526 continue;
1594 } 1527 }
1595 else if (isspace(c)) 1528 else if (isspace(c))
1596 continue; // Skip spaces. 1529 continue; // Skip spaces.
1597 else if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
1598 {
1599 ++p; ++p;
1600 goto case '\n';
1601 }
1602 else if (c == 0 || c == _Z_) 1530 else if (c == 0 || c == _Z_)
1603 { 1531 {
1604 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedHexString); 1532 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedHexString);
1605 t.pf = 0; 1533 t.pf = 0;
1606 goto Lreturn; 1534 goto Lreturn;
1607 } 1535 }
1608 else 1536 else
1609 { 1537 {
1610 auto errorAt = p; 1538 auto errorAt = p;
1611 if (c & 128) 1539 if (!isascii(c))
1540 {
1612 c = decodeUTF8(); 1541 c = decodeUTF8();
1542 if (isUnicodeNewlineChar(c))
1543 goto case '\n';
1544 }
1613 error(errorAt, MID.NonHexCharInHexString, cast(dchar)c); 1545 error(errorAt, MID.NonHexCharInHexString, cast(dchar)c);
1614 } 1546 }
1615 } 1547 }
1616 } 1548 }
1617 assert(0); 1549 assert(0);
1622 void scanDelimitedStringLiteral(ref Token t) 1554 void scanDelimitedStringLiteral(ref Token t)
1623 { 1555 {
1624 assert(p[0] == 'q' && p[1] == '"'); 1556 assert(p[0] == 'q' && p[1] == '"');
1625 t.type = TOK.String; 1557 t.type = TOK.String;
1626 1558
1627 auto tokenLineNum = loc; 1559 auto tokenLineNum = lineNum;
1628 auto tokenLineBegin = lineBegin; 1560 auto tokenLineBegin = lineBegin;
1629 1561
1630 char[] buffer; 1562 char[] buffer;
1631 dchar opening_delim = 0, // 0 if no nested delimiter or '[', '(', '<', '{' 1563 dchar opening_delim = 0, // 0 if no nested delimiter or '[', '(', '<', '{'
1632 closing_delim; // Will be ']', ')', '>', '}, 1564 closing_delim; // Will be ']', ')', '>', '},
1654 { 1586 {
1655 case '\r': 1587 case '\r':
1656 if (p[1] == '\n') 1588 if (p[1] == '\n')
1657 ++p; 1589 ++p;
1658 case '\n': 1590 case '\n':
1659 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 1591 assert(isNewlineEnd(p));
1660 ++p; 1592 ++p;
1661 ++loc; 1593 ++lineNum;
1662 setLineBegin(p); 1594 setLineBegin(p);
1663 return '\n'; 1595 return '\n';
1664 case LS[0]: 1596 default:
1665 if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])) 1597 if (isUnicodeNewline(p))
1666 { 1598 {
1667 ++p; ++p; 1599 ++p; ++p;
1668 goto case '\n'; 1600 goto case '\n';
1669 } 1601 }
1670 default:
1671 } 1602 }
1672 return 0; 1603 return 0;
1673 } 1604 }
1674
1675 // Skip leading newlines: 1605 // Skip leading newlines:
1676 while (scanNewline() != 0){} 1606 while (scanNewline() != 0)
1677 assert(*p != '\n' && *p != '\r' && 1607 {}
1678 !(*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))); 1608 assert(!isNewline(p));
1679 1609
1680 char* begin = p; 1610 char* begin = p;
1681 c = *p; 1611 c = *p;
1682 closing_delim = c; 1612 closing_delim = c;
1683 // TODO: Check for non-printable characters? 1613 // TODO: Check for non-printable characters?
1684 if (c & 128) 1614 if (!isascii(c))
1685 { 1615 {
1686 closing_delim = decodeUTF8(); 1616 closing_delim = decodeUTF8();
1687 if (!isUniAlpha(closing_delim)) 1617 if (!isUniAlpha(closing_delim))
1688 break; // Not an identifier. 1618 break; // Not an identifier.
1689 } 1619 }
1691 break; // Not an identifier. 1621 break; // Not an identifier.
1692 1622
1693 // Parse Identifier + EndOfLine 1623 // Parse Identifier + EndOfLine
1694 do 1624 do
1695 { c = *++p; } 1625 { c = *++p; }
1696 while (isident(c) || c & 128 && isUniAlpha(decodeUTF8())) 1626 while (isident(c) || !isascii(c) && isUniAlpha(decodeUTF8()))
1697 // Store identifier 1627 // Store identifier
1698 str_delim = begin[0..p-begin]; 1628 str_delim = begin[0..p-begin];
1699 // Scan newline 1629 // Scan newline
1700 if (scanNewline() == '\n') 1630 if (scanNewline() == '\n')
1701 --p; // Go back one because of "c = *++p;" in main loop. 1631 --p; // Go back one because of "c = *++p;" in main loop.
1722 { 1652 {
1723 case '\r': 1653 case '\r':
1724 if (p[1] == '\n') 1654 if (p[1] == '\n')
1725 ++p; 1655 ++p;
1726 case '\n': 1656 case '\n':
1727 assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]); 1657 assert(isNewlineEnd(p));
1728 c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n' 1658 c = '\n'; // Convert Newline to '\n'.
1729 ++loc; 1659 ++lineNum;
1730 setLineBegin(p+1); 1660 setLineBegin(p+1);
1731 break; 1661 break;
1732 case 0, _Z_: 1662 case 0, _Z_:
1733 // TODO: error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedDelimitedString); 1663 // TODO: error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedDelimitedString);
1734 goto Lreturn3; 1664 goto Lreturn3;
1735 default: 1665 default:
1736 if (c & 128) 1666 if (!isascii(c))
1737 { 1667 {
1738 auto begin = p; 1668 auto begin = p;
1739 c = decodeUTF8(); 1669 c = decodeUTF8();
1740 if (c == LSd || c == PSd) 1670 if (isUnicodeNewlineChar(c))
1741 goto case '\n'; 1671 goto case '\n';
1742 if (c == closing_delim) 1672 if (c == closing_delim)
1743 { 1673 {
1744 if (str_delim.length) 1674 if (str_delim.length)
1745 { 1675 {
1802 void scanTokenStringLiteral(ref Token t) 1732 void scanTokenStringLiteral(ref Token t)
1803 { 1733 {
1804 assert(p[0] == 'q' && p[1] == '{'); 1734 assert(p[0] == 'q' && p[1] == '{');
1805 t.type = TOK.String; 1735 t.type = TOK.String;
1806 1736
1807 auto tokenLineNum = loc; 1737 auto tokenLineNum = lineNum;
1808 auto tokenLineBegin = lineBegin; 1738 auto tokenLineBegin = lineBegin;
1809 1739
1810 // A guard against changes to particular members: 1740 // A guard against changes to particular members:
1811 // this.loc_hline and this.errorLoc.filePath 1741 // this.lineNum_hline and this.errorPath
1812 ++inTokenString; 1742 ++inTokenString;
1813 1743
1814 uint loc = this.loc; 1744 uint lineNum = this.lineNum;
1815 uint level = 1; 1745 uint level = 1;
1816 1746
1817 ++p; ++p; // Skip q{ 1747 ++p; ++p; // Skip q{
1818 1748
1819 auto prev_t = &t; 1749 auto prev_t = &t;
1866 { 1796 {
1867 // Assign to buffer before scanPostfix(). 1797 // Assign to buffer before scanPostfix().
1868 t.end = p; 1798 t.end = p;
1869 buffer = t.srcText[2..$-1].dup ~ '\0'; 1799 buffer = t.srcText[2..$-1].dup ~ '\0';
1870 t.pf = scanPostfix(); 1800 t.pf = scanPostfix();
1871 t.end = p; 1801 t.end = p; // Assign again because of postfix.
1872 } 1802 }
1873 // Convert EndOfLines to '\n' 1803 // Convert newlines to '\n'.
1874 if (loc != this.loc) 1804 if (lineNum != this.lineNum)
1875 { 1805 {
1876 assert(buffer[$-1] == '\0'); 1806 assert(buffer[$-1] == '\0');
1877 uint i, j; 1807 uint i, j;
1878 for (; i < buffer.length; ++i) 1808 for (; i < buffer.length; ++i)
1879 switch (buffer[i]) 1809 switch (buffer[i])
1880 { 1810 {
1881 case '\r': 1811 case '\r':
1882 if (buffer[i+1] == '\n') 1812 if (buffer[i+1] == '\n')
1883 ++i; 1813 ++i;
1884 case '\n': 1814 case '\n':
1885 buffer[j++] = '\n'; 1815 assert(isNewlineEnd(buffer.ptr + i));
1816 buffer[j++] = '\n'; // Convert Newline to '\n'.
1886 break; 1817 break;
1887 case LS[0]: 1818 default:
1888 auto b = buffer[i..$]; 1819 if (isUnicodeNewline(buffer.ptr + i))
1889 if (b[1] == LS[1] && (b[2] == LS[2] || b[2] == PS[2]))
1890 { 1820 {
1891 ++i; ++i; 1821 ++i; ++i;
1892 goto case '\n'; 1822 goto case '\n';
1893 } 1823 }
1894 // goto default; 1824 buffer[j++] = buffer[i]; // Copy.
1895 default: 1825 }
1896 buffer[j++] = buffer[i]; // Copy character 1826 buffer.length = j; // Adjust length.
1897 }
1898 buffer.length = j; // Adjust length
1899 } 1827 }
1900 assert(buffer[$-1] == '\0'); 1828 assert(buffer[$-1] == '\0');
1901 t.str = buffer; 1829 t.str = buffer;
1902 1830
1903 --inTokenString; 1831 --inTokenString;
2002 error(sequenceStart, MID.UnterminatedHTMLEntity, sequenceStart[0 .. p - sequenceStart]); 1930 error(sequenceStart, MID.UnterminatedHTMLEntity, sequenceStart[0 .. p - sequenceStart]);
2003 } 1931 }
2004 else 1932 else
2005 error(sequenceStart, MID.InvalidBeginHTMLEntity); 1933 error(sequenceStart, MID.InvalidBeginHTMLEntity);
2006 } 1934 }
2007 else if (*p == '\n' || *p == '\r' || 1935 else if (isEndOfLine(p))
2008 *p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])) 1936 error(sequenceStart, MID.UndefinedEscapeSequence,
2009 { 1937 (*p == 0 || *p == _Z_) ? `\EOF` : `\NewLine`);
2010 error(sequenceStart, MID.UndefinedEscapeSequence, r"\NewLine");
2011 }
2012 else if (*p == 0 || *p == _Z_)
2013 {
2014 error(sequenceStart, MID.UndefinedEscapeSequence, r"\EOF");
2015 }
2016 else 1938 else
2017 { 1939 {
2018 char[] str = `\`; 1940 char[] str = `\`;
2019 if (*p & 128) 1941 if (isascii(c))
1942 str ~= *p;
1943 else
2020 encodeUTF8(str, decodeUTF8()); 1944 encodeUTF8(str, decodeUTF8());
2021 else
2022 str ~= *p;
2023 ++p; 1945 ++p;
2024 // TODO: check for unprintable character? 1946 // TODO: check for unprintable character?
2025 error(sequenceStart, MID.UndefinedEscapeSequence, str); 1947 error(sequenceStart, MID.UndefinedEscapeSequence, str);
2026 } 1948 }
2027 } 1949 }
2477 enum State 2399 enum State
2478 { /+Space,+/ Integer, Filespec, End } 2400 { /+Space,+/ Integer, Filespec, End }
2479 2401
2480 State state = State.Integer; 2402 State state = State.Integer;
2481 2403
2482 Loop: 2404 while (!isEndOfLine(++p))
2483 while (1) 2405 {
2484 { 2406 if (isspace(*p))
2485 switch (*++p) 2407 continue;
2486 { 2408 if (state == State.Integer)
2487 case LS[0]: 2409 {
2488 if (!(p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))) 2410 if (!isdigit(*p))
2489 goto default; 2411 {
2490 case '\r', '\n', 0, _Z_: 2412 errorAtColumn = p;
2491 break Loop; 2413 mid = MID.ExpectedIntegerAfterSTLine;
2492 default: 2414 goto Lerr;
2493 if (isspace(*p)) 2415 }
2494 continue; 2416 t.tokLineNum = new Token;
2495 if (state == State.Integer) 2417 scan(*t.tokLineNum);
2496 { 2418 if (t.tokLineNum.type != TOK.Int32 && t.tokLineNum.type != TOK.Uint32)
2497 if (!isdigit(*p)) 2419 {
2420 errorAtColumn = t.tokLineNum.start;
2421 mid = MID.ExpectedIntegerAfterSTLine;
2422 goto Lerr;
2423 }
2424 --p; // Go one back because scan() advanced p past the integer.
2425 state = State.Filespec;
2426 }
2427 else if (state == State.Filespec)
2428 {
2429 if (*p != '"')
2430 {
2431 errorAtColumn = p;
2432 mid = MID.ExpectedFilespec;
2433 goto Lerr;
2434 }
2435 t.tokLineFilespec = new Token;
2436 t.tokLineFilespec.start = p;
2437 t.tokLineFilespec.type = TOK.Filespec;
2438 while (*++p != '"')
2439 {
2440 if (isEndOfLine(p))
2498 { 2441 {
2499 errorAtColumn = p; 2442 errorAtColumn = t.tokLineFilespec.start;
2500 mid = MID.ExpectedIntegerAfterSTLine; 2443 mid = MID.UnterminatedFilespec;
2444 t.tokLineFilespec.end = p;
2501 goto Lerr; 2445 goto Lerr;
2502 } 2446 }
2503 t.line_num = new Token; 2447 isascii(*p) || decodeUTF8();
2504 scan(*t.line_num); 2448 }
2505 if (t.line_num.type != TOK.Int32 && t.line_num.type != TOK.Uint32) 2449 auto start = t.tokLineFilespec.start +1; // +1 skips '"'
2506 { 2450 t.tokLineFilespec.str = start[0 .. p - start];
2507 errorAtColumn = t.line_num.start; 2451 t.tokLineFilespec.end = p + 1;
2508 mid = MID.ExpectedIntegerAfterSTLine; 2452 state = State.End;
2509 goto Lerr; 2453 }
2510 } 2454 else/+ if (state == State.End)+/
2511 --p; // Go one back because scan() advanced p past the integer. 2455 {
2512 state = State.Filespec; 2456 mid = MID.UnterminatedSpecialToken;
2513 } 2457 goto Lerr;
2514 else if (state == State.Filespec) 2458 }
2515 { 2459 }
2516 if (*p != '"') 2460 assert(isEndOfLine(p));
2517 {
2518 errorAtColumn = p;
2519 mid = MID.ExpectedFilespec;
2520 goto Lerr;
2521 }
2522 t.line_filespec = new Token;
2523 t.line_filespec.start = p;
2524 t.line_filespec.type = TOK.Filespec;
2525 while (1)
2526 {
2527 switch (*++p)
2528 {
2529 case '"':
2530 break;
2531 case LS[0]:
2532 if (!(p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])))
2533 goto default;
2534 case '\r', '\n', 0, _Z_:
2535 errorAtColumn = t.line_filespec.start;
2536 mid = MID.UnterminatedFilespec;
2537 t.line_filespec.end = p;
2538 goto Lerr;
2539 default:
2540 if (*p & 128)
2541 decodeUTF8();
2542 continue;
2543 }
2544 break; // Exit loop.
2545 }
2546 auto start = t.line_filespec.start +1; // +1 skips '"'
2547 t.line_filespec.str = start[0 .. p - start];
2548 t.line_filespec.end = p + 1;
2549 state = State.End;
2550 }
2551 else/+ if (state == State.End)+/
2552 {
2553 mid = MID.UnterminatedSpecialToken;
2554 goto Lerr;
2555 }
2556 }
2557 }
2558 assert(*p == '\r' || *p == '\n' || *p == 0 || *p == _Z_ ||
2559 *p == LS[0] && (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
2560 );
2561 2461
2562 if (state == State.Integer) 2462 if (state == State.Integer)
2563 { 2463 {
2564 errorAtColumn = p; 2464 errorAtColumn = p;
2565 mid = MID.ExpectedIntegerAfterSTLine; 2465 mid = MID.ExpectedIntegerAfterSTLine;
2566 goto Lerr; 2466 goto Lerr;
2567 } 2467 }
2568 2468
2569 // Evaluate #line only when not in token string. 2469 // Evaluate #line only when not in token string.
2570 if (!inTokenString) 2470 if (!inTokenString && t.tokLineNum)
2571 evaluateHashLine(t); 2471 {
2472 this.lineNum_hline = this.lineNum - t.tokLineNum.uint_ + 1;
2473 if (t.tokLineFilespec)
2474 this.errorPath = t.tokLineFilespec.str;
2475 }
2572 t.end = p; 2476 t.end = p;
2573 2477
2574 return; 2478 return;
2575 Lerr: 2479 Lerr:
2576 t.end = p; 2480 t.end = p;
2577 error(errorAtColumn, mid); 2481 error(errorAtColumn, mid);
2578 } 2482 }
2579 2483
2580 void evaluateHashLine(ref Token t) 2484 /++
2581 {
2582 assert(t.type == TOK.HashLine);
2583 if (t.line_num)
2584 {
2585 this.loc_hline = this.loc - t.line_num.uint_ + 1;
2586 if (t.line_filespec)
2587 this.errorLoc.setFilePath(t.line_filespec.str);
2588 }
2589 }
2590
2591 /+
2592 Insert an empty dummy token before t. 2485 Insert an empty dummy token before t.
2593 Useful in the parsing phase for representing a node in the AST 2486 Useful in the parsing phase for representing a node in the AST
2594 that doesn't consume an actual token from the source text. 2487 that doesn't consume an actual token from the source text.
2595 +/ 2488 +/
2596 Token* insertEmptyTokenBefore(Token* t) 2489 Token* insertEmptyTokenBefore(Token* t)
2609 new_t.next = t; 2502 new_t.next = t;
2610 t.prev = new_t; 2503 t.prev = new_t;
2611 return new_t; 2504 return new_t;
2612 } 2505 }
2613 2506
2614 void updateErrorLoc(char* columnPos) 2507 uint errorLineNumber(uint lineNum)
2615 { 2508 {
2616 updateErrorLoc(this.loc, this.lineBegin, columnPos); 2509 return lineNum - this.lineNum_hline;
2617 }
2618
2619 void updateErrorLoc(uint lineNum, char* lineBegin, char* columnPos)
2620 {
2621 errorLoc.set(this.errorLineNum(lineNum), lineBegin, columnPos);
2622 }
2623
2624 uint errorLineNum(uint loc)
2625 {
2626 return loc - this.loc_hline;
2627 } 2510 }
2628 2511
2629 void error(char* columnPos, MID mid, ...) 2512 void error(char* columnPos, MID mid, ...)
2630 { 2513 {
2631 updateErrorLoc(columnPos); 2514 error_(this.lineNum, this.lineBegin, columnPos, mid, _arguments, _argptr);
2632 errors ~= new Information(InfoType.Lexer, mid, errorLoc.clone, Format(_arguments, _argptr, GetMsg(mid)));
2633 } 2515 }
2634 2516
2635 void error(uint lineNum, char* lineBegin, char* columnPos, MID mid, ...) 2517 void error(uint lineNum, char* lineBegin, char* columnPos, MID mid, ...)
2636 { 2518 {
2637 updateErrorLoc(lineNum, lineBegin, columnPos); 2519 error_(lineNum, lineBegin, columnPos, mid, _arguments, _argptr);
2638 errors ~= new Information(InfoType.Lexer, mid, errorLoc.clone, Format(_arguments, _argptr, GetMsg(mid))); 2520 }
2521
2522 void error_(uint lineNum, char* lineBegin, char* columnPos, MID mid,
2523 TypeInfo[] _arguments, void* _argptr)
2524 {
2525 lineNum = this.errorLineNumber(lineNum);
2526 auto location = new Location(errorPath, lineNum, lineBegin, columnPos);
2527 auto msg = Format(_arguments, _argptr, GetMsg(mid));
2528 errors ~= new Information(InfoType.Lexer, mid, location, msg);
2639 } 2529 }
2640 2530
2641 Token* getTokens() 2531 Token* getTokens()
2642 { 2532 {
2643 while (nextToken() != TOK.EOF) 2533 while (nextToken() != TOK.EOF)
2669 return isUniAlpha(std.utf.decode(ident, idx)); 2559 return isUniAlpha(std.utf.decode(ident, idx));
2670 } 2560 }
2671 2561
2672 try 2562 try
2673 { 2563 {
2674 if (isidbeg(ident[0]) || 2564 if (isidbeg(ident[0]) || !isascii(ident[0]) && isFirstCharUniAlpha())
2675 ident[0] & 128 && isFirstCharUniAlpha())
2676 { 2565 {
2677 foreach (dchar c; ident[idx..$]) 2566 foreach (dchar c; ident[idx..$])
2678 if (!isident(c) && !isUniAlpha(c)) 2567 if (!isident(c) && !isUniAlpha(c))
2679 return false; 2568 return false;
2680 } 2569 }
2870 unittest 2759 unittest
2871 { 2760 {
2872 Stdout("Testing Lexer.\n"); 2761 Stdout("Testing Lexer.\n");
2873 struct Pair 2762 struct Pair
2874 { 2763 {
2875 char[] token; 2764 char[] tokenText;
2876 TOK type; 2765 TOK type;
2877 } 2766 }
2878 static Pair[] pairs = [ 2767 static Pair[] pairs = [
2879 {"//çay\n", TOK.Comment}, {"&", TOK.AndBinary}, 2768 {"//çay", TOK.Comment}, {"\n", TOK.Newline},
2769 {"&", TOK.AndBinary},
2880 {"/*çağ*/", TOK.Comment}, {"&&", TOK.AndLogical}, 2770 {"/*çağ*/", TOK.Comment}, {"&&", TOK.AndLogical},
2881 {"/+çak+/", TOK.Comment}, {"&=", TOK.AndAssign}, 2771 {"/+çak+/", TOK.Comment}, {"&=", TOK.AndAssign},
2882 {">", TOK.Greater}, {"+", TOK.Plus}, 2772 {">", TOK.Greater}, {"+", TOK.Plus},
2883 {">=", TOK.GreaterEqual}, {"++", TOK.PlusPlus}, 2773 {">=", TOK.GreaterEqual}, {"++", TOK.PlusPlus},
2884 {">>", TOK.RShift}, {"+=", TOK.PlusAssign}, 2774 {">>", TOK.RShift}, {"+=", TOK.PlusAssign},
2906 {"||", TOK.OrLogical}, {":", TOK.Colon}, 2796 {"||", TOK.OrLogical}, {":", TOK.Colon},
2907 {"|=", TOK.OrAssign}, {";", TOK.Semicolon}, 2797 {"|=", TOK.OrAssign}, {";", TOK.Semicolon},
2908 {"?", TOK.Question}, {",", TOK.Comma}, 2798 {"?", TOK.Question}, {",", TOK.Comma},
2909 {"$", TOK.Dollar}, {"cam", TOK.Identifier}, 2799 {"$", TOK.Dollar}, {"cam", TOK.Identifier},
2910 {"çay", TOK.Identifier}, {".0", TOK.Float64}, 2800 {"çay", TOK.Identifier}, {".0", TOK.Float64},
2911 {"0", TOK.Int32}, 2801 {"0", TOK.Int32}, {"\n", TOK.Newline},
2802 {"\r", TOK.Newline}, {"\r\n", TOK.Newline},
2803 {"\u2028", TOK.Newline}, {"\u2029", TOK.Newline}
2912 ]; 2804 ];
2913 2805
2914 char[] src; 2806 char[] src;
2915 2807
2916 foreach (pair; pairs) 2808 // Join all token texts into a single string.
2917 src ~= pair.token ~ " "; 2809 foreach (i, pair; pairs)
2918 2810 if (pair.type == TOK.Comment && pair.tokenText[1] == '/') // Line comment.
2919 assert(pairs[0].token == "//çay\n"); 2811 {
2920 // Remove \n after src has been constructed. 2812 assert(pairs[i+1].type == TOK.Newline); // Must be followed by a newline.
2921 // It won't be part of the scanned token string. 2813 src ~= pair.tokenText;
2922 pairs[0].token = "//çay"; 2814 }
2815 else
2816 src ~= pair.tokenText ~ " ";
2923 2817
2924 auto lx = new Lexer(src, ""); 2818 auto lx = new Lexer(src, "");
2925 auto token = lx.getTokens(); 2819 auto token = lx.getTokens();
2926 2820
2927 uint i; 2821 uint i;
2928 assert(token == lx.head); 2822 assert(token == lx.head);
2929 token = token.next; 2823 assert(token.next.type == TOK.Newline);
2824 token = token.next.next;
2930 do 2825 do
2931 { 2826 {
2932 assert(i < pairs.length); 2827 assert(i < pairs.length);
2933 assert(token.srcText == pairs[i].token, Format("Scanned '{0}' but expected '{1}'", token.srcText, pairs[i].token)); 2828 assert(token.srcText == pairs[i].tokenText, Format("Scanned '{0}' but expected '{1}'", token.srcText, pairs[i].tokenText));
2934 ++i; 2829 ++i;
2935 token = token.next; 2830 token = token.next;
2936 } while (token.type != TOK.EOF) 2831 } while (token.type != TOK.EOF)
2937 } 2832 }
2938 2833
2940 { 2835 {
2941 Stdout("Testing method Lexer.peek()\n"); 2836 Stdout("Testing method Lexer.peek()\n");
2942 string sourceText = "unittest { }"; 2837 string sourceText = "unittest { }";
2943 auto lx = new Lexer(sourceText, null); 2838 auto lx = new Lexer(sourceText, null);
2944 2839
2945 Token* next = lx.head; 2840 auto next = lx.head;
2841 lx.peek(next);
2842 assert(next.type == TOK.Newline);
2946 lx.peek(next); 2843 lx.peek(next);
2947 assert(next.type == TOK.Unittest); 2844 assert(next.type == TOK.Unittest);
2948 lx.peek(next); 2845 lx.peek(next);
2949 assert(next.type == TOK.LBrace); 2846 assert(next.type == TOK.LBrace);
2950 lx.peek(next); 2847 lx.peek(next);
2951 assert(next.type == TOK.RBrace); 2848 assert(next.type == TOK.RBrace);
2849 lx.peek(next);
2850 assert(next.type == TOK.EOF);
2851
2852 lx = new Lexer("", null);
2853 next = lx.head;
2854 lx.peek(next);
2855 assert(next.type == TOK.Newline);
2952 lx.peek(next); 2856 lx.peek(next);
2953 assert(next.type == TOK.EOF); 2857 assert(next.type == TOK.EOF);
2954 } 2858 }
2955 2859
2956 unittest 2860 unittest