1 /**
2 * Implements the lexical analyzer, which converts source code into lexical tokens.
3 *
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
5 *
6 * Copyright: Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
12 */
13
14 module dmd.lexer;
15
16 import core.stdc.ctype;
17 import core.stdc.errno;
18 import core.stdc.stdarg;
19 import core.stdc.stdio;
20 import core.stdc.stdlib : getenv;
21 import core.stdc.string;
22 import core.stdc.time;
23
24 import dmd.entity;
25 import dmd.errors;
26 import dmd.globals;
27 import dmd.id;
28 import dmd.identifier;
29 import dmd.root.array;
30 import dmd.root.ctfloat;
31 import dmd.common.outbuffer;
32 import dmd.root.port;
33 import dmd.root.rmem;
34 import dmd.root.string;
35 import dmd.root.utf;
36 import dmd.tokens;
37 import dmd.utils;
38
39 nothrow:
40
// When the compiler is built as a library (DMDLIB), also define the
// `LocOffset` version identifier for the remainder of this module.
version (DMDLIB)
{
    version = LocOffset;
}
45
46 /***********************************************************
47 */
48 class Lexer
49 {
// Scratch buffer in which the string-literal lexers accumulate their result.
// It is `__gshared`, i.e. a single buffer used by every Lexer instance.
private __gshared OutBuffer stringbuffer;

Loc scanloc; // for error messages
Loc prevloc; // location of token before current (set by nextToken())

const(char)* p; // current character

// Current token. `token.next` chains tokens that peek() has already scanned.
Token token;

// For ImportC
bool Ccompile; /// true if compiling ImportC

// The following are valid only if (Ccompile == true)
ubyte boolsize; /// size of a C _Bool, default 1
ubyte shortsize; /// size of a C short, default 2
ubyte intsize; /// size of a C int, default 4
ubyte longsize; /// size of C long, 4 or 8
ubyte long_longsize; /// size of a C long long, default 8
ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof
ubyte wchar_tsize; /// size of C wchar_t, 2 or 4

private
{
    const(char)* base; // pointer to start of buffer
    const(char)* end; // pointer to last element of buffer
    const(char)* line; // start of current line

    bool doDocComment; // collect doc comment information
    bool anyToken; // seen at least one token
    bool commentToken; // comments are TOK.comment's
    bool tokenizeNewlines; // newlines are turned into TOK.endOfLine's

    version (DMDLIB)
    {
        bool whitespaceToken; // tokenize whitespaces
    }

    int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
    int lastDocLine; // last line of previous doc comment

    // Singly linked list (through Token.next) of tokens handed back by
    // releaseToken(); allocateToken() reuses them before allocating new ones.
    Token* tokenFreelist;
}
92
93 nothrow:
94
/*********************
 * Constructs a Lexer over the source text `base[begoffset .. endoffset + 1]`.
 * The final character, `base[endoffset]`, has to be a sentinel: either
 * null (0) or EOF (0x1A).
 *
 * If the text begins with `#!`, that whole first line is skipped.
 *
 * Params:
 *      filename = file name used when reporting errors
 *      base = source text, terminated by a null (0) or EOF (0x1A) character
 *      begoffset = offset in `base[]` at which lexing starts
 *      endoffset = offset in `base[]` of the last character to read
 *      doDocComment = collect documentation comments
 *      commentToken = return comments as `TOK.comment` tokens
 */
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken) pure
{
    // Buffer bounds and scanning position.
    this.base = base;
    this.end = base + endoffset;
    this.p = base + begoffset;
    this.line = this.p;

    // Location tracking starts at line 1, column 1 of `filename`.
    this.scanloc = Loc(filename, 1, 1);
    this.token = Token.init;

    // Lexing modes and bookkeeping.
    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;

    // A leading "#!" line is not part of the program text.
    if (p && p[0] == '#' && p[1] == '!')
    {
        p += 2;
        // Run forward to the newline or to the end-of-file sentinel.
        while (*p != '\n' && *p != 0 && *p != 0x1A)
            ++p;
        // The newline is consumed; a sentinel is left in place for scan().
        if (*p == '\n')
            ++p;
        endOfLine();
    }
}
148
version (DMDLIB)
{
    /**
     * Library-only constructor: same as the primary constructor, with an
     * additional switch controlling whether whitespace is tokenized.
     *
     * Params:
     *      filename = file name used when reporting errors
     *      base = source text, terminated by a null (0) or EOF (0x1A) character
     *      begoffset = offset in `base[]` at which lexing starts
     *      endoffset = offset in `base[]` of the last character to read
     *      doDocComment = collect documentation comments
     *      commentToken = return comments as `TOK.comment` tokens
     *      whitespaceToken = return whitespace as `TOK.whitespace` tokens
     */
    this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
        bool doDocComment, bool commentToken, bool whitespaceToken)
    {
        this(filename, base, begoffset, endoffset, doDocComment, commentToken);
        this.whitespaceToken = whitespaceToken;
    }

    /// Range primitive. Returns: `true` once the current token is `TOK.endOfFile`.
    bool empty() const pure @property @nogc @safe
    {
        const current = front();
        return current == TOK.endOfFile;
    }

    /// Range primitive. Returns: the kind of the current token.
    TOK front() const pure @property @nogc @safe
    {
        return token.value;
    }

    /// Range primitive: advances to the next token.
    void popFront()
    {
        nextToken();
    }
}
173
/**
 * Hands out a `Token`, recycling one from the freelist when possible.
 *
 * Returns: a token whose `next` link is `null`; freshly allocated if the
 *      freelist is empty.
 */
Token* allocateToken() pure nothrow @safe
{
    if (tokenFreelist is null)
        return new Token();

    // Unlink the head of the freelist and hand it out.
    Token* recycled = tokenFreelist;
    tokenFreelist = recycled.next;
    recycled.next = null;
    return recycled;
}
186
/**
 * Gives `token` back to the lexer by pushing it onto the freelist, from
 * where allocateToken() can reuse it.
 *
 * Params:
 *      token = token that is no longer in use
 */
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    // With the GC enabled the token is reset first.
    // NOTE(review): presumably so stale pointers in it do not keep garbage alive - confirm.
    if (mem.isGCEnabled)
    {
        *token = Token.init;
    }

    // Push onto the head of the freelist.
    token.next = tokenFreelist;
    tokenFreelist = token;
}
195
/**
 * Advances to the next token and makes it the current one (`token`).
 * The location of the token being left is remembered in `prevloc`.
 *
 * Returns: the kind of the new current token
 */
final TOK nextToken()
{
    prevloc = token.loc;

    Token* lookahead = token.next;
    if (lookahead is null)
    {
        // Nothing was peeked: lex straight into the current token.
        scan(&token);
        return token.value;
    }

    // A token was already scanned by peek(): copy it (including its own
    // `next` link) over the current token and recycle the node.
    memcpy(&token, lookahead, Token.sizeof);
    releaseToken(lookahead);
    return token.value;
}
212
/***********************
 * Looks one token ahead without consuming anything.
 * Returns: the kind of the token following the current one
 */
final TOK peekNext()
{
    Token* following = peek(&token);
    return following.value;
}
220
/***********************
 * Looks two tokens ahead without consuming anything.
 * Returns: the kind of the second token after the current one
 */
final TOK peekNext2()
{
    return peek(peek(&token)).value;
}
229
/****************************
 * Turn next token in buffer into a token.
 *
 * Lexing starts at `p`. White space is skipped (or, in DMDLIB builds with
 * `whitespaceToken` set, returned as `TOK.whitespace`). Comments are either
 * returned as `TOK.comment` (when `commentToken` is set), handed to
 * getDocComment() (when `doDocComment` is set and the comment is a doc
 * comment), or skipped. On return `p` points just past the token, except for
 * `TOK.endOfFile`, where `p` stays on the sentinel character.
 *
 * Params:
 *      t = the token to set the resulting Token to
 */
final void scan(Token* t)
{
    const lastLine = scanloc.linnum; // line on which the previous token ended
    Loc startLoc;                    // start of the comment currently being lexed
    t.blockComment = null;
    t.lineComment = null;

    while (1)
    {
        t.ptr = p;
        //printf("p = %p, *p = '%c'\n",p,*p);
        t.loc = loc();
        switch (*p)
        {
        case 0:
        case 0x1A:
            t.value = TOK.endOfFile; // end of file
            // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
            return;
        case ' ':
            // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
            while ((cast(size_t)p) % uint.sizeof)
            {
                if (*p != ' ')
                    goto LendSkipFourSpaces;
                p++;
            }
            while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                p += 4;
            // Skip over any remaining space on the line.
            while (*p == ' ')
                p++;
        LendSkipFourSpaces:
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\t':
        case '\v':
        case '\f':
            p++;
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\r':
            p++;
            // For "\r\n" the line is ended by the '\n' case on the next iteration.
            if (*p != '\n') // if CR stands by itself
            {
                endOfLine();
                if (tokenizeNewlines)
                {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\n':
            p++;
            endOfLine();
            // Set while lexing a `#` special token sequence (see case '#').
            if (tokenizeNewlines)
            {
                t.value = TOK.endOfLine;
                tokenizeNewlines = false;
                return;
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '0':
            // Fast path for a lone `0`.
            if (!isZeroSecond(p[1])) // if numeric literal does not continue
            {
                ++p;
                t.unsvalue = 0;
                t.value = TOK.int32Literal;
                return;
            }
            goto Lnumber;

        case '1': .. case '9':
            // Fast path for a single decimal digit.
            if (!isDigitSecond(p[1])) // if numeric literal does not continue
            {
                t.unsvalue = *p - '0';
                ++p;
                t.value = TOK.int32Literal;
                return;
            }
        Lnumber:
            t.value = number(t);
            return;

        case '\'':
            if (issinglechar(p[1]) && p[2] == '\'')
            {
                t.unsvalue = p[1]; // simple one character literal
                t.value = TOK.charLiteral;
                p += 3;
            }
            else if (Ccompile)
            {
                clexerCharConstant(*t, 0);
            }
            else
            {
                t.value = charConstant(t);
            }
            return;

        case 'u':
        case 'U':
        case 'L':
            // In D these letters only ever start an identifier; in C they may
            // be the prefix of a wide character constant or string literal.
            if (!Ccompile)
                goto case_ident;
            if (p[1] == '\'') // C wide character constant
            {
                char c = *p;
                // An `L` prefix is lexed as `U` (32 bit) when wchar_t is
                // 4 bytes and as `u` (16 bit) otherwise, the same
                // size-to-width mapping the string literal branch uses.
                if (c == 'L') // convert L to u or U
                    c = (wchar_tsize == 4) ? 'U' : 'u';
                ++p;
                clexerCharConstant(*t, c);
                return;
            }
            else if (p[1] == '\"') // C wide string literal
            {
                const c = *p;
                ++p;
                escapeStringConstant(t);
                // `u` gives a 'w' postfix, `U` gives 'd', `L` follows wchar_tsize.
                t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                            c == 'u' ? 'w' :
                            'd';
                return;
            }
            else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
            {
                p += 2;
                escapeStringConstant(t);
                return;
            }
            goto case_ident;

        case 'r':
            // r"..." is a wysiwyg string in D only.
            if (Ccompile || p[1] != '"')
                goto case_ident;
            p++;
            goto case '`';
        case '`':
            if (Ccompile)
                goto default;
            wysiwygStringConstant(t);
            return;
        case 'q':
            // q"..." (delimited) and q{...} (token) strings are D only.
            if (Ccompile)
                goto case_ident;
            if (p[1] == '"')
            {
                p++;
                delimitedStringConstant(t);
                return;
            }
            else if (p[1] == '{')
            {
                p++;
                tokenStringConstant(t);
                return;
            }
            else
                goto case_ident;
        case '"':
            escapeStringConstant(t);
            return;
        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
            /*case 'q': case 'r':*/
        case 's':
        case 't':
            //case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z':
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
            //case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
            //case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z':
        case '_':
        case_ident:
            {
                // Consume the remaining identifier characters.
                while (1)
                {
                    const c = *++p;
                    if (isidchar(c))
                        continue;
                    else if (c & 0x80)
                    {
                        // Non-ASCII: accept it only if it decodes to a Unicode letter.
                        const s = p;
                        const u = decodeUTF();
                        if (isUniAlpha(u))
                            continue;
                        error("char 0x%04x not allowed in identifier", u);
                        p = s; // the identifier ends before the offending character
                    }
                    break;
                }
                Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                t.ident = id;
                t.value = cast(TOK)id.getValue();

                anyToken = 1;

                /* Different keywords for C and D
                 */
                if (Ccompile)
                {
                    if (t.value != TOK.identifier)
                    {
                        t.value = Ckeywords[t.value]; // filter out D keywords
                    }
                }
                else if (t.value >= FirstCKeyword)
                    t.value = TOK.identifier; // filter out C keywords

                else if (*t.ptr == '_') // if special identifier token
                {
                    // Lazy initialization
                    TimeStampInfo.initialize(t.loc);

                    if (id == Id.DATE)
                    {
                        t.ustring = TimeStampInfo.date.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIME)
                    {
                        t.ustring = TimeStampInfo.time.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.VENDOR)
                    {
                        t.ustring = global.vendor.xarraydup.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIMESTAMP)
                    {
                        t.ustring = TimeStampInfo.timestamp.ptr;
                    Lstr:
                        // Common tail: the special identifier becomes a string literal token.
                        t.value = TOK.string_;
                        t.postfix = 0;
                        t.len = cast(uint)strlen(t.ustring);
                    }
                    else if (id == Id.VERSIONX)
                    {
                        t.value = TOK.int64Literal;
                        t.unsvalue = global.versionNumber();
                    }
                    else if (id == Id.EOFX)
                    {
                        t.value = TOK.endOfFile;
                        // Advance scanner to end of file
                        while (!(*p == 0 || *p == 0x1A))
                            p++;
                    }
                }
                //printf("t.value = %d\n",t.value);
                return;
            }
        case '/':
            p++;
            switch (*p)
            {
            case '=':
                p++;
                t.value = TOK.divAssign;
                return;
            case '*':
                // Block comment: /* ... */
                p++;
                startLoc = loc();
                while (1)
                {
                    // Inner loop: advance to the next '/', tracking line ends.
                    while (1)
                    {
                        const c = *p;
                        switch (c)
                        {
                        case '/':
                            break;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /* */ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    p++;
                    // The '/' closes the comment only if preceded by '*', and
                    // "/*/" (t.ptr + 3 == p) does not count as a complete comment.
                    if (p[-2] == '*' && p - 3 != t.ptr)
                        break;
                }
                if (commentToken)
                {
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                {
                    // if /** but not /**/
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                continue;
            case '/': // do // style comments
                startLoc = loc();
                while (1)
                {
                    const c = *++p;
                    switch (c)
                    {
                    case '\n':
                        break;
                    case '\r':
                        if (p[1] == '\n')
                            p++;
                        break;
                    case 0:
                    case 0x1A:
                        // The comment runs to the end of the file.
                        if (commentToken)
                        {
                            p = end;
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '/')
                        {
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        p = end;
                        t.loc = loc();
                        t.value = TOK.endOfFile;
                        return;
                    default:
                        if (c & 0x80)
                        {
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                break;
                        }
                        continue;
                    }
                    break;
                }
                if (commentToken)
                {
                    // In DMDLIB builds the line terminator is not consumed here.
                    version (DMDLIB) {}
                    else
                    {
                        p++;
                        endOfLine();
                    }
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                if (doDocComment && t.ptr[2] == '/')
                {
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                p++;
                endOfLine();
                continue;
            case '+':
                // Nesting comment /+ ... +/ (D only; in C this is `/` followed by `+`).
                if (!Ccompile)
                {
                    int nest;
                    startLoc = loc();
                    p++;
                    nest = 1;
                    while (1)
                    {
                        char c = *p;
                        switch (c)
                        {
                        case '/':
                            p++;
                            if (*p == '+')
                            {
                                p++;
                                nest++;
                            }
                            continue;
                        case '+':
                            p++;
                            if (*p == '/')
                            {
                                p++;
                                if (--nest == 0)
                                    break;
                            }
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /+ +/ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                uint u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                    {
                        // if /++ but not /++/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                }
                break;
            default:
                break;
            }
            t.value = TOK.div;
            return;
        case '.':
            p++;
            if (isdigit(*p))
            {
                /* Note that we don't allow ._1 and ._ as being
                 * valid floating point numbers.
                 */
                p--;
                t.value = inreal(t);
            }
            else if (p[0] == '.')
            {
                if (p[1] == '.')
                {
                    p += 2;
                    t.value = TOK.dotDotDot;
                }
                else
                {
                    p++;
                    t.value = TOK.slice;
                }
            }
            else
                t.value = TOK.dot;
            return;
        case '&':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.andAssign;
            }
            else if (*p == '&')
            {
                p++;
                t.value = TOK.andAnd;
            }
            else
                t.value = TOK.and;
            return;
        case '|':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.orAssign;
            }
            else if (*p == '|')
            {
                p++;
                t.value = TOK.orOr;
            }
            else
                t.value = TOK.or;
            return;
        case '-':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.minAssign;
            }
            else if (*p == '-')
            {
                p++;
                t.value = TOK.minusMinus;
            }
            else if (*p == '>')
            {
                ++p;
                t.value = TOK.arrow;
            }
            else
                t.value = TOK.min;
            return;
        case '+':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.addAssign;
            }
            else if (*p == '+')
            {
                p++;
                t.value = TOK.plusPlus;
            }
            else
                t.value = TOK.add;
            return;
        case '<':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.lessOrEqual; // <=
            }
            else if (*p == '<')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.leftShiftAssign; // <<=
                }
                else
                    t.value = TOK.leftShift; // <<
            }
            else if (*p == ':' && Ccompile)
            {
                ++p;
                t.value = TOK.leftBracket; // <:
            }
            else if (*p == '%' && Ccompile)
            {
                ++p;
                t.value = TOK.leftCurly; // <%
            }
            else
                t.value = TOK.lessThan; // <
            return;
        case '>':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.greaterOrEqual; // >=
            }
            else if (*p == '>')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.rightShiftAssign; // >>=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.unsignedRightShiftAssign; // >>>=
                    }
                    else
                        t.value = TOK.unsignedRightShift; // >>>
                }
                else
                    t.value = TOK.rightShift; // >>
            }
            else
                t.value = TOK.greaterThan; // >
            return;
        case '!':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.notEqual; // !=
            }
            else
                t.value = TOK.not; // !
            return;
        case '=':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.equal; // ==
            }
            else if (*p == '>')
            {
                p++;
                t.value = TOK.goesTo; // =>
            }
            else
                t.value = TOK.assign; // =
            return;
        case '~':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.concatenateAssign; // ~=
            }
            else
                t.value = TOK.tilde; // ~
            return;
        case '^':
            p++;
            if (*p == '^')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.powAssign; // ^^=
                }
                else
                    t.value = TOK.pow; // ^^
            }
            else if (*p == '=')
            {
                p++;
                t.value = TOK.xorAssign; // ^=
            }
            else
                t.value = TOK.xor; // ^
            return;
        case '(':
            p++;
            t.value = TOK.leftParenthesis;
            return;
        case ')':
            p++;
            t.value = TOK.rightParenthesis;
            return;
        case '[':
            p++;
            t.value = TOK.leftBracket;
            return;
        case ']':
            p++;
            t.value = TOK.rightBracket;
            return;
        case '{':
            p++;
            t.value = TOK.leftCurly;
            return;
        case '}':
            p++;
            t.value = TOK.rightCurly;
            return;
        case '?':
            p++;
            t.value = TOK.question;
            return;
        case ',':
            p++;
            t.value = TOK.comma;
            return;
        case ';':
            p++;
            t.value = TOK.semicolon;
            return;
        case ':':
            p++;
            if (*p == ':')
            {
                ++p;
                t.value = TOK.colonColon;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightBracket; // :>
            }
            else
                t.value = TOK.colon;
            return;
        case '$':
            p++;
            t.value = TOK.dollar;
            return;
        case '@':
            p++;
            t.value = TOK.at;
            return;
        case '*':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.mulAssign;
            }
            else
                t.value = TOK.mul;
            return;
        case '%':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.modAssign;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightCurly; // %>
            }
            else if (*p == ':' && Ccompile)
            {
                goto case '#'; // %: means #
            }
            else
                t.value = TOK.mod;
            return;
        case '#':
            {
                // https://issues.dlang.org/show_bug.cgi?id=22825
                // Special token sequences are terminated by newlines,
                // and should not be skipped over.
                this.tokenizeNewlines = true;
                p++;
                if (parseSpecialTokenSequence())
                    continue;
                t.value = TOK.pound;
                return;
            }
        default:
            {
                dchar c = *p;
                if (c & 0x80)
                {
                    c = decodeUTF();
                    // Check for start of unicode identifier
                    if (isUniAlpha(c))
                        goto case_ident;
                    // Unicode paragraph / line separators end the line.
                    if (c == PS || c == LS)
                    {
                        endOfLine();
                        p++;
                        if (tokenizeNewlines)
                        {
                            t.value = TOK.endOfLine;
                            tokenizeNewlines = false;
                            return;
                        }
                        continue;
                    }
                }
                // Report the stray character, then skip it and keep scanning.
                if (c < 0x80 && isprint(c))
                    error("character '%c' is not a valid token", c);
                else
                    error("character 0x%02x is not a valid token", c);
                p++;
                continue;
            }
        }
    }
}
1108
/**
 * Returns the token that follows `ct`, scanning it on demand.
 *
 * Tokens scanned this way are chained through `Token.next`, so a later
 * nextToken() or peek() picks them up instead of lexing again.
 *
 * Params:
 *      ct = token to look past
 * Returns: the token after `ct`
 */
final Token* peek(Token* ct)
{
    if (ct.next is null)
    {
        // Not scanned yet: lex one more token and link it in.
        Token* scanned = allocateToken();
        scan(scanned);
        ct.next = scanned;
    }
    return ct.next;
}
1122
/*********************************
 * `tk` is on an opening `(`.
 * Peeks ahead, without consuming anything, to the token following the
 * matching `)`.
 *
 * The search also stops early, returning the token it stopped on, at
 * end of file, at a `}` with no matching `{` seen since the start, or
 * at a `;` that is not inside such braces.
 *
 * Params:
 *      tk = token on the opening parenthesis
 * Returns: the token past the closing `)`, or the token the search stopped on
 */
final Token* peekPastParen(Token* tk)
{
    int parenDepth = 1; // the opening '(' that tk starts on
    int braceDepth = 0;
    for (;;)
    {
        tk = peek(tk);
        const kind = tk.value;
        if (kind == TOK.leftParenthesis)
        {
            ++parenDepth;
        }
        else if (kind == TOK.rightParenthesis)
        {
            // Matching ')' found: the answer is the token after it.
            if (--parenDepth == 0)
                return peek(tk);
        }
        else if (kind == TOK.leftCurly)
        {
            ++braceDepth;
        }
        else if (kind == TOK.rightCurly)
        {
            // A '}' that closes nothing we opened ends the search.
            if (--braceDepth < 0)
                return tk;
        }
        else if (kind == TOK.semicolon)
        {
            // A ';' outside of any braces ends the search.
            if (braceDepth == 0)
                return tk;
        }
        else if (kind == TOK.endOfFile)
        {
            return tk;
        }
    }
}
1166
/*******************************************
 * Parses the escape sequence at `p`, advancing `p` past it.
 * Errors are reported at the location of the current token.
 * Returns: the character the escape sequence stands for
 */
private uint escapeSequence()
{
    const dchar decoded = Lexer.escapeSequence(token.loc, p, Ccompile);
    return decoded;
}
1174
/********
 * Parse the given string literal escape sequence into a single character.
 * D https://dlang.org/spec/lex.html#escape_sequences
 * C11 6.4.4.4
 *
 * `sequence` points at the character following the backslash. Errors are
 * reported through `.error` and parsing recovers with a substitute value.
 * Params:
 *      loc = location to use for error messages
 *      sequence = pointer to string with escape sequence to parse. Updated to
 *                 point past the end of the escape sequence
 *      Ccompile = true for compile C11 escape sequences
 * Returns:
 *      the escape sequence as a single character
 */
private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile)
{
    const(char)* p = sequence; // cache sequence reference on stack
    scope(exit) sequence = p; // write the final position back on every exit path

    uint c = *p;
    int ndigits; // number of hex digits expected after \x, \u or \U
    switch (c)
    {
    // Characters that stand for themselves.
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;
    // Single-letter control character escapes.
    case 'a':
        c = 7;
        goto Lconsume;
    case 'b':
        c = 8;
        goto Lconsume;
    case 'f':
        c = 12;
        goto Lconsume;
    case 'n':
        c = 10;
        goto Lconsume;
    case 'r':
        c = 13;
        goto Lconsume;
    case 't':
        c = 9;
        goto Lconsume;
    case 'v':
        c = 11;
        goto Lconsume;
    // Hex escapes: \uXXXX, \UXXXXXXXX and \xXX.
    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(cast(char)c))
        {
            uint v = 0; // accumulated value
            int n = 0;  // digits consumed so far
            if (Ccompile && ndigits == 2)
            {
                /* C11 6.4.4.4-7 one to infinity hex digits
                 */
                do
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                } while (ishex(cast(char)c));
            }
            else
            {
                // Exactly `ndigits` digits are expected; fewer is an error.
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                // Only \u and \U results are checked for being valid code points.
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    .error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
            }
            c = v;
        }
        else
        {
            // `sequence` is not updated until scope exit, so sequence[0] is
            // still the letter (x, u or U) that introduced the escape.
            .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            p++;
        }
        break;
    case '&':
        // Named character entities are not available when compiling C.
        if (Ccompile)
            goto default;

        // named character entity
        for (const idstart = ++p; 1; p++)
        {
            switch (*p)
            {
            case ';':
                c = HtmlNamedEntity(idstart, p - idstart);
                if (c == ~0)
                {
                    .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                    c = '?';
                }
                p++;
                break;
            default:
                // Names consist of letters, plus digits after the first character.
                if (isalpha(*p) || (p != idstart && isdigit(*p)))
                    continue;
                .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                c = '?';
                break;
            }
            break;
        }
        break;
    case 0:
    case 0x1A:
        // end of file: yield a backslash and leave `p` on the sentinel
        c = '\\';
        break;
    default:
        if (isoctal(cast(char)c))
        {
            // Octal escape: one to three octal digits.
            uint v = 0;
            int n = 0;
            do
            {
                v = v * 8 + (c - '0');
                c = *++p;
            }
            while (++n < 3 && isoctal(cast(char)c));
            c = v;
            if (c > 0xFF)
                .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        }
        else
        {
            .error(loc, "undefined escape sequence \\%c", c);
            p++;
        }
        break;
    }
    return c;
}
1346
/**
Lexes a wysiwyg ("what you see is what you get") string, in which no escape
sequences are recognized. On entry `p` points at the opening delimiter; that
same character (backtick or double-quote) terminates the literal.

Line endings inside the literal are stored as a single `\n`. Reaching the end
of the file first is reported as an error and `p` is left on the EOF character.
Params:
    result = pointer to the token that accepts the result
*/
private void wysiwygStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    auto terminator = *p;
    ++p;
    stringbuffer.setsize(0);
    for (;;)
    {
        dchar ch = *p;
        ++p;
        if (ch == '\n')
        {
            endOfLine();
        }
        else if (ch == '\r')
        {
            // "\r\n": drop the '\r', the '\n' is handled on the next pass.
            if (*p == '\n')
                continue;
            ch = '\n'; // a lone '\r' counts as an end of line
            endOfLine();
        }
        else if (ch == 0 || ch == 0x1A)
        {
            error("unterminated string constant starting at %s", start.toChars());
            result.setString();
            // step back so that `p` is on the EOF character again
            --p;
            return;
        }
        else if (ch == terminator)
        {
            result.setString(stringbuffer);
            stringPostfix(result);
            return;
        }
        else if (ch & 0x80)
        {
            // Multi-byte UTF-8: decode starting at the lead byte.
            --p;
            const decoded = decodeUTF();
            ++p;
            if (decoded == PS || decoded == LS)
                endOfLine();
            stringbuffer.writeUTF8(decoded);
            continue;
        }
        stringbuffer.writeByte(ch);
    }
}
1405
/**
Lex a delimited string. Some examples of delimited strings are:
---
q"(foo(xxx))"      // "foo(xxx)"
q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
q"/foo]/"          // "foo]"
q"HERE
foo
HERE"              // "foo\n"
---
It is assumed that `p` points to the opening double-quote '"'.

The first character after the quote selects the delimiter: one of `(`, `{`,
`[`, `<` opens a nesting delimiter closed by its counterpart, an identifier
starts a heredoc, and any other character is its own closing delimiter.
Params:
    result = pointer to the token that accepts the result
*/
private void delimitedStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    dchar delimleft = 0;    // opening delimiter; 0 until the first character is seen
    dchar delimright = 0;   // closing delimiter
    uint nest = 1;          // 1 if the delimiters nest, 0 otherwise
    uint nestcount = ~0; // dead assignment, needed to suppress warning
    Identifier hereid = null; // heredoc identifier, if this is a heredoc string
    uint blankrol = 0;      // 1 while the rest of the heredoc's opening line must be blank
    uint startline = 0;     // 1 while at the start of a line
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        //printf("c = '%c'\n", c);
        switch (c)
        {
        case '\n':
        Lnextline:
            endOfLine();
            startline = 1;
            if (blankrol)
            {
                // End of the heredoc's opening line; the newline is not part of the string.
                blankrol = 0;
                continue;
            }
            if (hereid)
            {
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            goto Lnextline;
        case 0:
        case 0x1A:
            error("unterminated delimited string constant starting at %s", start.toChars());
            result.setString();
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            return;
        default:
            if (c & 0x80)
            {
                // Multi-byte UTF-8: decode starting at the lead byte.
                p--;
                c = decodeUTF();
                p++;
                if (c == PS || c == LS)
                    goto Lnextline;
            }
            break;
        }
        if (delimleft == 0)
        {
            // First character after the opening quote: work out the delimiters.
            delimleft = c;
            nest = 1;
            nestcount = 1;
            if (c == '(')
                delimright = ')';
            else if (c == '{')
                delimright = '}';
            else if (c == '[')
                delimright = ']';
            else if (c == '<')
                delimright = '>';
            else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
            {
                // Start of identifier; must be a heredoc
                Token tok;
                p--;
                scan(&tok); // read in heredoc identifier
                if (tok.value != TOK.identifier)
                {
                    error("identifier expected for heredoc, not %s", tok.toChars());
                    delimright = c;
                }
                else
                {
                    hereid = tok.ident;
                    //printf("hereid = '%s'\n", hereid.toChars());
                    blankrol = 1;
                }
                nest = 0;
            }
            else
            {
                delimright = c;
                nest = 0;
                if (isspace(c))
                    error("delimiter cannot be whitespace");
            }
        }
        else
        {
            if (blankrol)
            {
                error("heredoc rest of line should be blank");
                blankrol = 0;
                continue;
            }
            if (nest == 1)
            {
                // Nesting delimiters: the string ends when the count returns to zero.
                if (c == delimleft)
                    nestcount++;
                else if (c == delimright)
                {
                    nestcount--;
                    if (nestcount == 0)
                        goto Ldone;
                }
            }
            else if (c == delimright)
                goto Ldone;
            if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
            {
                // An identifier at the start of a line may be the closing heredoc identifier.
                Token tok;
                auto psave = p;
                p--;
                scan(&tok); // read in possible heredoc identifier
                //printf("endid = '%s'\n", tok.ident.toChars());
                if (tok.value == TOK.identifier && tok.ident is hereid)
                {
                    /* should check that rest of line is blank
                     */
                    goto Ldone;
                }
                p = psave; // not the terminator: resume right after `c`
            }
            stringbuffer.writeUTF8(c);
            startline = 0;
        }
    }
Ldone:
    // The closing delimiter must be followed directly by the closing quote.
    if (*p == '"')
        p++;
    else if (hereid)
        error("delimited string must end in `%s\"`", hereid.toChars());
    else if (isspace(delimright))
        error("delimited string must end in `\"`");
    else
        error("delimited string must end in `%c\"`", delimright);
    result.setString(stringbuffer);
    stringPostfix(result);
}
1569
/**
Lexes a token string, i.e. a `q{...}` literal whose contents must themselves
be a sequence of valid tokens. Examples:
---
q{ foo(xxx) }    // " foo(xxx) "
q{foo$(LPAREN)}  // "foo$(LPAREN)"
q{{foo}"}"}      // "{foo}"}""
---
On entry `p` points to the opening curly-brace. The contents are re-lexed
with scan() to find the matching closing brace; the resulting string is the
source text between the braces.
Params:
    result = pointer to the token that accepts the result
*/
private void tokenStringConstant(Token* result)
{
    result.value = TOK.string_;

    const start = loc();
    const pstart = ++p; // first character after the opening brace
    uint depth = 1;     // the opening brace itself

    // Counts nested q{} literals for the duration of this call.
    ++inTokenStringConstant;
    scope(exit) --inTokenStringConstant;

    for (;;)
    {
        Token tok;
        scan(&tok);
        if (tok.value == TOK.leftCurly)
        {
            ++depth;
        }
        else if (tok.value == TOK.rightCurly)
        {
            if (--depth != 0)
                continue;
            // `p` is just past the closing brace, which is excluded.
            result.setString(pstart, p - 1 - pstart);
            stringPostfix(result);
            return;
        }
        else if (tok.value == TOK.endOfFile)
        {
            error("unterminated token string constant starting at %s", start.toChars());
            result.setString();
            return;
        }
    }
}
1616
/**
Scan a quoted string while building the processed string value by
handling escape sequences. The result is returned in the given `t` token.
This function assumes that `p` currently points to the opening quote
of the string. The closing quote must be the same character as the
opening one; the other quote character is treated as ordinary text.

For ImportC (`Ccompile`), a line ending inside the literal terminates it
with an "unterminated string constant" error, `\&` is not treated as a
named character entity escape, and no string postfix is read.
Params:
    t = the token to set the resulting string to
References:
    D https://dlang.org/spec/lex.html#double_quoted_strings
    ImportC C11 6.4.5
*/
private void escapeStringConstant(Token* t)
{
    t.value = TOK.string_;

    const start = loc();
    const tc = *p++;        // opening quote
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            // `p` is on the character following the backslash
            switch (*p)
            {
            case '&':
                if (Ccompile)
                    goto default;
                goto case;

            case 'u':
            case 'U':
                // result may be any code point: store it UTF-8 encoded
                c = escapeSequence();
                stringbuffer.writeUTF8(c);
                continue;
            default:
                // other escapes are stored as a single byte by the writeByte below
                c = escapeSequence();
                break;
            }
            break;
        case '\n':
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\'':
        case '"':
            // only the quote character that opened the literal closes it
            if (c != tc)
                goto default;
            t.setString(stringbuffer);
            if (!Ccompile)
                stringPostfix(t);
            return;
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
        Lunterminated:
            error("unterminated string constant starting at %s", start.toChars());
            t.setString();
            return;
        default:
            if (c & 0x80)
            {
                // start of a multi-byte UTF-8 sequence: back up and decode it whole
                p--;
                c = decodeUTF();
                if (c == LS || c == PS)
                {
                    // Unicode line/paragraph separators count as line endings
                    c = '\n';
                    endOfLine();
                    if (Ccompile)
                        goto Lunterminated;
                }
                p++;    // decodeUTF leaves `p` on the last byte of the sequence
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
1708
/**************************************
 * Lex a D character literal. `p` is on the opening single quote.
 *
 * The character value is stored in `t.unsvalue`. On any error
 * (empty literal, line ending or end of file inside the literal, more
 * than one character, missing closing quote) a diagnostic is issued and
 * `t.unsvalue` is set to `'?'`.
 * Params:
 *      t = token that receives the character value
 * Returns:
 *      `TOK.charLiteral`, `TOK.wcharLiteral` or `TOK.dcharLiteral`,
 *      depending on the escape form or the decoded code point
 * Reference:
 *      https://dlang.org/spec/lex.html#characterliteral
 */
private TOK charConstant(Token* t)
{
    TOK tk = TOK.charLiteral;
    //printf("Lexer::charConstant\n");
    p++;    // skip the opening quote
    dchar c = *p++;
    switch (c)
    {
    case '\\':
        // the escape form selects the literal's type
        switch (*p)
        {
        case 'u':
            t.unsvalue = escapeSequence();
            tk = TOK.wcharLiteral;
            break;
        case 'U':
        case '&':
            t.unsvalue = escapeSequence();
            tk = TOK.dcharLiteral;
            break;
        default:
            t.unsvalue = escapeSequence();
            break;
        }
        break;
    case '\n':
    L1:
        // line ending inside the literal: account for the new line, then report
        endOfLine();
        goto case;
    case '\r':
        goto case '\'';
    case 0:
    case 0x1A:
        // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
        p--;
        goto case;
    case '\'':
        // also reached directly for an empty literal `''`
        error("unterminated character constant");
        t.unsvalue = '?';
        return tk;
    default:
        if (c & 0x80)
        {
            // multi-byte UTF-8 sequence: back up and decode the whole code point
            p--;
            c = decodeUTF();
            p++;    // decodeUTF leaves `p` on the last byte of the sequence
            if (c == LS || c == PS)
                goto L1;
            // code points below 0xFFFE outside the surrogate range become wchar literals
            if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                tk = TOK.wcharLiteral;
            else
                tk = TOK.dcharLiteral;
        }
        t.unsvalue = c;
        break;
    }
    if (*p != '\'')
    {
        // Error recovery: skip ahead to a closing quote, or stop at a line
        // ending, end of file, or one of `;` `)` `]` `}`.
        while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
                *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
        {
            if (*p & 0x80)
            {
                const s = p;
                c = decodeUTF();
                if (c == LS || c == PS)
                {
                    // leave `p` on the separator so it is lexed as a line ending
                    p = s;
                    break;
                }
            }
            p++;
        }

        if (*p == '\'')
        {
            error("character constant has multiple characters");
            p++;
        }
        else
            error("unterminated character constant");
        t.unsvalue = '?';
        return tk;
    }
    p++;    // skip the closing quote
    return tk;
}
1800
/***************************************
 * Lex C character constant.
 * Parser is on the opening quote.
 *
 * The literal is first lexed as a string by `escapeStringConstant`; the
 * resulting bytes are then folded into a single integer stored in
 * `t.unsvalue`. `t.value` becomes `TOK.charLiteral` when the string is one
 * byte long and `TOK.int32Literal` otherwise. An empty constant is reported
 * and turned into `TOK.semicolon`.
 * Params:
 *      t = token to fill in
 *      prefix = one of `u`, `U` or 0.
 * Reference:
 *      C11 6.4.4.4
 */
private void clexerCharConstant(ref Token t, char prefix)
{
    escapeStringConstant(&t);
    const(char)[] str = t.ustring[0 .. t.len];
    const n = str.length;
    const loc = t.loc;
    if (n == 0)
    {
        error(loc, "empty character constant");
        t.value = TOK.semicolon;
        return;
    }

    uint u;
    switch (prefix)
    {
    case 0:
        if (n == 1) // fast case
        {
            u = str[0];
        }
        else if (n > 4)
            error(loc, "max number of chars in character literal is 4, had %d",
                cast(int)n);
        else
        {
            // Multi-character constant: source byte `i` is written to in-memory
            // byte `n - 1 - i` of `u`, so on a little-endian host the first
            // character ends up most significant.
            foreach (i, c; str)
                (cast(char*)&u)[n - 1 - i] = c;
        }
        break;

    case 'u':
        // up to two code points, each of which must fit in 16 bits
        dchar d1;
        size_t idx;
        auto msg = utf_decodeChar(str, idx, d1);
        dchar d2 = 0;
        if (idx < n && !msg)
            msg = utf_decodeChar(str, idx, d2);
        if (msg)
            // `msg` is a D string, not a C string: pass length and pointer
            error(loc, "%.*s", cast(int)msg.length, msg.ptr);
        else if (idx < n)
            error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                cast(int)((n + 1) >> 1));
        else if (d1 >= 0x1_0000)    // 0x1_0000 itself needs 17 bits
            error(loc, "%d does not fit in 16 bits", d1);
        else if (d2 >= 0x1_0000)
            error(loc, "%d does not fit in 16 bits", d2);
        u = d1;
        if (d2)
            u = (d1 << 16) | d2;
        break;

    case 'U':
        // exactly one code point
        dchar d;
        size_t idx;
        auto msg = utf_decodeChar(str, idx, d);
        if (msg)
            // `msg` is a D string, not a C string: pass length and pointer
            error(loc, "%.*s", cast(int)msg.length, msg.ptr);
        else if (idx < n)
            error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                cast(int)((n + 3) >> 2));
        u = d;
        break;

    default:
        assert(0);
    }
    t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
    t.unsvalue = u;
}
1880
/***************************************
 * Read the optional postfix character of a string literal.
 *
 * If `p` is on one of `c`, `w` or `d`, that character is stored in
 * `t.postfix` and `p` is advanced past it; otherwise `t.postfix` is
 * set to 0 and `p` is left unchanged.
 * Params:
 *      t = string token whose `postfix` field is set
 */
private void stringPostfix(Token* t) pure @nogc
{
    const ch = *p;
    if (ch == 'c' || ch == 'w' || ch == 'd')
    {
        t.postfix = ch;
        p++;
    }
    else
    {
        t.postfix = 0;
    }
}
1899
/**************************************
 * Read in a number starting at `p`.
 *
 * Integers can be decimal, binary (`0b`), octal (leading `0`) or
 * hexadecimal (`0x`); the value is stored in `t.unsvalue`.
 * For D source the suffixes U, UL, LU, L, etc. are consumed here and
 * select the resulting token kind; for ImportC (`Ccompile`) the suffix
 * is handled by `cnumber`.
 * If the text turns out to be a floating point literal, `p` is reset to
 * the start of the literal and `inreal` lexes it instead.
 * Params:
 *      t = token that receives the value
 * Returns:
 *      `TOK.int32Literal`, `TOK.uns32Literal`, `TOK.int64Literal` or
 *      `TOK.uns64Literal` for D integers, otherwise whatever `cnumber`
 *      or `inreal` returns
 */
private TOK number(Token* t)
{
    int base = 10;
    const start = p;
    uinteger_t n = 0; // unsigned >=64 bit integer type
    int d;
    bool err = false;
    bool overflow = false;
    // both flags are set as soon as any digit has been consumed; they are
    // used after the loop to reject a bare `0b` / `0x` prefix
    bool anyBinaryDigitsNoSingleUS = false;
    bool anyHexDigitsNoSingleUS = false;
    char errorDigit = 0;    // first digit that is invalid for `base`, 0 if none
    dchar c = *p;
    if (c == '0')
    {
        // Inspect the character after the leading 0 to determine the base
        ++p;
        c = *p;
        switch (c)
        {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            base = 8;
            break;

        case '8':
        case '9':
            // not an octal digit: remember it for the diagnostic at Ldone
            errorDigit = cast(char) c;
            base = 8;
            break;
        case 'x':
        case 'X':
            ++p;
            base = 16;
            break;
        case 'b':
        case 'B':
            if (Ccompile)
                error("binary constants not allowed");
            ++p;
            base = 2;
            break;
        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
            {
                if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal;  // if `0.f` or `0.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            goto Lreal; // '.' is part of current token
        case 'i':
        case 'f':
        case 'F':
            goto Lreal;
        case '_':
            if (Ccompile)
                error("embedded `_` not allowed");
            ++p;
            base = 8;
            break;
        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            break;
        default:
            break;
        }
    }
    // Accumulate digits into `n` until a character that cannot be part of an
    // integer literal is found
    while (1)
    {
        c = *p;
        switch (c)
        {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            ++p;
            d = c - '0';
            break;
        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
            ++p;
            if (base != 16)
            {
                // exponent or float suffix: this is a floating point literal
                if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                    goto Lreal;
            }
            // otherwise treat it as a digit; for non-hex bases `d >= base`
            // below records it as an invalid digit
            if (c >= 'a')
                d = c + 10 - 'a';
            else
                d = c + 10 - 'A';
            break;
        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            goto Ldone;
        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
            {
                if (Ccompile && base == 10 &&
                    (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal;  // if `1.e6` or `1.f` or `1.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                goto Ldone; // if ".identifier" or ".unicode"
            if (base == 2)
                goto Ldone; // if ".identifier" or ".unicode"
            goto Lreal; // otherwise as part of a floating point literal
        case 'p':
        case 'P':
        case 'i':
        Lreal:
            // restart from the beginning of the literal and lex it as a real
            p = start;
            return inreal(t);
        case '_':
            if (Ccompile)
                goto default;
            ++p;
            continue;
        default:
            goto Ldone;
        }
        // got a digit here, set any necessary flags, check for errors
        anyHexDigitsNoSingleUS = true;
        anyBinaryDigitsNoSingleUS = true;
        if (!errorDigit && d >= base)
        {
            errorDigit = cast(char) c;
        }
        // Avoid expensive overflow check if we aren't at risk of overflow
        if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
            n = n * base + d;
        else
        {
            import core.checkedint : mulu, addu;

            n = mulu(n, base, overflow);
            n = addu(n, d, overflow);
        }
    }
Ldone:
    if (errorDigit)
    {
        // `d` is at most 15, so an invalid digit can only be recorded for
        // bases 2, 8 and 10
        error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                             base == 8 ? "octal".ptr :
                                             "decimal".ptr, errorDigit);
        err = true;
    }
    if (overflow && !err)
    {
        error("integer overflow");
        err = true;
    }
    // `0b` or `0x` without any digit following the prefix
    if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
        (base == 16 && !anyHexDigitsNoSingleUS))
        error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

    t.unsvalue = n;

    if (Ccompile)
        return cnumber(base, n);

    enum FLAGS : int
    {
        none = 0,
        decimal = 1, // decimal
        unsigned = 2, // u or U suffix
        long_ = 4, // L suffix
    }

    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
    // Parse trailing 'u', 'U', 'l' or 'L' in any combination
    const psuffix = p;
    while (1)
    {
        FLAGS f;
        switch (*p)
        {
        case 'U':
        case 'u':
            f = FLAGS.unsigned;
            goto L1;
        case 'l':
            f = FLAGS.long_;
            error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
            goto L1;
        case 'L':
            f = FLAGS.long_;
        L1:
            p++;
            // the same kind of suffix given twice
            if ((flags & f) && !err)
            {
                error("unrecognized token");
                err = true;
            }
            flags = cast(FLAGS)(flags | f);
            continue;
        default:
            break;
        }
        break;
    }
    if (base == 8 && n >= 8)
    {
        if (err)
            // can't translate invalid octal value, just show a generic message
            error("octal literals larger than 7 are no longer supported");
        else
            error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
    }
    // Select the token kind from the base/suffix flags and the magnitude of `n`
    TOK result;
    switch (flags)
    {
    case FLAGS.none:
        /* Octal or Hexadecimal constant.
         * First that fits: int, uint, long, ulong
         */
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;
        else if (n & 0xFFFFFFFF00000000L)
            result = TOK.int64Literal;
        else if (n & 0x80000000)
            result = TOK.uns32Literal;
        else
            result = TOK.int32Literal;
        break;
    case FLAGS.decimal:
        /* First that fits: int, long, long long
         */
        if (n & 0x8000000000000000L)
        {
            result = TOK.uns64Literal;
        }
        else if (n & 0xFFFFFFFF80000000L)
            result = TOK.int64Literal;
        else
            result = TOK.int32Literal;
        break;
    case FLAGS.unsigned:
    case FLAGS.decimal | FLAGS.unsigned:
        /* First that fits: uint, ulong
         */
        if (n & 0xFFFFFFFF00000000L)
            result = TOK.uns64Literal;
        else
            result = TOK.uns32Literal;
        break;
    case FLAGS.decimal | FLAGS.long_:
        if (n & 0x8000000000000000L)
        {
            if (!err)
            {
                error("signed integer overflow");
                err = true;
            }
            result = TOK.uns64Literal;
        }
        else
            result = TOK.int64Literal;
        break;
    case FLAGS.long_:
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;
        else
            result = TOK.int64Literal;
        break;
    case FLAGS.unsigned | FLAGS.long_:
    case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
        result = TOK.uns64Literal;
        break;
    default:
        debug
        {
            printf("%x\n", flags);
        }
        assert(0);
    }
    return result;
}
2216
/**************************************
 * Lex C integer-suffix
 *
 * Consumes any `u`/`U`, `l`/`L`, `ll`/`LL` suffixes at `p` and picks the
 * token kind from the base, the suffixes and the magnitude of `n`.
 * A suffix kind that appears more than once is reported once as
 * "duplicate integer suffixes".
 * Params:
 *      base = number base
 *      n = raw integer value
 * Returns:
 *      token value
 */
private TOK cnumber(int base, uinteger_t n)
{
    /* C11 6.4.4.1
     * Parse trailing suffixes:
     *    u or U
     *    l or L
     *    ll or LL
     */
    enum FLAGS : uint
    {
        octalhex = 1, // octal or hexadecimal
        decimal = 2, // decimal
        unsigned = 4, // u or U suffix
        long_ = 8, // l or L suffix
        llong = 0x10 // ll or LL
    }
    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
    bool err;
Lsuffixes:
    while (1)
    {
        FLAGS f;
        const cs = *p;
        switch (cs)
        {
        case 'U':
        case 'u':
            f = FLAGS.unsigned;
            break;

        case 'l':
        case 'L':
            f = FLAGS.long_;
            // `ll` / `LL`: both letters must have the same case
            if (cs == p[1])
            {
                f = FLAGS.long_ | FLAGS.llong;
                ++p;
            }
            break;

        default:
            break Lsuffixes;
        }
        ++p;
        if ((flags & f) && !err)
        {
            error("duplicate integer suffixes");
            err = true;
        }
        flags = cast(FLAGS)(flags | f);
    }

    TOK result = TOK.int32Literal;     // default
    switch (flags)
    {
        /* Since D doesn't have a variable sized `long` or `unsigned long` type,
         * this code deviates from C by picking D int, uint, long, or ulong instead
         */

    case FLAGS.octalhex:
        /* Octal or Hexadecimal constant.
         * First that fits: int, unsigned, long, unsigned long,
         * long long, unsigned long long
         */
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;      // unsigned long
        else if (n & 0xFFFFFFFF00000000L)
            result = TOK.int64Literal;      // long
        else if (n & 0x80000000)
            result = TOK.uns32Literal;
        else
            result = TOK.int32Literal;
        break;

    case FLAGS.decimal:
        /* First that fits: int, long, long long
         */
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;      // unsigned long
        else if (n & 0xFFFFFFFF80000000L)
            result = TOK.int64Literal;      // long
        else
            result = TOK.int32Literal;
        break;

    case FLAGS.octalhex | FLAGS.unsigned:
    case FLAGS.decimal | FLAGS.unsigned:
        /* First that fits: unsigned, unsigned long, unsigned long long
         */
        if (n & 0xFFFFFFFF00000000L)
            result = TOK.uns64Literal;      // unsigned long
        else
            result = TOK.uns32Literal;
        break;

    case FLAGS.decimal | FLAGS.long_:
        /* First that fits: long, long long
         */
        // NOTE(review): the 32-bit path is also taken when `long_longsize == 4`;
        // presumably only `longsize` matters here - confirm
        if (longsize == 4 || long_longsize == 4)
        {
            if (n & 0xFFFFFFFF_80000000L)
                result = TOK.int64Literal;
            else
                result = TOK.int32Literal;  // long
        }
        else
        {
            result = TOK.int64Literal;      // long
        }
        break;

    case FLAGS.octalhex | FLAGS.long_:
        /* First that fits: long, unsigned long, long long,
         * unsigned long long
         */
        if (longsize == 4 || long_longsize == 4)
        {
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;
            else if (n & 0x80000000)
                result = TOK.uns32Literal;      // unsigned long
            else
                result = TOK.int32Literal;      // long
        }
        else
        {
            if (n & 0x80000000_00000000L)
                result = TOK.uns64Literal;      // unsigned long
            else
                result = TOK.int64Literal;      // long
        }
        break;

    case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
    case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
        /* First that fits: unsigned long, unsigned long long
         */
        if (longsize == 4 || long_longsize == 4)
        {
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.uns32Literal;      // unsigned long
        }
        else
        {
            result = TOK.uns64Literal;      // unsigned long
        }
        break;

    case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
        /* First that fits: long long, unsigned long long
         */
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;
        else
            result = TOK.int64Literal;
        break;

    case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
        /* long long
         */
        result = TOK.int64Literal;
        break;

    case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
    case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
        result = TOK.uns64Literal;
        break;

    default:
        // `llong` is only ever set together with `long_`, so the cases above
        // cover every combination the suffix loop can produce
        debug printf("%x\n",flags);
        assert(0);
    }
    return result;
}
2403
/**************************************
 * Read in characters, converting them to real.
 *
 * `p` must be on the first character of the literal (a digit or `.`).
 * The literal text, with `_` separators removed, is collected in
 * `stringbuffer` and parsed by `CTFloat.parse`; the value is stored in
 * `t.floatvalue` (`CTFloat.zero` if the literal is malformed).
 * A trailing `f`/`F` or `L` selects the precision, and outside ImportC
 * a trailing `i` makes the literal imaginary.
 * Params:
 *      t = token that receives the floating point value
 * Returns:
 *      one of the `TOK.float*Literal` / `TOK.imaginary*Literal` kinds
 * Bugs:
 *      Exponent overflow not detected.
 *      Too much requested precision is not detected.
 */
private TOK inreal(Token* t)
{
    //printf("Lexer::inreal()\n");
    debug
    {
        assert(*p == '.' || isdigit(*p));
    }
    bool isWellformedString = true;
    stringbuffer.setsize(0);
    auto pstart = p;
    bool hex = false;
    // Throughout the scan `c` is the character under examination and `p`
    // points one past it.
    dchar c = *p++;
    // Leading '0x'
    if (c == '0')
    {
        c = *p++;
        if (c == 'x' || c == 'X')
        {
            hex = true;
            c = *p++;
        }
    }
    // Digits to left of '.'
    while (1)
    {
        if (c == '.')
        {
            c = *p++;
            break;
        }
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    // Digits to right of '.'
    while (1)
    {
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    // Exponent: `e`/`E`, or `p`/`P` for hex floats
    if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
    {
        c = *p++;
        if (c == '-' || c == '+')
        {
            c = *p++;
        }
        bool anyexp = false;    // set once at least one exponent digit was seen
        while (1)
        {
            if (isdigit(c))
            {
                anyexp = true;
                c = *p++;
                continue;
            }
            if (c == '_')
            {
                if (Ccompile)
                    error("embedded `_` in numeric literals not allowed");
                c = *p++;
                continue;
            }
            if (!anyexp)
            {
                error("missing exponent");
                isWellformedString = false;
            }
            break;
        }
    }
    else if (hex)
    {
        error("exponent required for hex float");
        isWellformedString = false;
    }
    // `p` is one past the first character that is not part of the number
    --p;
    // Copy the literal text without `_` into stringbuffer, 0-terminated
    while (pstart < p)
    {
        if (*pstart != '_')
            stringbuffer.writeByte(*pstart);
        ++pstart;
    }
    stringbuffer.writeByte(0);
    auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
    TOK result;
    bool isOutOfRange = false;
    t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
    // Precision suffix
    switch (*p)
    {
    case 'F':
    case 'f':
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
        result = TOK.float32Literal;
        p++;
        break;
    default:
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
        result = TOK.float64Literal;
        break;
    case 'l':
        if (!Ccompile)
            error("use 'L' suffix instead of 'l'");
        goto case 'L';
    case 'L':
        ++p;
        // ImportC with an 8 byte `long double`: treat the literal as a double
        if (Ccompile && long_doublesize == 8)
            goto default;
        result = TOK.float80Literal;
        break;
    }
    // Imaginary suffix (D only)
    if ((*p == 'i' || *p == 'I') && !Ccompile)
    {
        if (*p == 'I')
            error("use 'i' suffix instead of 'I'");
        p++;
        switch (result)
        {
        case TOK.float32Literal:
            result = TOK.imaginary32Literal;
            break;
        case TOK.float64Literal:
            result = TOK.imaginary64Literal;
            break;
        case TOK.float80Literal:
            result = TOK.imaginary80Literal;
            break;
        default:
            break;
        }
    }
    const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
    if (isOutOfRange && !isLong && (!Ccompile || hex))
    {
        /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
         */
        const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
        error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
    }
    debug
    {
        // sanity check: only floating point token kinds may be returned
        switch (result)
        {
        case TOK.float32Literal:
        case TOK.float64Literal:
        case TOK.float80Literal:
        case TOK.imaginary32Literal:
        case TOK.imaginary64Literal:
        case TOK.imaginary80Literal:
            break;
        default:
            assert(0);
        }
    }
    return result;
}
2575
/**
 * Update `scanloc` to describe the current scan position and return it.
 *
 * The column is the 1-based distance of `p` from `line`; with
 * `version (LocOffset)` the offset of `p` from `base` is recorded as well.
 * Returns:
 *      the updated `scanloc`
 */
final Loc loc() pure @nogc
{
    const column = (p - line) + 1;
    scanloc.charnum = cast(uint) column;
    version (LocOffset)
    {
        scanloc.fileOffset = cast(uint)(p - base);
    }
    return scanloc;
}
2583
/**
 * Report an error at the location of the current token (`token.loc`).
 * Params:
 *      format = printf-style format string
 *      ... = arguments for `format`
 */
final void error(const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    .verror(token.loc, format, ap);
    va_end(ap);
}
2591
/**
 * Report an error at an explicitly given location.
 * Params:
 *      loc = source location the error refers to
 *      format = printf-style format string
 *      ... = arguments for `format`
 */
final void error(const ref Loc loc, const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    .verror(loc, format, ap);
    va_end(ap);
}
2599
/**
 * Report a deprecation at the location of the current token (`token.loc`).
 * Params:
 *      format = printf-style format string
 *      ... = arguments for `format`
 */
final void deprecation(const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    .vdeprecation(token.loc, format, ap);
    va_end(ap);
}
2607
/***************************************
 * Parse a special token sequence.
 *
 * Scans the next token. Only the identifier `line` is handled (by
 * `poundLine`); any other identifier produces a warning and `if`
 * produces an error, since C preprocessor directives are not supported.
 * Returns:
 *      true if the special token sequence was handled
 * References:
 *      https://dlang.org/spec/lex.html#special-token-sequence
 */
bool parseSpecialTokenSequence()
{
    Token n;
    scan(&n);

    if (n.value == TOK.if_)
    {
        error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
        return false;
    }
    if (n.value != TOK.identifier)
        return false;

    if (n.ident == Id.line)
    {
        poundLine(n, false);
        return true;
    }

    const locx = loc();
    warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
    return false;
}
2638
/*********************************************
 * Parse line/file preprocessor directive:
 *    #line linnum [filespec]
 * Allow __LINE__ for linnum, and __FILE__ for filespec.
 * Accept linemarker format:
 *    # linnum [filespec] {flags}
 * There can be zero or more flags, which are one of the digits 1..4, and
 * must be in ascending order. The flags are ignored.
 *
 * When the directive is well formed, `scanloc` receives the new line
 * number and, if one was given, the new file name; this update is
 * skipped while lexing inside a token string constant.
 * Params:
 *      tok = token we're on, which is linnum of linemarker
 *      linemarker = true if line marker format and lexer is on linnum
 * References:
 *      linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
 */
final void poundLine(ref Token tok, bool linemarker)
{
    auto newLinnum = this.scanloc.linnum;
    const(char)* newFile = null;
    bool sawFlags;

    // For `#line` the line number has not been scanned yet.
    if (!linemarker)
        scan(&tok);

    switch (tok.value)
    {
    case TOK.int32Literal:
    case TOK.int64Literal:
        const lin = cast(int)(tok.unsvalue);
        if (lin != tok.unsvalue)
        {
            error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
            skipToNextLine();
            return;
        }
        newLinnum = lin;
        break;

    case TOK.line:
        // #line __LINE__ : keep the current line number
        break;

    default:
        error(tok.loc, "positive integer argument expected following `#line`");
        if (tok.value != TOK.endOfLine)
            skipToNextLine();
        return;
    }

    // Remaining tokens on the line: optional filespec, then optional flags.
    // Leaving the loop with `break` means the directive is malformed.
    for (;;)
    {
        scan(&tok);
        const kind = tok.value;

        if (kind == TOK.endOfFile || kind == TOK.endOfLine)
        {
            if (!inTokenStringConstant)
            {
                this.scanloc.linnum = newLinnum;
                if (newFile)
                    this.scanloc.filename = newFile;
            }
            return;
        }

        if (kind == TOK.file)
        {
            // __FILE__ : at most one filespec, and it must precede any flags
            if (newFile || sawFlags)
                break;
            newFile = mem.xstrdup(scanloc.filename);
        }
        else if (kind == TOK.string_)
        {
            if (newFile || sawFlags)
                break;
            // must be a plain double quoted string without postfix
            if (tok.ptr[0] != '"' || tok.postfix != 0)
                break;
            newFile = tok.ustring;
        }
        else if (kind == TOK.int32Literal)
        {
            // flags are only valid after a filespec, in linemarker format
            if (!newFile)
                break;
            if (!(linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4))
                break;
            sawFlags = true; // linemarker flags seen
        }
        else
        {
            break;
        }
    }

    // Malformed directive
    if (newFile is null)
        error(tok.loc, "invalid filename for `#line` directive");
    else if (linemarker)
        error(tok.loc, "invalid flag for line marker directive");
    else if (!Ccompile)
        error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
    if (tok.value != TOK.endOfLine)
        skipToNextLine();
}
2732
/***************************************
 * Scan forward to start of next line.
 *
 * Line endings recognized are `\n`, `\r`, `\r\n` and the Unicode
 * separators LS and PS. If end of file (0 or 0x1A) is reached first,
 * `p` is left on it and nothing else happens; otherwise the line
 * bookkeeping is updated and `tokenizeNewlines` is cleared.
 */
final void skipToNextLine()
{
    for (;;)
    {
        const ch = *p;

        if (ch == 0 || ch == 0x1A)
            return; // do not advance p

        if (ch == '\n')
        {
            ++p;
            break;
        }

        if (ch == '\r')
        {
            ++p;
            if (p[0] == '\n')
                ++p;
            break;
        }

        if (ch & 0x80)
        {
            // decodeUTF leaves `p` on the last byte of the sequence
            const u = decodeUTF();
            if (u == PS || u == LS)
            {
                ++p;
                break;
            }
        }
        ++p;
    }
    endOfLine();
    tokenizeNewlines = false;
}
2774
/********************************************
 * Decode the UTF-8 encoded character starting at `p`.
 * `*p` must have its high bit set.
 * An error message is issued for an invalid sequence.
 * Returns:
 *      the decoded character; `p` is advanced to the last byte of the
 *      sequence that was consumed
 */
private uint decodeUTF()
{
    const s = p;
    assert(*s & 0x80);

    // Offer the decoder at most 4 bytes, stopping early at a 0 byte
    size_t len = 1;
    while (len < 4 && s[len])
        ++len;

    dchar u;
    size_t idx = 0;
    const msg = utf_decodeChar(s[0 .. len], idx, u);
    p += idx - 1;
    if (msg)
        error("%.*s", cast(int)msg.length, msg.ptr);
    return u;
}
2799
/***************************************************
 * Parse doc comment embedded between t.ptr and p.
 * Remove trailing blanks and tabs from lines.
 * Replace all newlines with \n.
 * Remove leading comment character from each line.
 * Decide if it's a lineComment or a blockComment.
 * Append to previous one for this token.
 *
 * If newParagraph is true, an extra newline will be
 * added between adjoining doc comments.
 * Params:
 *      t = token whose `ptr` is the start of the comment and whose
 *          `lineComment` or `blockComment` field receives the text
 *      lineComment = non-zero if the comment may be a line comment; it is
 *          stored as one only if `anyToken` is also set
 *      newParagraph = separate this comment from a previous one with an
 *          extra newline
 */
private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
{
    /* ct tells us which kind of comment it is: '/', '*', or '+'
     */
    const ct = t.ptr[2];
    /* Start of comment text skips over / * *, / + +, or / / /
     */
    const(char)* q = t.ptr + 3; // start of comment text
    const(char)* qend = p;
    // block comments end with a two character terminator that is not part of the text
    if (ct == '*' || ct == '+')
        qend -= 2;
    /* Scan over initial row of ****'s or ++++'s or ////'s
     */
    for (; q < qend; q++)
    {
        if (*q != ct)
            break;
    }
    /* Remove leading spaces until start of the comment
     */
    int linestart = 0;  // 1 while only whitespace has been seen on the current line
    if (ct == '/')
    {
        while (q < qend && (*q == ' ' || *q == '\t'))
            ++q;
    }
    else if (q < qend)
    {
        // block comment: drop a line ending directly after the opening row
        if (*q == '\r')
        {
            ++q;
            if (q < qend && *q == '\n')
                ++q;
            linestart = 1;
        }
        else if (*q == '\n')
        {
            ++q;
            linestart = 1;
        }
    }
    /* Remove trailing row of ****'s or ++++'s
     */
    if (ct != '/')
    {
        for (; q < qend; qend--)
        {
            if (qend[-1] != ct)
                break;
        }
    }
    /* Comment is now [q .. qend].
     * Canonicalize it into buf[].
     */
    OutBuffer buf;

    // Drop blanks and tabs from the end of what has been written to buf so far
    void trimTrailingWhitespace()
    {
        const s = buf[];
        auto len = s.length;
        while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
            --len;
        buf.setsize(len);
    }

    for (; q < qend; q++)
    {
        char c = *q;
        switch (c)
        {
        case '*':
        case '+':
            // the first comment character on a line is decoration, not text
            if (linestart && c == ct)
            {
                linestart = 0;
                /* Trim preceding whitespace up to preceding \n
                 */
                trimTrailingWhitespace();
                continue;
            }
            break;
        case ' ':
        case '\t':
            break;
        case '\r':
            if (q[1] == '\n')
                continue; // skip the \r
            goto Lnewline;
        default:
            // 226 128 168 and 226 128 169 are the UTF-8 encodings of LS and PS
            if (c == 226)
            {
                // If LS or PS
                if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                {
                    q += 2;
                    goto Lnewline;
                }
            }
            linestart = 0;
            break;
        Lnewline:
            c = '\n'; // replace all newlines with \n
            goto case;
        case '\n':
            linestart = 1;
            /* Trim trailing whitespace
             */
            trimTrailingWhitespace();
            break;
        }
        buf.writeByte(c);
    }
    /* Trim trailing whitespace (if the last line does not have newline)
     */
    trimTrailingWhitespace();

    // Always end with a newline
    const s = buf[];
    if (s.length == 0 || s[$ - 1] != '\n')
        buf.writeByte('\n');

    // It's a line comment if the start of the doc comment comes
    // after other non-whitespace on the same line.
    auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
    // Combine with previous doc comment, if any
    if (*dc)
        *dc = combineComments(*dc, buf[], newParagraph).toDString();
    else
        *dc = buf.extractSlice(true);
}
2941
/********************************************
 * Combine two document comments into one,
 * separated by an extra newline if newParagraph is true.
 *
 * If either comment is null, the other one's pointer is returned
 * unchanged. Otherwise a new 0-terminated buffer is allocated holding
 * `c1`, a newline if `c1` is non-empty and does not already end in one,
 * the optional paragraph newline, and `c2`.
 * Params:
 *      c1 = first comment
 *      c2 = second comment
 *      newParagraph = insert an extra newline between the two comments
 * Returns:
 *      pointer to the combined comment
 */
static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
{
    if (!c1)
        return c2.ptr;
    if (!c2)
        return c1.ptr;

    // Number of newline characters to place between the two comments
    const size_t joinNewline = (c1.length && c1[$ - 1] != '\n') ? 1 : 0;
    const size_t paragraphNewline = newParagraph ? 1 : 0;

    const c2start = c1.length + joinNewline + paragraphNewline;
    const total = c2start + c2.length;

    auto buf = cast(char*)mem.xmalloc_noscan(total + 1);
    buf[0 .. c1.length] = c1[];
    if (joinNewline)
        buf[c1.length] = '\n';
    if (newParagraph)
        buf[c1.length + joinNewline] = '\n';
    buf[c2start .. total] = c2[];
    buf[total] = 0;
    return buf;
}
2969
/**************************
 * Register the start of a new source line: `line` is set to the current
 * scan pointer `p` and the line number in `scanloc` is advanced by one.
 *
 * `p` should be at start of next line
 */
private void endOfLine() pure @nogc @safe
{
    line = p;
    ++scanloc.linnum;
}
2978 }
2979
2980
2981 /******************************* Private *****************************************/
2982
2983 private:
2984
/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: These buffers are only filled in by `initialize`, so every read
    // needs to be guarded by a call to it.
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;      // "Mmm dd yyyy" plus terminating 0
    __gshared char[8 + 1] time;       // "hh:mm:ss" plus terminating 0
    __gshared char[24 + 1] timestamp; // first 24 characters of ctime()'s text plus terminating 0

    /**
     * Fill `date`, `time` and `timestamp` once; later calls return immediately.
     *
     * The time used is the value of the environment variable
     * `SOURCE_DATE_EPOCH` when it is set, otherwise the current time.
     *
     * Params:
     *  loc = location used to report an unparsable `SOURCE_DATE_EPOCH`
     */
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct);
        // ctime() text has the fixed layout "Www Mmm dd hh:mm:ss yyyy\n":
        // month+day start at offset 4, the clock at 11, the year at 20.
        const p = ctime(&ct);
        assert(p);
        // Bounded writes: never let formatting run past the fixed-size buffers
        snprintf(&date[0], date.length, "%.6s %.4s", p + 4, p + 20);
        snprintf(&time[0], time.length, "%.8s", p + 11);
        snprintf(&timestamp[0], timestamp.length, "%.24s", p);
    }
}
3020
private enum
{
    LS = 0x2028, // UTF line separator
    PS = 0x2029, // UTF paragraph separator
}
3023
/********************************************
 * Do our own char maps
 *
 * `cmtable[c]` is the set of `CM*` bit flags that apply to byte value `c`;
 * the table is computed by the function literal below.
 */
private static immutable cmtable = ()
{
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        ubyte flags; // starts out as 0, like the table entry itself

        if (c >= '0' && c <= '7')
            flags |= CMoctal;
        if (c_isxdigit(c))
            flags |= CMhex;
        if (c == '_' || c_isalnum(c))
            flags |= CMidchar;

        switch (c)
        {
            case 'x', 'X', 'b', 'B':
                flags |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e', 'E', 'f', 'F', 'l', 'L', 'p', 'P', 'u', 'U', 'i', '.', '_':
                flags |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // CMsinglechar: every 7-bit value except the ones listed here
        const bool excluded = c == '\\' || c == '\n' || c == '\r' ||
                              c == 0 || c == 0x1A || c == '\'';
        if (!excluded && (c & 0x80) == 0)
            flags |= CMsinglechar;

        table[c] = flags;
    }
    return table;
}();
3079
private
{
    // Bit flags combined per character in `cmtable`
    enum CMoctal       = 1 << 0; // 0x01
    enum CMhex         = 1 << 1; // 0x02
    enum CMidchar      = 1 << 2; // 0x04
    enum CMzerosecond  = 1 << 3; // 0x08
    enum CMdigitsecond = 1 << 4; // 0x10
    enum CMsinglechar  = 1 << 5; // 0x20
}
3089
/// Returns: `true` if `cmtable` flags `c` as `CMoctal`.
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) == CMoctal;
}
3094
/// Returns: `true` if `cmtable` flags `c` as `CMhex`.
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) == CMhex;
}
3099
/// Returns: `true` if `cmtable` flags `c` as `CMidchar`.
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) == CMidchar;
}
3104
/// Returns: `true` if `cmtable` flags `c` as `CMzerosecond`
/// (presumably: `c` may follow a leading `0` in a numeric literal - confirm against the number scanner).
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) == CMzerosecond;
}
3109
/// Returns: `true` if `cmtable` flags `c` as `CMdigitsecond`
/// (presumably: `c` may follow a leading non-zero digit in a numeric literal - confirm against the number scanner).
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) == CMdigitsecond;
}
3114
/// Returns: `true` if `cmtable` flags `c` as `CMsinglechar`.
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) == CMsinglechar;
}
3119
/// Returns: `true` if `c` is one of the ASCII characters `0`-`9`, `a`-`f` or `A`-`F`.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
3126
/// Returns: `true` if `c` is an ASCII decimal digit or an ASCII letter of either case.
private bool c_isalnum(const int c) pure @nogc @safe
{
    const bool digit = '0' <= c && c <= '9';
    const bool lower = 'a' <= c && c <= 'z';
    const bool upper = 'A' <= c && c <= 'Z';
    return digit || lower || upper;
}
3133
3134 /******************************* Unittest *****************************************/
3135
// Valid escape sequences: `Lexer.escapeSequence` must return the expected
// character, consume the whole input, and report no diagnostic.
unittest
{
    import dmd.console;
    // Any diagnostic at all is a failure for these inputs
    nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                   const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(0);
    }
    diagnosticHandler = &assertDiagnosticHandler;

    // `sequence` is the text that follows the backslash
    static void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto p = cast(const(char)*)sequence.ptr;
        assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
        assert(p == sequence.ptr + sequence.length);
    }

    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // Named character entities (must be spelled out, not given as the decoded character)
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');

    diagnosticHandler = null;
}
3188
// Invalid escape sequences: `Lexer.escapeSequence` must report exactly the
// expected error, return the expected fallback value and stop scanning at
// the expected offset.
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    // Formats the diagnostic and compares it with `expected`
    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        // Bounded formatting: a message longer than the buffer must not overrun it
        auto actual = buffer[0 .. vsnprintf(buffer.ptr, buffer.length, format, ap)];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    // `sequence` is the text that follows the backslash
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors; // do not let the expected errors leak into the global count
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}
3258
// Smoke test: lexing "int" gives TOK.int32 once, then TOK.endOfFile on
// every further call.
unittest
{
    string text = "int"; // We rely on the implicit null-terminator
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);

    assert(lex1.nextToken() == TOK.int32);

    // End of file is sticky: asking again keeps returning it
    foreach (attempt; 0 .. 3)
        assert(lex1.nextToken() == TOK.endOfFile);
}
3277
// Malformed input must still make the lexer reach TOK.endOfFile, and stay there.
unittest
{
    // We don't want to see Lexer error output during these tests.
    uint errors = global.startGagging();
    scope(exit) global.endGagging(errors);

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0);

        // Ask for at most `testcase.length` tokens in total before giving up
        TOK tok = lex2.nextToken();
        for (size_t calls = 1; tok != TOK.endOfFile && calls < testcase.length; ++calls)
            tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);

        // Once reached, end of file must be returned again
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}
3312