xref: /netbsd-src/external/gpl3/gcc/dist/gcc/d/dmd/lexer.d (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /**
2  * Implements the lexical analyzer, which converts source code into lexical tokens.
3  *
4  * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
5  *
6  * Copyright:   Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
7  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
8  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10  * Documentation:  https://dlang.org/phobos/dmd_lexer.html
11  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
12  */
13 
14 module dmd.lexer;
15 
16 import core.stdc.ctype;
17 import core.stdc.errno;
18 import core.stdc.stdarg;
19 import core.stdc.stdio;
20 import core.stdc.stdlib : getenv;
21 import core.stdc.string;
22 import core.stdc.time;
23 
24 import dmd.entity;
25 import dmd.errors;
26 import dmd.globals;
27 import dmd.id;
28 import dmd.identifier;
29 import dmd.root.array;
30 import dmd.root.ctfloat;
31 import dmd.common.outbuffer;
32 import dmd.root.port;
33 import dmd.root.rmem;
34 import dmd.root.string;
35 import dmd.root.utf;
36 import dmd.tokens;
37 import dmd.utils;
38 
39 nothrow:
40 
// Builds with the DMDLIB version identifier also define LocOffset.
version (DMDLIB)
    version = LocOffset;
45 
46 /***********************************************************
47  */
48 class Lexer
49 {
50     private __gshared OutBuffer stringbuffer;
51 
52     Loc scanloc;            // for error messages
53     Loc prevloc;            // location of token before current
54 
55     const(char)* p;         // current character
56 
57     Token token;
58 
59     // For ImportC
60     bool Ccompile;              /// true if compiling ImportC
61 
62     // The following are valid only if (Ccompile == true)
63     ubyte boolsize;             /// size of a C _Bool, default 1
64     ubyte shortsize;            /// size of a C short, default 2
65     ubyte intsize;              /// size of a C int, default 4
66     ubyte longsize;             /// size of C long, 4 or 8
67     ubyte long_longsize;        /// size of a C long long, default 8
68     ubyte long_doublesize;      /// size of C long double, 8 or D real.sizeof
69     ubyte wchar_tsize;          /// size of C wchar_t, 2 or 4
70 
71     private
72     {
73         const(char)* base;      // pointer to start of buffer
74         const(char)* end;       // pointer to last element of buffer
75         const(char)* line;      // start of current line
76 
77         bool doDocComment;      // collect doc comment information
78         bool anyToken;          // seen at least one token
79         bool commentToken;      // comments are TOK.comment's
80         bool tokenizeNewlines;  // newlines are turned into TOK.endOfLine's
81 
version(DMDLIB)82         version (DMDLIB)
83         {
84             bool whitespaceToken;   // tokenize whitespaces
85         }
86 
87         int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
88         int lastDocLine;        // last line of previous doc comment
89 
90         Token* tokenFreelist;
91     }
92 
93   nothrow:
94 
95     /*********************
96      * Creates a Lexer for the source code base[begoffset..endoffset+1].
97      * The last character, base[endoffset], must be null (0) or EOF (0x1A).
98      *
99      * Params:
100      *  filename = used for error messages
101      *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
102      *  begoffset = starting offset into base[]
103      *  endoffset = the last offset to read into base[]
104      *  doDocComment = handle documentation comments
105      *  commentToken = comments become TOK.comment's
106      */
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken) pure
{
    // Error locations start at line 1, column 1 of `filename`.
    scanloc = Loc(filename, 1, 1);
    token = Token.init;

    // Buffer bounds and the scan cursor; the first line starts at the cursor.
    this.base = base;
    this.end = base + endoffset;
    p = base + begoffset;
    line = p;

    // Caller-selected options and initial bookkeeping state.
    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;

    // Only a buffer whose first two characters are "#!" needs more work.
    const startsWithShebang = p && p[0] == '#' && p[1] == '!';
    if (!startsWithShebang)
        return;

    // Ignore the whole "#!" line: advance to its '\n', or stop at the
    // terminating 0 / 0x1A character.
    p += 2;
    while (*p != '\n' && *p != 0 && *p != 0x1A)
        ++p;

    // Step over the newline; a terminator is left under the cursor.
    if (*p == '\n')
        ++p;
    endOfLine();
}
148 
version (DMDLIB)
{
    /// Same as the six-argument constructor, and additionally records
    /// `whitespaceToken` (when set, `scan` returns TOK.whitespace tokens).
    this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
        bool doDocComment, bool commentToken, bool whitespaceToken)
    {
        this(filename, base, begoffset, endoffset, doDocComment, commentToken);
        this.whitespaceToken = whitespaceToken;
    }

    /// Returns: true when the current token is TOK.endOfFile.
    @property bool empty() const pure @nogc @safe
    {
        const atEndOfFile = front() == TOK.endOfFile;
        return atEndOfFile;
    }

    /// Returns: the value of the current token.
    @property TOK front() const pure @nogc @safe
    {
        return token.value;
    }

    /// Advances to the next token by calling `nextToken`.
    void popFront()
    {
        nextToken();
    }
}
173 
174     /// Returns: a `Token` reused from the freelist when one is available, otherwise a newly allocated one.
Token* allocateToken() pure nothrow @safe
{
    Token* recycled = tokenFreelist;

    // Nothing to reuse: allocate a new token.
    if (recycled is null)
        return new Token();

    // Pop the head of the freelist and detach it from the remaining entries.
    tokenFreelist = recycled.next;
    recycled.next = null;
    return recycled;
}
186 
187     /// Frees the given token by returning it to the freelist.
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    // When the GC is enabled, reset the token to Token.init before parking it
    // (presumably so it no longer references other data -- TODO confirm).
    if (mem.isGCEnabled)
    {
        *token = Token.init;
    }

    // Push the token onto the head of the freelist.
    token.next = tokenFreelist;
    tokenFreelist = token;
}
195 
/***********************
 * Advance to the next token, remembering the location of the one being left.
 * Returns: the value of the new current token
 */
final TOK nextToken()
{
    prevloc = token.loc;

    Token* queued = token.next;
    if (queued is null)
    {
        // No token is chained after the current one: lex from the source text.
        scan(&token);
    }
    else
    {
        // A token is already chained after the current one: copy it into
        // `token` and hand its storage back to the freelist.
        memcpy(&token, queued, Token.sizeof);
        releaseToken(queued);
    }
    return token.value;
}
212 
213     /***********************
214      * Look ahead at next token's value.
215      */
final TOK peekNext()
{
    // `peek` yields the token that follows the one it is given.
    Token* following = peek(&token);
    return following.value;
}
220 
221     /***********************
222      * Look two tokens ahead and return that token's value.
223      */
final TOK peekNext2()
{
    // Peek once from the current token, then once more from that result.
    return peek(peek(&token)).value;
}
229 
230     /****************************
231      * Turn next token in buffer into a token.
232      * Params:
233      *  t = the token to set the resulting Token to
234      */
scan(Token * t)235     final void scan(Token* t)
236     {
237         const lastLine = scanloc.linnum;
238         Loc startLoc;
239         t.blockComment = null;
240         t.lineComment = null;
241 
242         while (1)
243         {
244             t.ptr = p;
245             //printf("p = %p, *p = '%c'\n",p,*p);
246             t.loc = loc();
247             switch (*p)
248             {
249             case 0:
250             case 0x1A:
251                 t.value = TOK.endOfFile; // end of file
252                 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
253                 return;
254             case ' ':
255                 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
256                 while ((cast(size_t)p) % uint.sizeof)
257                 {
258                     if (*p != ' ')
259                         goto LendSkipFourSpaces;
260                     p++;
261                 }
262                 while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
263                     p += 4;
264                 // Skip over any remaining space on the line.
265                 while (*p == ' ')
266                     p++;
267             LendSkipFourSpaces:
268                 version (DMDLIB)
269                 {
270                     if (whitespaceToken)
271                     {
272                         t.value = TOK.whitespace;
273                         return;
274                     }
275                 }
276                 continue; // skip white space
277             case '\t':
278             case '\v':
279             case '\f':
280                 p++;
281                 version (DMDLIB)
282                 {
283                     if (whitespaceToken)
284                     {
285                         t.value = TOK.whitespace;
286                         return;
287                     }
288                 }
289                 continue; // skip white space
290             case '\r':
291                 p++;
292                 if (*p != '\n') // if CR stands by itself
293                 {
294                     endOfLine();
295                     if (tokenizeNewlines)
296                     {
297                         t.value = TOK.endOfLine;
298                         tokenizeNewlines = false;
299                         return;
300                     }
301                 }
302                 version (DMDLIB)
303                 {
304                     if (whitespaceToken)
305                     {
306                         t.value = TOK.whitespace;
307                         return;
308                     }
309                 }
310                 continue; // skip white space
311             case '\n':
312                 p++;
313                 endOfLine();
314                 if (tokenizeNewlines)
315                 {
316                     t.value = TOK.endOfLine;
317                     tokenizeNewlines = false;
318                     return;
319                 }
320                 version (DMDLIB)
321                 {
322                     if (whitespaceToken)
323                     {
324                         t.value = TOK.whitespace;
325                         return;
326                     }
327                 }
328                 continue; // skip white space
329             case '0':
330                 if (!isZeroSecond(p[1]))        // if numeric literal does not continue
331                 {
332                     ++p;
333                     t.unsvalue = 0;
334                     t.value = TOK.int32Literal;
335                     return;
336                 }
337                 goto Lnumber;
338 
339             case '1': .. case '9':
340                 if (!isDigitSecond(p[1]))       // if numeric literal does not continue
341                 {
342                     t.unsvalue = *p - '0';
343                     ++p;
344                     t.value = TOK.int32Literal;
345                     return;
346                 }
347             Lnumber:
348                 t.value = number(t);
349                 return;
350 
351             case '\'':
352                 if (issinglechar(p[1]) && p[2] == '\'')
353                 {
354                     t.unsvalue = p[1];        // simple one character literal
355                     t.value = TOK.charLiteral;
356                     p += 3;
357                 }
358                 else if (Ccompile)
359                 {
360                     clexerCharConstant(*t, 0);
361                 }
362                 else
363                 {
364                     t.value = charConstant(t);
365                 }
366                 return;
367 
368             case 'u':
369             case 'U':
370             case 'L':
371                 if (!Ccompile)
372                     goto case_ident;
373                 if (p[1] == '\'')       // C wide character constant
374                 {
375                     char c = *p;
376                     if (c == 'L')       // convert L to u or U
377                         c = (wchar_tsize == 4) ? 'u' : 'U';
378                     ++p;
379                     clexerCharConstant(*t, c);
380                     return;
381                 }
382                 else if (p[1] == '\"')  // C wide string literal
383                 {
384                     const c = *p;
385                     ++p;
386                     escapeStringConstant(t);
387                     t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
388                                 c == 'u' ? 'w' :
389                                 'd';
390                     return;
391                 }
392                 else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
393                 {
394                     p += 2;
395                     escapeStringConstant(t);
396                     return;
397                 }
398                 goto case_ident;
399 
400             case 'r':
401                 if (Ccompile || p[1] != '"')
402                     goto case_ident;
403                 p++;
404                 goto case '`';
405             case '`':
406                 if (Ccompile)
407                     goto default;
408                 wysiwygStringConstant(t);
409                 return;
410             case 'q':
411                 if (Ccompile)
412                     goto case_ident;
413                 if (p[1] == '"')
414                 {
415                     p++;
416                     delimitedStringConstant(t);
417                     return;
418                 }
419                 else if (p[1] == '{')
420                 {
421                     p++;
422                     tokenStringConstant(t);
423                     return;
424                 }
425                 else
426                     goto case_ident;
427             case '"':
428                 escapeStringConstant(t);
429                 return;
430             case 'a':
431             case 'b':
432             case 'c':
433             case 'd':
434             case 'e':
435             case 'f':
436             case 'g':
437             case 'h':
438             case 'i':
439             case 'j':
440             case 'k':
441             case 'l':
442             case 'm':
443             case 'n':
444             case 'o':
445             case 'p':
446                 /*case 'q': case 'r':*/
447             case 's':
448             case 't':
449             //case 'u':
450             case 'v':
451             case 'w':
452             case 'x':
453             case 'y':
454             case 'z':
455             case 'A':
456             case 'B':
457             case 'C':
458             case 'D':
459             case 'E':
460             case 'F':
461             case 'G':
462             case 'H':
463             case 'I':
464             case 'J':
465             case 'K':
466             //case 'L':
467             case 'M':
468             case 'N':
469             case 'O':
470             case 'P':
471             case 'Q':
472             case 'R':
473             case 'S':
474             case 'T':
475             //case 'U':
476             case 'V':
477             case 'W':
478             case 'X':
479             case 'Y':
480             case 'Z':
481             case '_':
482             case_ident:
483                 {
484                     while (1)
485                     {
486                         const c = *++p;
487                         if (isidchar(c))
488                             continue;
489                         else if (c & 0x80)
490                         {
491                             const s = p;
492                             const u = decodeUTF();
493                             if (isUniAlpha(u))
494                                 continue;
495                             error("char 0x%04x not allowed in identifier", u);
496                             p = s;
497                         }
498                         break;
499                     }
500                     Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
501                     t.ident = id;
502                     t.value = cast(TOK)id.getValue();
503 
504                     anyToken = 1;
505 
506                     /* Different keywords for C and D
507                      */
508                     if (Ccompile)
509                     {
510                         if (t.value != TOK.identifier)
511                         {
512                             t.value = Ckeywords[t.value];  // filter out D keywords
513                         }
514                     }
515                     else if (t.value >= FirstCKeyword)
516                         t.value = TOK.identifier;       // filter out C keywords
517 
518                     else if (*t.ptr == '_') // if special identifier token
519                     {
520                         // Lazy initialization
521                         TimeStampInfo.initialize(t.loc);
522 
523                         if (id == Id.DATE)
524                         {
525                             t.ustring = TimeStampInfo.date.ptr;
526                             goto Lstr;
527                         }
528                         else if (id == Id.TIME)
529                         {
530                             t.ustring = TimeStampInfo.time.ptr;
531                             goto Lstr;
532                         }
533                         else if (id == Id.VENDOR)
534                         {
535                             t.ustring = global.vendor.xarraydup.ptr;
536                             goto Lstr;
537                         }
538                         else if (id == Id.TIMESTAMP)
539                         {
540                             t.ustring = TimeStampInfo.timestamp.ptr;
541                         Lstr:
542                             t.value = TOK.string_;
543                             t.postfix = 0;
544                             t.len = cast(uint)strlen(t.ustring);
545                         }
546                         else if (id == Id.VERSIONX)
547                         {
548                             t.value = TOK.int64Literal;
549                             t.unsvalue = global.versionNumber();
550                         }
551                         else if (id == Id.EOFX)
552                         {
553                             t.value = TOK.endOfFile;
554                             // Advance scanner to end of file
555                             while (!(*p == 0 || *p == 0x1A))
556                                 p++;
557                         }
558                     }
559                     //printf("t.value = %d\n",t.value);
560                     return;
561                 }
562             case '/':
563                 p++;
564                 switch (*p)
565                 {
566                 case '=':
567                     p++;
568                     t.value = TOK.divAssign;
569                     return;
570                 case '*':
571                     p++;
572                     startLoc = loc();
573                     while (1)
574                     {
575                         while (1)
576                         {
577                             const c = *p;
578                             switch (c)
579                             {
580                             case '/':
581                                 break;
582                             case '\n':
583                                 endOfLine();
584                                 p++;
585                                 continue;
586                             case '\r':
587                                 p++;
588                                 if (*p != '\n')
589                                     endOfLine();
590                                 continue;
591                             case 0:
592                             case 0x1A:
593                                 error("unterminated /* */ comment");
594                                 p = end;
595                                 t.loc = loc();
596                                 t.value = TOK.endOfFile;
597                                 return;
598                             default:
599                                 if (c & 0x80)
600                                 {
601                                     const u = decodeUTF();
602                                     if (u == PS || u == LS)
603                                         endOfLine();
604                                 }
605                                 p++;
606                                 continue;
607                             }
608                             break;
609                         }
610                         p++;
611                         if (p[-2] == '*' && p - 3 != t.ptr)
612                             break;
613                     }
614                     if (commentToken)
615                     {
616                         t.loc = startLoc;
617                         t.value = TOK.comment;
618                         return;
619                     }
620                     else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
621                     {
622                         // if /** but not /**/
623                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
624                         lastDocLine = scanloc.linnum;
625                     }
626                     continue;
627                 case '/': // do // style comments
628                     startLoc = loc();
629                     while (1)
630                     {
631                         const c = *++p;
632                         switch (c)
633                         {
634                         case '\n':
635                             break;
636                         case '\r':
637                             if (p[1] == '\n')
638                                 p++;
639                             break;
640                         case 0:
641                         case 0x1A:
642                             if (commentToken)
643                             {
644                                 p = end;
645                                 t.loc = startLoc;
646                                 t.value = TOK.comment;
647                                 return;
648                             }
649                             if (doDocComment && t.ptr[2] == '/')
650                             {
651                                 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
652                                 lastDocLine = scanloc.linnum;
653                             }
654                             p = end;
655                             t.loc = loc();
656                             t.value = TOK.endOfFile;
657                             return;
658                         default:
659                             if (c & 0x80)
660                             {
661                                 const u = decodeUTF();
662                                 if (u == PS || u == LS)
663                                     break;
664                             }
665                             continue;
666                         }
667                         break;
668                     }
669                     if (commentToken)
670                     {
671                         version (DMDLIB) {}
672                         else
673                         {
674                             p++;
675                             endOfLine();
676                         }
677                         t.loc = startLoc;
678                         t.value = TOK.comment;
679                         return;
680                     }
681                     if (doDocComment && t.ptr[2] == '/')
682                     {
683                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
684                         lastDocLine = scanloc.linnum;
685                     }
686                     p++;
687                     endOfLine();
688                     continue;
689                 case '+':
690                     if (!Ccompile)
691                     {
692                         int nest;
693                         startLoc = loc();
694                         p++;
695                         nest = 1;
696                         while (1)
697                         {
698                             char c = *p;
699                             switch (c)
700                             {
701                             case '/':
702                                 p++;
703                                 if (*p == '+')
704                                 {
705                                     p++;
706                                     nest++;
707                                 }
708                                 continue;
709                             case '+':
710                                 p++;
711                                 if (*p == '/')
712                                 {
713                                     p++;
714                                     if (--nest == 0)
715                                         break;
716                                 }
717                                 continue;
718                             case '\r':
719                                 p++;
720                                 if (*p != '\n')
721                                     endOfLine();
722                                 continue;
723                             case '\n':
724                                 endOfLine();
725                                 p++;
726                                 continue;
727                             case 0:
728                             case 0x1A:
729                                 error("unterminated /+ +/ comment");
730                                 p = end;
731                                 t.loc = loc();
732                                 t.value = TOK.endOfFile;
733                                 return;
734                             default:
735                                 if (c & 0x80)
736                                 {
737                                     uint u = decodeUTF();
738                                     if (u == PS || u == LS)
739                                         endOfLine();
740                                 }
741                                 p++;
742                                 continue;
743                             }
744                             break;
745                         }
746                         if (commentToken)
747                         {
748                             t.loc = startLoc;
749                             t.value = TOK.comment;
750                             return;
751                         }
752                         if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
753                         {
754                             // if /++ but not /++/
755                             getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
756                             lastDocLine = scanloc.linnum;
757                         }
758                         continue;
759                     }
760                     break;
761                 default:
762                     break;
763                 }
764                 t.value = TOK.div;
765                 return;
766             case '.':
767                 p++;
768                 if (isdigit(*p))
769                 {
770                     /* Note that we don't allow ._1 and ._ as being
771                      * valid floating point numbers.
772                      */
773                     p--;
774                     t.value = inreal(t);
775                 }
776                 else if (p[0] == '.')
777                 {
778                     if (p[1] == '.')
779                     {
780                         p += 2;
781                         t.value = TOK.dotDotDot;
782                     }
783                     else
784                     {
785                         p++;
786                         t.value = TOK.slice;
787                     }
788                 }
789                 else
790                     t.value = TOK.dot;
791                 return;
792             case '&':
793                 p++;
794                 if (*p == '=')
795                 {
796                     p++;
797                     t.value = TOK.andAssign;
798                 }
799                 else if (*p == '&')
800                 {
801                     p++;
802                     t.value = TOK.andAnd;
803                 }
804                 else
805                     t.value = TOK.and;
806                 return;
807             case '|':
808                 p++;
809                 if (*p == '=')
810                 {
811                     p++;
812                     t.value = TOK.orAssign;
813                 }
814                 else if (*p == '|')
815                 {
816                     p++;
817                     t.value = TOK.orOr;
818                 }
819                 else
820                     t.value = TOK.or;
821                 return;
822             case '-':
823                 p++;
824                 if (*p == '=')
825                 {
826                     p++;
827                     t.value = TOK.minAssign;
828                 }
829                 else if (*p == '-')
830                 {
831                     p++;
832                     t.value = TOK.minusMinus;
833                 }
834                 else if (*p == '>')
835                 {
836                     ++p;
837                     t.value = TOK.arrow;
838                 }
839                 else
840                     t.value = TOK.min;
841                 return;
842             case '+':
843                 p++;
844                 if (*p == '=')
845                 {
846                     p++;
847                     t.value = TOK.addAssign;
848                 }
849                 else if (*p == '+')
850                 {
851                     p++;
852                     t.value = TOK.plusPlus;
853                 }
854                 else
855                     t.value = TOK.add;
856                 return;
857             case '<':
858                 p++;
859                 if (*p == '=')
860                 {
861                     p++;
862                     t.value = TOK.lessOrEqual; // <=
863                 }
864                 else if (*p == '<')
865                 {
866                     p++;
867                     if (*p == '=')
868                     {
869                         p++;
870                         t.value = TOK.leftShiftAssign; // <<=
871                     }
872                     else
873                         t.value = TOK.leftShift; // <<
874                 }
875                 else if (*p == ':' && Ccompile)
876                 {
877                     ++p;
878                     t.value = TOK.leftBracket;  // <:
879                 }
880                 else if (*p == '%' && Ccompile)
881                 {
882                     ++p;
883                     t.value = TOK.leftCurly;    // <%
884                 }
885                 else
886                     t.value = TOK.lessThan; // <
887                 return;
888             case '>':
889                 p++;
890                 if (*p == '=')
891                 {
892                     p++;
893                     t.value = TOK.greaterOrEqual; // >=
894                 }
895                 else if (*p == '>')
896                 {
897                     p++;
898                     if (*p == '=')
899                     {
900                         p++;
901                         t.value = TOK.rightShiftAssign; // >>=
902                     }
903                     else if (*p == '>')
904                     {
905                         p++;
906                         if (*p == '=')
907                         {
908                             p++;
909                             t.value = TOK.unsignedRightShiftAssign; // >>>=
910                         }
911                         else
912                             t.value = TOK.unsignedRightShift; // >>>
913                     }
914                     else
915                         t.value = TOK.rightShift; // >>
916                 }
917                 else
918                     t.value = TOK.greaterThan; // >
919                 return;
920             case '!':
921                 p++;
922                 if (*p == '=')
923                 {
924                     p++;
925                     t.value = TOK.notEqual; // !=
926                 }
927                 else
928                     t.value = TOK.not; // !
929                 return;
930             case '=':
931                 p++;
932                 if (*p == '=')
933                 {
934                     p++;
935                     t.value = TOK.equal; // ==
936                 }
937                 else if (*p == '>')
938                 {
939                     p++;
940                     t.value = TOK.goesTo; // =>
941                 }
942                 else
943                     t.value = TOK.assign; // =
944                 return;
945             case '~':
946                 p++;
947                 if (*p == '=')
948                 {
949                     p++;
950                     t.value = TOK.concatenateAssign; // ~=
951                 }
952                 else
953                     t.value = TOK.tilde; // ~
954                 return;
955             case '^':
956                 p++;
957                 if (*p == '^')
958                 {
959                     p++;
960                     if (*p == '=')
961                     {
962                         p++;
963                         t.value = TOK.powAssign; // ^^=
964                     }
965                     else
966                         t.value = TOK.pow; // ^^
967                 }
968                 else if (*p == '=')
969                 {
970                     p++;
971                     t.value = TOK.xorAssign; // ^=
972                 }
973                 else
974                     t.value = TOK.xor; // ^
975                 return;
976             case '(':
977                 p++;
978                 t.value = TOK.leftParenthesis;
979                 return;
980             case ')':
981                 p++;
982                 t.value = TOK.rightParenthesis;
983                 return;
984             case '[':
985                 p++;
986                 t.value = TOK.leftBracket;
987                 return;
988             case ']':
989                 p++;
990                 t.value = TOK.rightBracket;
991                 return;
992             case '{':
993                 p++;
994                 t.value = TOK.leftCurly;
995                 return;
996             case '}':
997                 p++;
998                 t.value = TOK.rightCurly;
999                 return;
1000             case '?':
1001                 p++;
1002                 t.value = TOK.question;
1003                 return;
1004             case ',':
1005                 p++;
1006                 t.value = TOK.comma;
1007                 return;
1008             case ';':
1009                 p++;
1010                 t.value = TOK.semicolon;
1011                 return;
1012             case ':':
1013                 p++;
1014                 if (*p == ':')
1015                 {
1016                     ++p;
1017                     t.value = TOK.colonColon;
1018                 }
1019                 else if (*p == '>' && Ccompile)
1020                 {
1021                     ++p;
1022                     t.value = TOK.rightBracket;
1023                 }
1024                 else
1025                     t.value = TOK.colon;
1026                 return;
1027             case '$':
1028                 p++;
1029                 t.value = TOK.dollar;
1030                 return;
1031             case '@':
1032                 p++;
1033                 t.value = TOK.at;
1034                 return;
1035             case '*':
1036                 p++;
1037                 if (*p == '=')
1038                 {
1039                     p++;
1040                     t.value = TOK.mulAssign;
1041                 }
1042                 else
1043                     t.value = TOK.mul;
1044                 return;
1045             case '%':
1046                 p++;
1047                 if (*p == '=')
1048                 {
1049                     p++;
1050                     t.value = TOK.modAssign;
1051                 }
1052                 else if (*p == '>' && Ccompile)
1053                 {
1054                     ++p;
1055                     t.value = TOK.rightCurly;
1056                 }
1057                 else if (*p == ':' && Ccompile)
1058                 {
1059                     goto case '#';      // %: means #
1060                 }
1061                 else
1062                     t.value = TOK.mod;
1063                 return;
1064             case '#':
1065                 {
1066                     // https://issues.dlang.org/show_bug.cgi?id=22825
1067                     // Special token sequences are terminated by newlines,
1068                     // and should not be skipped over.
1069                     this.tokenizeNewlines = true;
1070                     p++;
1071                     if (parseSpecialTokenSequence())
1072                         continue;
1073                     t.value = TOK.pound;
1074                     return;
1075                 }
1076             default:
1077                 {
1078                     dchar c = *p;
1079                     if (c & 0x80)
1080                     {
1081                         c = decodeUTF();
1082                         // Check for start of unicode identifier
1083                         if (isUniAlpha(c))
1084                             goto case_ident;
1085                         if (c == PS || c == LS)
1086                         {
1087                             endOfLine();
1088                             p++;
1089                             if (tokenizeNewlines)
1090                             {
1091                                 t.value = TOK.endOfLine;
1092                                 tokenizeNewlines = false;
1093                                 return;
1094                             }
1095                             continue;
1096                         }
1097                     }
1098                     if (c < 0x80 && isprint(c))
1099                         error("character '%c' is not a valid token", c);
1100                     else
1101                         error("character 0x%02x is not a valid token", c);
1102                     p++;
1103                     continue;
1104                 }
1105             }
1106         }
1107     }
1108 
peek(Token * ct)1109     final Token* peek(Token* ct)
1110     {
1111         Token* t;
1112         if (ct.next)
1113             t = ct.next;
1114         else
1115         {
1116             t = allocateToken();
1117             scan(t);
1118             ct.next = t;
1119         }
1120         return t;
1121     }
1122 
1123     /*********************************
1124      * tk is on the opening (.
1125      * Look ahead and return token that is past the closing ).
1126      */
peekPastParen(Token * tk)1127     final Token* peekPastParen(Token* tk)
1128     {
1129         //printf("peekPastParen()\n");
1130         int parens = 1;
1131         int curlynest = 0;
1132         while (1)
1133         {
1134             tk = peek(tk);
1135             //tk.print();
1136             switch (tk.value)
1137             {
1138             case TOK.leftParenthesis:
1139                 parens++;
1140                 continue;
1141             case TOK.rightParenthesis:
1142                 --parens;
1143                 if (parens)
1144                     continue;
1145                 tk = peek(tk);
1146                 break;
1147             case TOK.leftCurly:
1148                 curlynest++;
1149                 continue;
1150             case TOK.rightCurly:
1151                 if (--curlynest >= 0)
1152                     continue;
1153                 break;
1154             case TOK.semicolon:
1155                 if (curlynest)
1156                     continue;
1157                 break;
1158             case TOK.endOfFile:
1159                 break;
1160             default:
1161                 continue;
1162             }
1163             return tk;
1164         }
1165     }
1166 
1167     /*******************************************
1168      * Parse escape sequence.
1169      */
escapeSequence()1170     private uint escapeSequence()
1171     {
1172         return Lexer.escapeSequence(token.loc, p, Ccompile);
1173     }
1174 
1175     /********
1176      * Parse the given string literal escape sequence into a single character.
1177      * D https://dlang.org/spec/lex.html#escape_sequences
1178      * C11 6.4.4.4
1179      * Params:
1180      *  loc = location to use for error messages
1181      *  sequence = pointer to string with escape sequence to parse. Updated to
1182      *             point past the end of the escape sequence
1183      *  Ccompile = true for compile C11 escape sequences
1184      * Returns:
1185      *  the escape sequence as a single character
1186      */
escapeSequence(const ref Loc loc,ref const (char)* sequence,bool Ccompile)1187     private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile)
1188     {
1189         const(char)* p = sequence; // cache sequence reference on stack
1190         scope(exit) sequence = p;
1191 
1192         uint c = *p;
1193         int ndigits;
1194         switch (c)
1195         {
1196         case '\'':
1197         case '"':
1198         case '?':
1199         case '\\':
1200         Lconsume:
1201             p++;
1202             break;
1203         case 'a':
1204             c = 7;
1205             goto Lconsume;
1206         case 'b':
1207             c = 8;
1208             goto Lconsume;
1209         case 'f':
1210             c = 12;
1211             goto Lconsume;
1212         case 'n':
1213             c = 10;
1214             goto Lconsume;
1215         case 'r':
1216             c = 13;
1217             goto Lconsume;
1218         case 't':
1219             c = 9;
1220             goto Lconsume;
1221         case 'v':
1222             c = 11;
1223             goto Lconsume;
1224         case 'u':
1225             ndigits = 4;
1226             goto Lhex;
1227         case 'U':
1228             ndigits = 8;
1229             goto Lhex;
1230         case 'x':
1231             ndigits = 2;
1232         Lhex:
1233             p++;
1234             c = *p;
1235             if (ishex(cast(char)c))
1236             {
1237                 uint v = 0;
1238                 int n = 0;
1239                 if (Ccompile && ndigits == 2)
1240                 {
1241                     /* C11 6.4.4.4-7 one to infinity hex digits
1242                      */
1243                     do
1244                     {
1245                         if (isdigit(cast(char)c))
1246                             c -= '0';
1247                         else if (islower(c))
1248                             c -= 'a' - 10;
1249                         else
1250                             c -= 'A' - 10;
1251                         v = v * 16 + c;
1252                         c = *++p;
1253                     } while (ishex(cast(char)c));
1254                 }
1255                 else
1256                 {
1257                     while (1)
1258                     {
1259                         if (isdigit(cast(char)c))
1260                             c -= '0';
1261                         else if (islower(c))
1262                             c -= 'a' - 10;
1263                         else
1264                             c -= 'A' - 10;
1265                         v = v * 16 + c;
1266                         c = *++p;
1267                         if (++n == ndigits)
1268                             break;
1269                         if (!ishex(cast(char)c))
1270                         {
1271                             .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
1272                             break;
1273                         }
1274                     }
1275                     if (ndigits != 2 && !utf_isValidDchar(v))
1276                     {
1277                         .error(loc, "invalid UTF character \\U%08x", v);
1278                         v = '?'; // recover with valid UTF character
1279                     }
1280                 }
1281                 c = v;
1282             }
1283             else
1284             {
1285                 .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
1286                 p++;
1287             }
1288             break;
1289         case '&':
1290             if (Ccompile)
1291                 goto default;
1292 
1293             // named character entity
1294             for (const idstart = ++p; 1; p++)
1295             {
1296                 switch (*p)
1297                 {
1298                 case ';':
1299                     c = HtmlNamedEntity(idstart, p - idstart);
1300                     if (c == ~0)
1301                     {
1302                         .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
1303                         c = '?';
1304                     }
1305                     p++;
1306                     break;
1307                 default:
1308                     if (isalpha(*p) || (p != idstart && isdigit(*p)))
1309                         continue;
1310                     .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
1311                     c = '?';
1312                     break;
1313                 }
1314                 break;
1315             }
1316             break;
1317         case 0:
1318         case 0x1A:
1319             // end of file
1320             c = '\\';
1321             break;
1322         default:
1323             if (isoctal(cast(char)c))
1324             {
1325                 uint v = 0;
1326                 int n = 0;
1327                 do
1328                 {
1329                     v = v * 8 + (c - '0');
1330                     c = *++p;
1331                 }
1332                 while (++n < 3 && isoctal(cast(char)c));
1333                 c = v;
1334                 if (c > 0xFF)
1335                     .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
1336             }
1337             else
1338             {
1339                 .error(loc, "undefined escape sequence \\%c", c);
1340                 p++;
1341             }
1342             break;
1343         }
1344         return c;
1345     }
1346 
1347     /**
1348     Lex a wysiwyg string. `p` must be pointing to the first character before the
1349     contents of the string literal. The character pointed to by `p` will be used as
1350     the terminating character (i.e. backtick or double-quote).
1351     Params:
1352         result = pointer to the token that accepts the result
1353     */
wysiwygStringConstant(Token * result)1354     private void wysiwygStringConstant(Token* result)
1355     {
1356         result.value = TOK.string_;
1357         Loc start = loc();
1358         auto terminator = p[0];
1359         p++;
1360         stringbuffer.setsize(0);
1361         while (1)
1362         {
1363             dchar c = p[0];
1364             p++;
1365             switch (c)
1366             {
1367             case '\n':
1368                 endOfLine();
1369                 break;
1370             case '\r':
1371                 if (p[0] == '\n')
1372                     continue; // ignore
1373                 c = '\n'; // treat EndOfLine as \n character
1374                 endOfLine();
1375                 break;
1376             case 0:
1377             case 0x1A:
1378                 error("unterminated string constant starting at %s", start.toChars());
1379                 result.setString();
1380                 // rewind `p` so it points to the EOF character
1381                 p--;
1382                 return;
1383             default:
1384                 if (c == terminator)
1385                 {
1386                     result.setString(stringbuffer);
1387                     stringPostfix(result);
1388                     return;
1389                 }
1390                 else if (c & 0x80)
1391                 {
1392                     p--;
1393                     const u = decodeUTF();
1394                     p++;
1395                     if (u == PS || u == LS)
1396                         endOfLine();
1397                     stringbuffer.writeUTF8(u);
1398                     continue;
1399                 }
1400                 break;
1401             }
1402             stringbuffer.writeByte(c);
1403         }
1404     }
1405 
1406     /**
1407     Lex a delimited string. Some examples of delimited strings are:
1408     ---
1409     q"(foo(xxx))"      // "foo(xxx)"
1410     q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
1411     q"/foo]/"          // "foo]"
1412     q"HERE
1413     foo
1414     HERE"              // "foo\n"
1415     ---
1416     It is assumed that `p` points to the opening double-quote '"'.
1417     Params:
1418         result = pointer to the token that accepts the result
1419     */
delimitedStringConstant(Token * result)1420     private void delimitedStringConstant(Token* result)
1421     {
1422         result.value = TOK.string_;
1423         Loc start = loc();
1424         dchar delimleft = 0;
1425         dchar delimright = 0;
1426         uint nest = 1;
1427         uint nestcount = ~0; // dead assignment, needed to suppress warning
1428         Identifier hereid = null;
1429         uint blankrol = 0;
1430         uint startline = 0;
1431         p++;
1432         stringbuffer.setsize(0);
1433         while (1)
1434         {
1435             dchar c = *p++;
1436             //printf("c = '%c'\n", c);
1437             switch (c)
1438             {
1439             case '\n':
1440             Lnextline:
1441                 endOfLine();
1442                 startline = 1;
1443                 if (blankrol)
1444                 {
1445                     blankrol = 0;
1446                     continue;
1447                 }
1448                 if (hereid)
1449                 {
1450                     stringbuffer.writeUTF8(c);
1451                     continue;
1452                 }
1453                 break;
1454             case '\r':
1455                 if (*p == '\n')
1456                     continue; // ignore
1457                 c = '\n'; // treat EndOfLine as \n character
1458                 goto Lnextline;
1459             case 0:
1460             case 0x1A:
1461                 error("unterminated delimited string constant starting at %s", start.toChars());
1462                 result.setString();
1463                 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1464                 p--;
1465                 return;
1466             default:
1467                 if (c & 0x80)
1468                 {
1469                     p--;
1470                     c = decodeUTF();
1471                     p++;
1472                     if (c == PS || c == LS)
1473                         goto Lnextline;
1474                 }
1475                 break;
1476             }
1477             if (delimleft == 0)
1478             {
1479                 delimleft = c;
1480                 nest = 1;
1481                 nestcount = 1;
1482                 if (c == '(')
1483                     delimright = ')';
1484                 else if (c == '{')
1485                     delimright = '}';
1486                 else if (c == '[')
1487                     delimright = ']';
1488                 else if (c == '<')
1489                     delimright = '>';
1490                 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1491                 {
1492                     // Start of identifier; must be a heredoc
1493                     Token tok;
1494                     p--;
1495                     scan(&tok); // read in heredoc identifier
1496                     if (tok.value != TOK.identifier)
1497                     {
1498                         error("identifier expected for heredoc, not %s", tok.toChars());
1499                         delimright = c;
1500                     }
1501                     else
1502                     {
1503                         hereid = tok.ident;
1504                         //printf("hereid = '%s'\n", hereid.toChars());
1505                         blankrol = 1;
1506                     }
1507                     nest = 0;
1508                 }
1509                 else
1510                 {
1511                     delimright = c;
1512                     nest = 0;
1513                     if (isspace(c))
1514                         error("delimiter cannot be whitespace");
1515                 }
1516             }
1517             else
1518             {
1519                 if (blankrol)
1520                 {
1521                     error("heredoc rest of line should be blank");
1522                     blankrol = 0;
1523                     continue;
1524                 }
1525                 if (nest == 1)
1526                 {
1527                     if (c == delimleft)
1528                         nestcount++;
1529                     else if (c == delimright)
1530                     {
1531                         nestcount--;
1532                         if (nestcount == 0)
1533                             goto Ldone;
1534                     }
1535                 }
1536                 else if (c == delimright)
1537                     goto Ldone;
1538                 if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
1539                 {
1540                     Token tok;
1541                     auto psave = p;
1542                     p--;
1543                     scan(&tok); // read in possible heredoc identifier
1544                     //printf("endid = '%s'\n", tok.ident.toChars());
1545                     if (tok.value == TOK.identifier && tok.ident is hereid)
1546                     {
1547                         /* should check that rest of line is blank
1548                          */
1549                         goto Ldone;
1550                     }
1551                     p = psave;
1552                 }
1553                 stringbuffer.writeUTF8(c);
1554                 startline = 0;
1555             }
1556         }
1557     Ldone:
1558         if (*p == '"')
1559             p++;
1560         else if (hereid)
1561             error("delimited string must end in `%s\"`", hereid.toChars());
1562         else if (isspace(delimright))
1563             error("delimited string must end in `\"`");
1564         else
1565             error("delimited string must end in `%c\"`", delimright);
1566         result.setString(stringbuffer);
1567         stringPostfix(result);
1568     }
1569 
1570     /**
1571     Lex a token string. Some examples of token strings are:
1572     ---
1573     q{ foo(xxx) }    // " foo(xxx) "
1574     q{foo$(LPAREN)}  // "foo$(LPAREN)"
1575     q{{foo}"}"}      // "{foo}"}""
1576     ---
1577     It is assumed that `p` points to the opening curly-brace.
1578     Params:
1579         result = pointer to the token that accepts the result
1580     */
tokenStringConstant(Token * result)1581     private void tokenStringConstant(Token* result)
1582     {
1583         result.value = TOK.string_;
1584 
1585         uint nest = 1;
1586         const start = loc();
1587         const pstart = ++p;
1588         inTokenStringConstant++;
1589         scope(exit) inTokenStringConstant--;
1590         while (1)
1591         {
1592             Token tok;
1593             scan(&tok);
1594             switch (tok.value)
1595             {
1596             case TOK.leftCurly:
1597                 nest++;
1598                 continue;
1599             case TOK.rightCurly:
1600                 if (--nest == 0)
1601                 {
1602                     result.setString(pstart, p - 1 - pstart);
1603                     stringPostfix(result);
1604                     return;
1605                 }
1606                 continue;
1607             case TOK.endOfFile:
1608                 error("unterminated token string constant starting at %s", start.toChars());
1609                 result.setString();
1610                 return;
1611             default:
1612                 continue;
1613             }
1614         }
1615     }
1616 
1617     /**
1618     Scan a quoted string while building the processed string value by
1619     handling escape sequences. The result is returned in the given `t` token.
1620     This function assumes that `p` currently points to the opening quote
1621     of the string.
1622     Params:
1623         t = the token to set the resulting string to
1624     * References:
1625     *   D https://dlang.org/spec/lex.html#double_quoted_strings
1626     *   ImportC C11 6.4.5
1627     */
escapeStringConstant(Token * t)1628     private void escapeStringConstant(Token* t)
1629     {
1630         t.value = TOK.string_;
1631 
1632         const start = loc();
1633         const tc = *p++;        // opening quote
1634         stringbuffer.setsize(0);
1635         while (1)
1636         {
1637             dchar c = *p++;
1638             switch (c)
1639             {
1640             case '\\':
1641                 switch (*p)
1642                 {
1643                 case '&':
1644                     if (Ccompile)
1645                         goto default;
1646                     goto case;
1647 
1648                 case 'u':
1649                 case 'U':
1650                     c = escapeSequence();
1651                     stringbuffer.writeUTF8(c);
1652                     continue;
1653                 default:
1654                     c = escapeSequence();
1655                     break;
1656                 }
1657                 break;
1658             case '\n':
1659                 endOfLine();
1660                 if (Ccompile)
1661                     goto Lunterminated;
1662                 break;
1663             case '\r':
1664                 if (*p == '\n')
1665                     continue; // ignore
1666                 c = '\n'; // treat EndOfLine as \n character
1667                 endOfLine();
1668                 if (Ccompile)
1669                     goto Lunterminated;
1670                 break;
1671             case '\'':
1672             case '"':
1673                 if (c != tc)
1674                     goto default;
1675                 t.setString(stringbuffer);
1676                 if (!Ccompile)
1677                     stringPostfix(t);
1678                 return;
1679             case 0:
1680             case 0x1A:
1681                 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1682                 p--;
1683             Lunterminated:
1684                 error("unterminated string constant starting at %s", start.toChars());
1685                 t.setString();
1686                 return;
1687             default:
1688                 if (c & 0x80)
1689                 {
1690                     p--;
1691                     c = decodeUTF();
1692                     if (c == LS || c == PS)
1693                     {
1694                         c = '\n';
1695                         endOfLine();
1696                         if (Ccompile)
1697                             goto Lunterminated;
1698                     }
1699                     p++;
1700                     stringbuffer.writeUTF8(c);
1701                     continue;
1702                 }
1703                 break;
1704             }
1705             stringbuffer.writeByte(c);
1706         }
1707     }
1708 
1709     /**************************************
1710      * Reference:
1711      *    https://dlang.org/spec/lex.html#characterliteral
1712      */
charConstant(Token * t)1713     private TOK charConstant(Token* t)
1714     {
1715         TOK tk = TOK.charLiteral;
1716         //printf("Lexer::charConstant\n");
1717         p++;
1718         dchar c = *p++;
1719         switch (c)
1720         {
1721         case '\\':
1722             switch (*p)
1723             {
1724             case 'u':
1725                 t.unsvalue = escapeSequence();
1726                 tk = TOK.wcharLiteral;
1727                 break;
1728             case 'U':
1729             case '&':
1730                 t.unsvalue = escapeSequence();
1731                 tk = TOK.dcharLiteral;
1732                 break;
1733             default:
1734                 t.unsvalue = escapeSequence();
1735                 break;
1736             }
1737             break;
1738         case '\n':
1739         L1:
1740             endOfLine();
1741             goto case;
1742         case '\r':
1743             goto case '\'';
1744         case 0:
1745         case 0x1A:
1746             // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1747             p--;
1748             goto case;
1749         case '\'':
1750             error("unterminated character constant");
1751             t.unsvalue = '?';
1752             return tk;
1753         default:
1754             if (c & 0x80)
1755             {
1756                 p--;
1757                 c = decodeUTF();
1758                 p++;
1759                 if (c == LS || c == PS)
1760                     goto L1;
1761                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1762                     tk = TOK.wcharLiteral;
1763                 else
1764                     tk = TOK.dcharLiteral;
1765             }
1766             t.unsvalue = c;
1767             break;
1768         }
1769         if (*p != '\'')
1770         {
1771             while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
1772                     *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
1773             {
1774                 if (*p & 0x80)
1775                 {
1776                     const s = p;
1777                     c = decodeUTF();
1778                     if (c == LS || c == PS)
1779                     {
1780                         p = s;
1781                         break;
1782                     }
1783                 }
1784                 p++;
1785             }
1786 
1787             if (*p == '\'')
1788             {
1789                 error("character constant has multiple characters");
1790                 p++;
1791             }
1792             else
1793                 error("unterminated character constant");
1794             t.unsvalue = '?';
1795             return tk;
1796         }
1797         p++;
1798         return tk;
1799     }
1800 
1801     /***************************************
1802      * Lex C character constant.
1803      * Parser is on the opening quote.
1804      * Params:
1805      *  t = token to fill in
1806      *  prefix = one of `u`, `U` or 0.
1807      * Reference:
1808      *  C11 6.4.4.4
1809      */
clexerCharConstant(ref Token t,char prefix)1810     private void clexerCharConstant(ref Token t, char prefix)
1811     {
1812         escapeStringConstant(&t);
1813         const(char)[] str = t.ustring[0 .. t.len];
1814         const n = str.length;
1815         const loc = t.loc;
1816         if (n == 0)
1817         {
1818             error(loc, "empty character constant");
1819             t.value = TOK.semicolon;
1820             return;
1821         }
1822 
1823         uint u;
1824         switch (prefix)
1825         {
1826             case 0:
1827                 if (n == 1) // fast case
1828                 {
1829                     u = str[0];
1830                 }
1831                 else if (n > 4)
1832                     error(loc, "max number of chars in character literal is 4, had %d",
1833                         cast(int)n);
1834                 else
1835                 {
1836                     foreach (i, c; str)
1837                         (cast(char*)&u)[n - 1 - i] = c;
1838                 }
1839                 break;
1840 
1841             case 'u':
1842                 dchar d1;
1843                 size_t idx;
1844                 auto msg = utf_decodeChar(str, idx, d1);
1845                 dchar d2 = 0;
1846                 if (idx < n && !msg)
1847                     msg = utf_decodeChar(str, idx, d2);
1848                 if (msg)
1849                     error(loc, "%s", msg);
1850                 else if (idx < n)
1851                     error(loc, "max number of chars in 16 bit character literal is 2, had %d",
1852                         (n + 1) >> 1);
1853                 else if (d1 > 0x1_0000)
1854                     error(loc, "%d does not fit in 16 bits", d1);
1855                 else if (d2 > 0x1_0000)
1856                     error(loc, "%d does not fit in 16 bits", d2);
1857                 u = d1;
1858                 if (d2)
1859                     u = (d1 << 16) | d2;
1860                 break;
1861 
1862             case 'U':
1863                 dchar d;
1864                 size_t idx;
1865                 auto msg = utf_decodeChar(str, idx, d);
1866                 if (msg)
1867                     error(loc, "%s", msg);
1868                 else if (idx < n)
1869                     error(loc, "max number of chars in 32 bit character literal is 1, had %d",
1870                         (n + 3) >> 2);
1871                 u = d;
1872                 break;
1873 
1874             default:
1875                 assert(0);
1876         }
1877         t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
1878         t.unsvalue = u;
1879     }
1880 
1881     /***************************************
1882      * Get postfix of string literal.
1883      */
stringPostfix(Token * t)1884     private void stringPostfix(Token* t) pure @nogc
1885     {
1886         switch (*p)
1887         {
1888         case 'c':
1889         case 'w':
1890         case 'd':
1891             t.postfix = *p;
1892             p++;
1893             break;
1894         default:
1895             t.postfix = 0;
1896             break;
1897         }
1898     }
1899 
    /**************************************
     * Read in a number.
     * If it's an integer, store its value in `t.unsvalue`.
     *      Integers can be binary, octal, decimal or hex.
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's floating point, lexing restarts from the beginning of the
     *      literal in `inreal`, which stores the value in `t.floatvalue`.
     * Returns:
     *      the token kind of the integer literal (e.g. `TOK.int32Literal`), or
     *      whatever `inreal` returns for a floating point literal
     */
    private TOK number(Token* t)
    {
        int base = 10;                  // decimal unless a `0`, `0x`, `0b` or `0_` prefix is found
        const start = p;                // start of the literal, needed to rewind for inreal() and for messages
        uinteger_t n = 0; // unsigned >=64 bit integer type
        int d;                          // value of the digit just read
        bool err = false;               // an error was already reported; suppresses some follow-up errors
        bool overflow = false;          // set by mulu/addu when `n` no longer fits in 64 bits
        bool anyBinaryDigitsNoSingleUS = false; // at least one digit was accumulated (checked for base 2)
        bool anyHexDigitsNoSingleUS = false;    // at least one digit was accumulated (checked for base 16)
        char errorDigit = 0;            // first digit that is not valid in `base`, 0 if none
        dchar c = *p;
        if (c == '0')
        {
            // Leading zero: look at the next character to determine the base,
            // or to detect that this is really a floating point literal.
            ++p;
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                base = 8;
                break;

            case '8':
            case '9':
                // not an octal digit: remember it so it is reported at Ldone
                errorDigit = cast(char) c;
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                if (Ccompile)
                    error("binary constants not allowed");
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
                {
                    if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal;  // if `0.f` or `0.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                if (Ccompile)
                    error("embedded `_` not allowed");
                // `0_` is treated like an octal prefix
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        // Accumulate digits into `n` until a character that cannot be part of an integer is found
        while (1)
        {
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                ++p;
                d = c - '0';
                break;
            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
                ++p;
                if (base != 16)
                {
                    // outside of hex, an exponent marker or `f` suffix means floating point
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                }
                // otherwise treat it as a hex digit; for other bases it fails the `d >= base` check below
                if (c >= 'a')
                    d = c + 10 - 'a';
                else
                    d = c + 10 - 'A';
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                {
                    if (Ccompile && base == 10 &&
                        (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal;  // if `1.e6` or `1.f` or `1.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 2)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal
            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // Floating point (or imaginary) literal: rewind and let inreal() lex it from the start
                p = start;
                return inreal(t);
            case '_':
                if (Ccompile)
                    goto default;
                // digit separator: skipped without counting as a digit
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // got a digit here, set any necessary flags, check for errors
            anyHexDigitsNoSingleUS = true;
            anyBinaryDigitsNoSingleUS = true;
            if (!errorDigit && d >= base)
            {
                errorDigit = cast(char) c;
            }
            // Avoid expensive overflow check if we aren't at risk of overflow
            // (with n <= 0x0FFF_FFFF_FFFF_FFFF, base <= 16 and d <= 15 the result still fits in 64 bits)
            if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
                n = n * base + d;
            else
            {
                import core.checkedint : mulu, addu;

                n = mulu(n, base, overflow);
                n = addu(n, d, overflow);
            }
        }
    Ldone:
        // End of the digits: report problems collected while scanning
        if (errorDigit)
        {
            error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                 base == 8 ? "octal".ptr :
                                                 "decimal".ptr, errorDigit);
            err = true;
        }
        if (overflow && !err)
        {
            error("integer overflow");
            err = true;
        }
        // A `0b` or `0x` prefix that was not followed by any digit
        if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
            (base == 16 && !anyHexDigitsNoSingleUS))
            error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

        t.unsvalue = n;

        // ImportC: suffix handling and typing are done by cnumber()
        if (Ccompile)
            return cnumber(base, n);

        enum FLAGS : int
        {
            none = 0,
            decimal = 1, // decimal
            unsigned = 2, // u or U suffix
            long_ = 4, // L suffix
        }

        FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const psuffix = p;  // start of the suffix, quoted in the octal error message below
        while (1)
        {
            FLAGS f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS.unsigned;
                goto L1;
            case 'l':
                // reported as an error, but still handled as a long suffix
                f = FLAGS.long_;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                goto L1;
            case 'L':
                f = FLAGS.long_;
            L1:
                p++;
                // the same kind of suffix given twice
                if ((flags & f) && !err)
                {
                    error("unrecognized token");
                    err = true;
                }
                flags = cast(FLAGS)(flags | f);
                continue;
            default:
                break;
            }
            break;
        }
        // Octal literals other than 0 .. 7 are rejected
        if (base == 8 && n >= 8)
        {
            if (err)
                // can't translate invalid octal value, just show a generic message
                error("octal literals larger than 7 are no longer supported");
            else
                error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                    n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
        }
        // Choose the literal's type from its base, its suffixes and the magnitude of `n`
        TOK result;
        switch (flags)
        {
        case FLAGS.none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;
            else if (n & 0x80000000)
                result = TOK.uns32Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
            {
                // bit 63 set: typed as ulong here, without an error
                result = TOK.uns64Literal;
            }
            else if (n & 0xFFFFFFFF80000000L)
                result = TOK.int64Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.uns32Literal;
            break;
        case FLAGS.decimal | FLAGS.long_:
            // decimal with `L`: a value with bit 63 set is an error
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOK.uns64Literal;
            }
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.long_:
            // non-decimal with `L`: long if it fits, else ulong
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            result = TOK.uns64Literal;
            break;
        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        return result;
    }
2216 
2217     /**************************************
2218      * Lex C integer-suffix
2219      * Params:
2220      *  base = number base
2221      *  n = raw integer value
2222      * Returns:
2223      *  token value
2224      */
cnumber(int base,uinteger_t n)2225     private TOK cnumber(int base, uinteger_t n)
2226     {
2227         /* C11 6.4.4.1
2228          * Parse trailing suffixes:
2229          *   u or U
2230          *   l or L
2231          *   ll or LL
2232          */
2233         enum FLAGS : uint
2234         {
2235             octalhex = 1, // octal or hexadecimal
2236             decimal  = 2, // decimal
2237             unsigned = 4, // u or U suffix
2238             long_    = 8, // l or L suffix
2239             llong    = 0x10 // ll or LL
2240         }
2241         FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
2242         bool err;
2243     Lsuffixes:
2244         while (1)
2245         {
2246             FLAGS f;
2247             const cs = *p;
2248             switch (cs)
2249             {
2250                 case 'U':
2251                 case 'u':
2252                     f = FLAGS.unsigned;
2253                     break;
2254 
2255                 case 'l':
2256                 case 'L':
2257                     f = FLAGS.long_;
2258                     if (cs == p[1])
2259                     {
2260                         f = FLAGS.long_ | FLAGS.llong;
2261                         ++p;
2262                     }
2263                     break;
2264 
2265                 default:
2266                     break Lsuffixes;
2267             }
2268             ++p;
2269             if ((flags & f) && !err)
2270             {
2271                 error("duplicate integer suffixes");
2272                 err = true;
2273             }
2274             flags = cast(FLAGS)(flags | f);
2275         }
2276 
2277         TOK result = TOK.int32Literal;     // default
2278         switch (flags)
2279         {
2280             /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2281              * this code deviates from C by picking D int, uint, long, or ulong instead
2282              */
2283 
2284             case FLAGS.octalhex:
2285                 /* Octal or Hexadecimal constant.
2286                  * First that fits: int, unsigned, long, unsigned long,
2287                  * long long, unsigned long long
2288                  */
2289                 if (n & 0x8000000000000000L)
2290                     result = TOK.uns64Literal;      // unsigned long
2291                 else if (n & 0xFFFFFFFF00000000L)
2292                     result = TOK.int64Literal;      // long
2293                 else if (n & 0x80000000)
2294                     result = TOK.uns32Literal;
2295                 else
2296                     result = TOK.int32Literal;
2297                 break;
2298 
2299             case FLAGS.decimal:
2300                 /* First that fits: int, long, long long
2301                  */
2302                 if (n & 0x8000000000000000L)
2303                     result = TOK.uns64Literal;      // unsigned long
2304                 else if (n & 0xFFFFFFFF80000000L)
2305                     result = TOK.int64Literal;      // long
2306                 else
2307                     result = TOK.int32Literal;
2308                 break;
2309 
2310             case FLAGS.octalhex | FLAGS.unsigned:
2311             case FLAGS.decimal | FLAGS.unsigned:
2312                 /* First that fits: unsigned, unsigned long, unsigned long long
2313                  */
2314                 if (n & 0xFFFFFFFF00000000L)
2315                     result = TOK.uns64Literal;      // unsigned long
2316                 else
2317                     result = TOK.uns32Literal;
2318                 break;
2319 
2320             case FLAGS.decimal | FLAGS.long_:
2321                 /* First that fits: long, long long
2322                  */
2323                 if (longsize == 4 || long_longsize == 4)
2324                 {
2325                     if (n & 0xFFFFFFFF_80000000L)
2326                         result = TOK.int64Literal;
2327                     else
2328                         result = TOK.int32Literal;  // long
2329                 }
2330                 else
2331                 {
2332                     result = TOK.int64Literal;      // long
2333                 }
2334                 break;
2335 
2336             case FLAGS.octalhex | FLAGS.long_:
2337                 /* First that fits: long, unsigned long, long long,
2338                  * unsigned long long
2339                  */
2340                 if (longsize == 4 || long_longsize == 4)
2341                 {
2342                     if (n & 0x8000000000000000L)
2343                         result = TOK.uns64Literal;
2344                     else if (n & 0xFFFFFFFF00000000L)
2345                         result = TOK.int64Literal;
2346                     else if (n & 0x80000000)
2347                         result = TOK.uns32Literal;      // unsigned long
2348                     else
2349                         result = TOK.int32Literal;      // long
2350                 }
2351                 else
2352                 {
2353                     if (n & 0x80000000_00000000L)
2354                         result = TOK.uns64Literal;      // unsigned long
2355                     else
2356                         result = TOK.int64Literal;      // long
2357                 }
2358                 break;
2359 
2360             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
2361             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.long_:
2362                 /* First that fits: unsigned long, unsigned long long
2363                  */
2364                 if (longsize == 4 || long_longsize == 4)
2365                 {
2366                     if (n & 0xFFFFFFFF00000000L)
2367                         result = TOK.uns64Literal;
2368                     else
2369                         result = TOK.uns32Literal;      // unsigned long
2370                 }
2371                 else
2372                 {
2373                     result = TOK.uns64Literal;  // unsigned long
2374                 }
2375                 break;
2376 
2377             case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
2378                 /* First that fits: long long, unsigned long long
2379                  */
2380                 if (n & 0x8000000000000000L)
2381                     result = TOK.uns64Literal;
2382                 else
2383                     result = TOK.int64Literal;
2384                 break;
2385 
2386             case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
2387                 /* long long
2388                  */
2389                 result = TOK.int64Literal;
2390                 break;
2391 
2392             case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2393             case FLAGS.decimal  | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2394                 result = TOK.uns64Literal;
2395                 break;
2396 
2397             default:
2398                 debug printf("%x\n",flags);
2399                 assert(0);
2400         }
2401         return result;
2402     }
2403 
2404     /**************************************
2405      * Read in characters, converting them to real.
2406      * Bugs:
2407      *      Exponent overflow not detected.
2408      *      Too much requested precision is not detected.
2409      */
inreal(Token * t)2410     private TOK inreal(Token* t)
2411     {
2412         //printf("Lexer::inreal()\n");
2413         debug
2414         {
2415             assert(*p == '.' || isdigit(*p));
2416         }
2417         bool isWellformedString = true;
2418         stringbuffer.setsize(0);
2419         auto pstart = p;
2420         bool hex = false;
2421         dchar c = *p++;
2422         // Leading '0x'
2423         if (c == '0')
2424         {
2425             c = *p++;
2426             if (c == 'x' || c == 'X')
2427             {
2428                 hex = true;
2429                 c = *p++;
2430             }
2431         }
2432         // Digits to left of '.'
2433         while (1)
2434         {
2435             if (c == '.')
2436             {
2437                 c = *p++;
2438                 break;
2439             }
2440             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2441             {
2442                 c = *p++;
2443                 continue;
2444             }
2445             break;
2446         }
2447         // Digits to right of '.'
2448         while (1)
2449         {
2450             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2451             {
2452                 c = *p++;
2453                 continue;
2454             }
2455             break;
2456         }
2457         if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
2458         {
2459             c = *p++;
2460             if (c == '-' || c == '+')
2461             {
2462                 c = *p++;
2463             }
2464             bool anyexp = false;
2465             while (1)
2466             {
2467                 if (isdigit(c))
2468                 {
2469                     anyexp = true;
2470                     c = *p++;
2471                     continue;
2472                 }
2473                 if (c == '_')
2474                 {
2475                     if (Ccompile)
2476                         error("embedded `_` in numeric literals not allowed");
2477                     c = *p++;
2478                     continue;
2479                 }
2480                 if (!anyexp)
2481                 {
2482                     error("missing exponent");
2483                     isWellformedString = false;
2484                 }
2485                 break;
2486             }
2487         }
2488         else if (hex)
2489         {
2490             error("exponent required for hex float");
2491             isWellformedString = false;
2492         }
2493         --p;
2494         while (pstart < p)
2495         {
2496             if (*pstart != '_')
2497                 stringbuffer.writeByte(*pstart);
2498             ++pstart;
2499         }
2500         stringbuffer.writeByte(0);
2501         auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
2502         TOK result;
2503         bool isOutOfRange = false;
2504         t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
2505         switch (*p)
2506         {
2507         case 'F':
2508         case 'f':
2509             if (isWellformedString && !isOutOfRange)
2510                 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
2511             result = TOK.float32Literal;
2512             p++;
2513             break;
2514         default:
2515             if (isWellformedString && !isOutOfRange)
2516                 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
2517             result = TOK.float64Literal;
2518             break;
2519         case 'l':
2520             if (!Ccompile)
2521                 error("use 'L' suffix instead of 'l'");
2522             goto case 'L';
2523         case 'L':
2524             ++p;
2525             if (Ccompile && long_doublesize == 8)
2526                 goto default;
2527             result = TOK.float80Literal;
2528             break;
2529         }
2530         if ((*p == 'i' || *p == 'I') && !Ccompile)
2531         {
2532             if (*p == 'I')
2533                 error("use 'i' suffix instead of 'I'");
2534             p++;
2535             switch (result)
2536             {
2537             case TOK.float32Literal:
2538                 result = TOK.imaginary32Literal;
2539                 break;
2540             case TOK.float64Literal:
2541                 result = TOK.imaginary64Literal;
2542                 break;
2543             case TOK.float80Literal:
2544                 result = TOK.imaginary80Literal;
2545                 break;
2546             default:
2547                 break;
2548             }
2549         }
2550         const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
2551         if (isOutOfRange && !isLong && (!Ccompile || hex))
2552         {
2553             /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2554              */
2555             const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
2556             error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
2557         }
2558         debug
2559         {
2560             switch (result)
2561             {
2562             case TOK.float32Literal:
2563             case TOK.float64Literal:
2564             case TOK.float80Literal:
2565             case TOK.imaginary32Literal:
2566             case TOK.imaginary64Literal:
2567             case TOK.imaginary80Literal:
2568                 break;
2569             default:
2570                 assert(0);
2571             }
2572         }
2573         return result;
2574     }
2575 
loc()2576     final Loc loc() pure @nogc
2577     {
2578         scanloc.charnum = cast(uint)(1 + p - line);
2579         version (LocOffset)
2580             scanloc.fileOffset = cast(uint)(p - base);
2581         return scanloc;
2582     }
2583 
error(const (char)* format,...)2584     final void error(const(char)* format, ...)
2585     {
2586         va_list args;
2587         va_start(args, format);
2588         .verror(token.loc, format, args);
2589         va_end(args);
2590     }
2591 
error(const ref Loc loc,const (char)* format,...)2592     final void error(const ref Loc loc, const(char)* format, ...)
2593     {
2594         va_list args;
2595         va_start(args, format);
2596         .verror(loc, format, args);
2597         va_end(args);
2598     }
2599 
deprecation(const (char)* format,...)2600     final void deprecation(const(char)* format, ...)
2601     {
2602         va_list args;
2603         va_start(args, format);
2604         .vdeprecation(token.loc, format, args);
2605         va_end(args);
2606     }
2607 
2608     /***************************************
2609      * Parse special token sequence:
2610      * Returns:
2611      *  true if the special token sequence was handled
2612      * References:
2613      *  https://dlang.org/spec/lex.html#special-token-sequence
2614      */
parseSpecialTokenSequence()2615     bool parseSpecialTokenSequence()
2616     {
2617         Token n;
2618         scan(&n);
2619         if (n.value == TOK.identifier)
2620         {
2621             if (n.ident == Id.line)
2622             {
2623                 poundLine(n, false);
2624                 return true;
2625             }
2626             else
2627             {
2628                 const locx = loc();
2629                 warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
2630             }
2631         }
2632         else if (n.value == TOK.if_)
2633         {
2634             error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
2635         }
2636         return false;
2637     }
2638 
2639     /*********************************************
2640      * Parse line/file preprocessor directive:
2641      *    #line linnum [filespec]
2642      * Allow __LINE__ for linnum, and __FILE__ for filespec.
2643      * Accept linemarker format:
2644      *    # linnum [filespec] {flags}
2645      * There can be zero or more flags, which are one of the digits 1..4, and
2646      * must be in ascending order. The flags are ignored.
2647      * Params:
2648      *  tok = token we're on, which is linnum of linemarker
2649      *  linemarker = true if line marker format and lexer is on linnum
2650      * References:
2651      *  linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
2652      */
final void poundLine(ref Token tok, bool linemarker)
{
    // Values to install once the whole directive has been validated.
    // Starting from the current line number means `#line __LINE__`
    // leaves the numbering unchanged.
    auto linnum = this.scanloc.linnum;
    const(char)* filespec = null;
    bool flags;                 // true once a linemarker flag has been seen

    // For `#line` the caller is still on the `line` identifier; for a
    // linemarker the caller is already on the number.
    if (!linemarker)
        scan(&tok);
    if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
    {
        // Line numbers are kept as `int`; reject anything that does not
        // survive the narrowing round trip.
        const lin = cast(int)(tok.unsvalue);
        if (lin != tok.unsvalue)
        {
            // The argument is unsigned, so it is printed with `%llu`.
            error(tok.loc, "line number `%llu` out of range", cast(ulong)tok.unsvalue);
            skipToNextLine();
            return;
        }
        else
            linnum = lin;
    }
    else if (tok.value == TOK.line)  // #line __LINE__
    {
    }
    else
    {
        error(tok.loc, "positive integer argument expected following `#line`");
        if (tok.value != TOK.endOfLine)
            skipToNextLine();
        return;
    }

    // Remaining tokens: an optional filespec, then (linemarker only)
    // zero or more flags, then the end of the line.
    while (1)
    {
        scan(&tok);
        switch (tok.value)
        {
        case TOK.endOfFile:
        case TOK.endOfLine:
            // Directive is well formed: commit the new location, unless
            // we are inside a token string constant.
            if (!inTokenStringConstant)
            {
                this.scanloc.linnum = linnum;
                if (filespec)
                    this.scanloc.filename = filespec;
            }
            return;
        case TOK.file:
            // `__FILE__`: at most one filespec, and never after flags.
            if (filespec || flags)
                goto Lerr;
            filespec = mem.xstrdup(scanloc.filename);
            continue;
        case TOK.string_:
            if (filespec || flags)
                goto Lerr;
            // Only a plain double-quoted string without postfix is accepted.
            if (tok.ptr[0] != '"' || tok.postfix != 0)
                goto Lerr;
            filespec = tok.ustring;
            continue;
        case TOK.int32Literal:
            // Integers are only valid as linemarker flags (1..4) and only
            // after the filespec.
            if (!filespec)
                goto Lerr;
            if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
            {
                flags = true;   // linemarker flags seen
                continue;
            }
            goto Lerr;
        default:
            goto Lerr;
        }
    }
Lerr:
    // Pick the diagnostic by how far parsing got. With a filespec present,
    // in `#line` form, nothing is reported when Ccompile is set.
    if (filespec is null)
        error(tok.loc, "invalid filename for `#line` directive");
    else if (linemarker)
        error(tok.loc, "invalid flag for line marker directive");
    else if (!Ccompile)
        error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
    // Resynchronize at the next line unless we are already at its end.
    if (tok.value != TOK.endOfLine)
        skipToNextLine();
}
2732 
2733     /***************************************
2734      * Scan forward to start of next line.
2735      */
final void skipToNextLine()
{
    for (;;)
    {
        const ch = *p;

        // End of input: stop right here and leave `p` on the terminator.
        if (ch == 0 || ch == 0x1A)
            return;

        if (ch == '\n')
        {
            ++p;
            break;
        }

        if (ch == '\r')
        {
            // Treat CR LF as a single line ending.
            ++p;
            if (*p == '\n')
                ++p;
            break;
        }

        // Multi-byte sequences may encode the Unicode line or paragraph
        // separator. decodeUTF() leaves `p` on the last byte of the
        // sequence, so one more increment steps past it.
        bool unicodeNewline = false;
        if (ch & 0x80)
        {
            const u = decodeUTF();
            unicodeNewline = (u == PS || u == LS);
        }
        ++p;
        if (unicodeNewline)
            break;
    }

    // Reached only after a line ending was consumed.
    endOfLine();
    tokenizeNewlines = false;
}
2774 
2775     /********************************************
2776      * Decode UTF character.
2777      * Issue error messages for invalid sequences.
2778      * Return decoded character, advance p to last character in UTF sequence.
2779      */
private uint decodeUTF()
{
    const start = p;
    assert(*start & 0x80);

    // Hand the decoder at most 4 bytes, stopping early at a 0 byte.
    size_t avail = 1;
    while (avail < 4 && start[avail])
        ++avail;

    size_t consumed = 0;
    dchar decoded;
    const errmsg = utf_decodeChar(start[0 .. avail], consumed, decoded);

    // Leave `p` on the last byte that was consumed, not one past it.
    p += consumed - 1;

    if (errmsg)
        error("%.*s", cast(int)errmsg.length, errmsg.ptr);
    return decoded;
}
2799 
2800     /***************************************************
2801      * Parse doc comment embedded between t.ptr and p.
2802      * Remove trailing blanks and tabs from lines.
2803      * Replace all newlines with \n.
2804      * Remove leading comment character from each line.
2805      * Decide if it's a lineComment or a blockComment.
2806      * Append to previous one for this token.
2807      *
2808      * If newParagraph is true, an extra newline will be
2809      * added between adjoining doc comments.
2810      */
private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
{
    // The third character of the opener tells the comment flavour:
    // '/', '*' or '+'.
    const kind = t.ptr[2];

    // Comment text starts after the three-character opener. For the
    // delimited flavours it ends before the two-character closer.
    const(char)* cur = t.ptr + 3;
    const(char)* end = (kind == '*' || kind == '+') ? p - 2 : p;

    // Skip the remainder of an opening row of ****'s, ++++'s or ////'s.
    while (cur < end && *cur == kind)
        ++cur;

    // Skip to the start of the actual text.
    bool atLineStart = false;
    if (kind == '/')
    {
        while (cur < end && (*cur == ' ' || *cur == '\t'))
            ++cur;
    }
    else if (cur < end && (*cur == '\r' || *cur == '\n'))
    {
        // A line ending directly after the opener is dropped, and the
        // text then begins at the start of a line.
        const wasCR = *cur == '\r';
        ++cur;
        if (wasCR && cur < end && *cur == '\n')
            ++cur;
        atLineStart = true;
    }

    // Remove a trailing row of ****'s or ++++'s.
    if (kind != '/')
    {
        while (cur < end && end[-1] == kind)
            --end;
    }

    // The comment is now [cur .. end]; canonicalize it into buf.
    OutBuffer buf;

    // Cut blanks and tabs off the end of what has been written so far.
    void dropTrailingBlanks()
    {
        const written = buf[];
        size_t keep = written.length;
        for (; keep > 0; --keep)
        {
            const last = written[keep - 1];
            if (last != ' ' && last != '\t')
                break;
        }
        buf.setsize(keep);
    }

    for (; cur < end; cur++)
    {
        char c = *cur;
        bool lineBreak = false;

        if (c == '*' || c == '+')
        {
            // A leading comment character on a continuation line is
            // removed together with the blanks in front of it.
            if (atLineStart && c == kind)
            {
                atLineStart = false;
                dropTrailingBlanks();
                continue;
            }
        }
        else if (c == ' ' || c == '\t')
        {
            // Blanks are copied and do not end the "start of line" state.
        }
        else if (c == '\n')
        {
            lineBreak = true;
        }
        else if (c == '\r')
        {
            if (cur[1] == '\n')
                continue; // skip the \r, the \n is handled next
            lineBreak = true;
        }
        else if (c == 226 && cur[1] == 128 && (cur[2] == 168 || cur[2] == 169))
        {
            // UTF-8 encoding of LS or PS
            cur += 2;
            lineBreak = true;
        }
        else
        {
            atLineStart = false;
        }

        if (lineBreak)
        {
            c = '\n'; // every kind of line ending is stored as \n
            atLineStart = true;
            dropTrailingBlanks();
        }
        buf.writeByte(c);
    }

    // The last line may lack a newline; trim it as well.
    dropTrailingBlanks();

    // Always end with a newline.
    const canonical = buf[];
    if (canonical.length == 0 || canonical[$ - 1] != '\n')
        buf.writeByte('\n');

    // It's a line comment if the start of the doc comment comes
    // after other non-whitespace on the same line.
    auto slot = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;

    // Append to an earlier doc comment of the same token, if any.
    if (*slot)
        *slot = combineComments(*slot, buf[], newParagraph).toDString();
    else
        *slot = buf.extractSlice(true);
}
2941 
2942     /********************************************
2943      * Combine two document comments into one,
2944      * separated by an extra newline if newParagraph is true.
2945      */
static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
{
    // With only one comment present there is nothing to join.
    if (!c1)
        return c2.ptr;
    if (!c2)
        return c1.ptr;

    // Newlines between the two parts: one to terminate c1 if it lacks a
    // trailing newline, plus one more when a paragraph break is requested.
    const needsTerminator = c1.length && c1[$ - 1] != '\n';
    const size_t gap = (needsTerminator ? 1 : 0) + (newParagraph ? 1 : 0);
    const total = c1.length + gap + c2.length;

    // One extra byte for the terminating 0.
    auto joined = cast(char*)mem.xmalloc_noscan(total + 1);
    joined[0 .. c1.length] = c1[];
    joined[c1.length .. c1.length + gap] = '\n';
    joined[c1.length + gap .. total] = c2[];
    joined[total] = 0;
    return joined;
}
2969 
2970     /**************************
2971      * `p` should be at start of next line
2972      */
private void endOfLine() pure @nogc @safe
{
    line = p;           // the new line begins at the current position
    scanloc.linnum++;   // and it is one line further down
}
2978 }
2979 
2980 
2981 /******************************* Private *****************************************/
2982 
2983 private:
2984 
2985 /// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `initialize`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;        // "Mmm dd yyyy" + 0
    __gshared char[8 + 1] time;         // "hh:mm:ss" + 0
    __gshared char[24 + 1] timestamp;   // "Www Mmm dd hh:mm:ss yyyy" + 0

    /**
     * Fill `date`, `time` and `timestamp` on the first call; later calls
     * return immediately.
     *
     * The time comes from the environment variable `SOURCE_DATE_EPOCH`
     * when it is set, otherwise from the system clock.
     *
     * Params:
     *  loc = location used if `SOURCE_DATE_EPOCH` cannot be parsed
     */
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct);     // leading dot: the member `time` shadows the C function
        const p = ctime(&ct);
        assert(p);
        // ctime() yields "Www Mmm dd hh:mm:ss yyyy\n"; slice the pieces out
        // by offset. Each write is bounded by the size of its destination.
        snprintf(&date[0], date.length, "%.6s %.4s", p + 4, p + 20);
        snprintf(&time[0], time.length, "%.8s", p + 11);
        snprintf(&timestamp[0], timestamp.length, "%.24s", p);
    }
}
3020 
// Unicode code points that skipToNextLine() accepts as a line ending,
// compared against the value returned by decodeUTF().
private enum LS = 0x2028;       // UTF line separator
private enum PS = 0x2029;       // UTF paragraph separator
3023 
3024 /********************************************
3025  * Do our own char maps
3026  */
private static immutable cmtable = ()
{
    ubyte[256] table;

    // Flags that depend only on the value of the character itself.
    foreach (const c; 0 .. table.length)
    {
        ubyte flags;

        if ('0' <= c && c <= '7')
            flags |= CMoctal;
        if (c_isxdigit(c))
            flags |= CMhex;
        if (c_isalnum(c) || c == '_')
            flags |= CMidchar;

        // CMsinglechar: every 7-bit character except backslash, the line
        // endings, the two end-of-file markers and the single quote.
        const excluded = c == '\\' || c == '\n' || c == '\r' ||
                         c == 0 || c == 0x1A || c == '\'';
        if (!excluded && !(c & 0x80))
            flags |= CMsinglechar;

        table[c] = flags;
    }

    // Characters flagged CMzerosecond only.
    foreach (char ch; "xXbB")
        table[ch] |= CMzerosecond;

    // Characters flagged both CMzerosecond and CMdigitsecond.
    foreach (char ch; "0123456789eEfFlLpPuUi._")
        table[ch] |= CMzerosecond | CMdigitsecond;

    return table;
}();
3079 
private
{
    // One bit per character class recorded in `cmtable`.
    enum CMoctal       = 1 << 0;    // '0' .. '7'
    enum CMhex         = 1 << 1;    // characters accepted by c_isxdigit
    enum CMidchar      = 1 << 2;    // characters accepted by c_isalnum, or '_'
    enum CMzerosecond  = 1 << 3;    // queried through isZeroSecond
    enum CMdigitsecond = 1 << 4;    // queried through isDigitSecond
    enum CMsinglechar  = 1 << 5;    // queried through issinglechar
}
3089 
/// Returns: true if `cmtable` marks `c` with CMoctal.
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
3094 
/// Returns: true if `cmtable` marks `c` with CMhex.
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
3099 
/// Returns: true if `cmtable` marks `c` with CMidchar.
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
3104 
/// Returns: true if `cmtable` marks `c` with CMzerosecond.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
3109 
/// Returns: true if `cmtable` marks `c` with CMdigitsecond.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
3114 
/// Returns: true if `cmtable` marks `c` with CMsinglechar.
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
3119 
/// Returns: true if `c` is one of the ASCII characters 0-9, a-f or A-F.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
3126 
/// Returns: true if `c` is one of the ASCII characters 0-9, a-z or A-Z.
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}
3133 
3134 /******************************* Unittest *****************************************/
3135 
// Valid escape sequences: each must decode to the expected character,
// consume the whole input, and report no diagnostic.
unittest
{
    import dmd.console;

    // Any diagnostic at all is a failure for these inputs.
    nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                   const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(0);
    }
    diagnosticHandler = &assertDiagnosticHandler;
    // Reset the global handler on every exit path.
    scope(exit) diagnosticHandler = null;

    // `sequence` is the text following the backslash.
    static void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto p = cast(const(char)*)sequence.ptr;
        assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
        assert(p == sequence.ptr + sequence.length);
    }

    // single-character escapes
    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    // \x with two hex digits
    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    // octal
    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    // \u with four hex digits
    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    // \U with eight hex digits
    test(`U0001f603`, '\U0001f603');

    // named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');
}
3188 
// Malformed escape sequences: each must report exactly the expected error
// text, return the expected value and consume the expected number of chars.
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    // Formats the diagnostic and compares it with `expected`.
    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        // Bounded formatting; an over-long message fails the assert below
        // instead of writing past the buffer.
        const written = vsnprintf(buffer.ptr, buffer.length, format, ap);
        assert(written >= 0 && written < buffer.length);
        auto actual = buffer[0 .. written];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;
    // Reset the global handler on every exit path.
    scope(exit) diagnosticHandler = null;

    // `sequence` is the text following the backslash. The global error
    // count is restored afterwards so the expected errors do not leak out.
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    // with Ccompile set, named entities are not recognized
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
}
3258 
// Smoke test: a single keyword, then end of file reported repeatedly.
unittest
{
    string source = "int"; // We rely on the implicit null-terminator
    scope Lexer lexer = new Lexer(null, source.ptr, 0, source.length, 0, 0);

    assert(lexer.nextToken() == TOK.int32);

    // Asking again after the end must keep answering endOfFile.
    foreach (_; 0 .. 3)
        assert(lexer.nextToken() == TOK.endOfFile);
}
3277 
// Malformed input must still terminate in TOK.endOfFile.
unittest
{
    // Keep the Lexer's error output quiet for the duration of the test.
    const gagged = global.startGagging();
    scope(exit) global.endGagging(gagged);

    static immutable char[][] inputs =
    [   // Every input must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (input; inputs)
    {
        scope lexer = new Lexer(null, input.ptr, 0, input.length-1, 0, 0);

        // Bound the number of requests by the input length so that a lexer
        // that never reaches the end fails the assert instead of hanging.
        TOK tok = lexer.nextToken();
        for (size_t calls = 1; tok != TOK.endOfFile && calls < input.length; ++calls)
            tok = lexer.nextToken();
        assert(tok == TOK.endOfFile);

        // And it must stay at the end afterwards.
        tok = lexer.nextToken();
        assert(tok == TOK.endOfFile);
    }
}
3312