1 2 /* Compiler implementation of the D programming language 3 * Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved 4 * written by Walter Bright 5 * http://www.digitalmars.com 6 * Distributed under the Boost Software License, Version 1.0. 7 * http://www.boost.org/LICENSE_1_0.txt 8 * https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c 9 */ 10 11 /* Lexical Analyzer */ 12 13 #include "root/dsystem.h" // for time() and ctime() 14 #include "root/rmem.h" 15 16 #include "mars.h" 17 #include "lexer.h" 18 #include "utf.h" 19 #include "identifier.h" 20 #include "id.h" 21 22 extern int HtmlNamedEntity(const utf8_t *p, size_t length); 23 24 #define LS 0x2028 // UTF line separator 25 #define PS 0x2029 // UTF paragraph separator 26 27 /******************************************** 28 * Do our own char maps 29 */ 30 31 static unsigned char cmtable[256]; 32 33 const int CMoctal = 0x1; 34 const int CMhex = 0x2; 35 const int CMidchar = 0x4; 36 37 inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; } 38 inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; } 39 inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; } 40 41 struct CMTableInitializer 42 { 43 CMTableInitializer(); 44 }; 45 46 static CMTableInitializer cmtableinitializer; 47 48 CMTableInitializer::CMTableInitializer() 49 { 50 for (unsigned c = 0; c < 256; c++) 51 { 52 if ('0' <= c && c <= '7') 53 cmtable[c] |= CMoctal; 54 if (isxdigit(c)) 55 cmtable[c] |= CMhex; 56 if (isalnum(c) || c == '_') 57 cmtable[c] |= CMidchar; 58 } 59 } 60 61 /*************************** Lexer ********************************************/ 62 63 OutBuffer Lexer::stringbuffer; 64 65 Lexer::Lexer(const char *filename, 66 const utf8_t *base, size_t begoffset, size_t endoffset, 67 bool doDocComment, bool commentToken) 68 { 69 scanloc = Loc(filename, 1, 1); 70 //printf("Lexer::Lexer(%p,%d)\n",base,length); 71 //printf("lexer.filename = %s\n", filename); 72 this->token = Token(); 73 this->token.ptr = NULL; 74 this->token.value = TOKreserved; 75 this->token.blockComment = NULL; 76 this->token.lineComment = NULL; 77 this->base = base; 78 this->end = base + endoffset; 79 p = base + begoffset; 80 line = p; 81 this->doDocComment = doDocComment; 82 this->anyToken = 0; 83 this->commentToken = commentToken; 84 this->errors = false; 85 //initKeywords(); 86 87 /* If first line starts with '#!', ignore the line 88 */ 89 90 if (p[0] == '#' && p[1] =='!') 91 { 92 p += 2; 93 while (1) 94 { 95 utf8_t c = *p++; 96 switch (c) 97 { 98 case 0: 99 case 0x1A: 100 p--; 101 /* fall through */ 102 103 case '\n': 104 break; 105 106 default: 107 continue; 108 } 109 break; 110 } 111 endOfLine(); 112 } 113 } 114 115 116 void Lexer::endOfLine() 117 { 118 scanloc.linnum++; 119 line = p; 120 } 121 122 123 void Lexer::error(const char *format, ...) 124 { 125 va_list ap; 126 va_start(ap, format); 127 ::verror(token.loc, format, ap); 128 va_end(ap); 129 errors = true; 130 } 131 132 void Lexer::error(Loc loc, const char *format, ...) 133 { 134 va_list ap; 135 va_start(ap, format); 136 ::verror(loc, format, ap); 137 va_end(ap); 138 errors = true; 139 } 140 141 void Lexer::deprecation(const char *format, ...) 142 { 143 va_list ap; 144 va_start(ap, format); 145 ::vdeprecation(token.loc, format, ap); 146 va_end(ap); 147 if (global.params.useDeprecated == DIAGNOSTICerror) 148 errors = true; 149 } 150 151 TOK Lexer::nextToken() 152 { 153 if (token.next) 154 { 155 Token *t = token.next; 156 memcpy(&token,t,sizeof(Token)); 157 t->free(); 158 } 159 else 160 { 161 scan(&token); 162 } 163 //token.print(); 164 return token.value; 165 } 166 167 Token *Lexer::peek(Token *ct) 168 { 169 Token *t; 170 if (ct->next) 171 t = ct->next; 172 else 173 { 174 t = Token::alloc(); 175 scan(t); 176 ct->next = t; 177 } 178 return t; 179 } 180 181 /*********************** 182 * Look ahead at next token's value. 183 */ 184 185 TOK Lexer::peekNext() 186 { 187 return peek(&token)->value; 188 } 189 190 /*********************** 191 * Look 2 tokens ahead at value. 192 */ 193 194 TOK Lexer::peekNext2() 195 { 196 Token *t = peek(&token); 197 return peek(t)->value; 198 } 199 200 /********************************* 201 * tk is on the opening (. 202 * Look ahead and return token that is past the closing ). 203 */ 204 205 Token *Lexer::peekPastParen(Token *tk) 206 { 207 //printf("peekPastParen()\n"); 208 int parens = 1; 209 int curlynest = 0; 210 while (1) 211 { 212 tk = peek(tk); 213 //tk->print(); 214 switch (tk->value) 215 { 216 case TOKlparen: 217 parens++; 218 continue; 219 220 case TOKrparen: 221 --parens; 222 if (parens) 223 continue; 224 tk = peek(tk); 225 break; 226 227 case TOKlcurly: 228 curlynest++; 229 continue; 230 231 case TOKrcurly: 232 if (--curlynest >= 0) 233 continue; 234 break; 235 236 case TOKsemicolon: 237 if (curlynest) 238 continue; 239 break; 240 241 case TOKeof: 242 break; 243 244 default: 245 continue; 246 } 247 return tk; 248 } 249 } 250 251 /**************************** 252 * Turn next token in buffer into a token. 253 */ 254 255 void Lexer::scan(Token *t) 256 { 257 unsigned lastLine = scanloc.linnum; 258 Loc startLoc; 259 260 t->blockComment = NULL; 261 t->lineComment = NULL; 262 while (1) 263 { 264 t->ptr = p; 265 //printf("p = %p, *p = '%c'\n",p,*p); 266 t->loc = loc(); 267 switch (*p) 268 { 269 case 0: 270 case 0x1A: 271 t->value = TOKeof; // end of file 272 return; 273 274 case ' ': 275 case '\t': 276 case '\v': 277 case '\f': 278 p++; 279 continue; // skip white space 280 281 case '\r': 282 p++; 283 if (*p != '\n') // if CR stands by itself 284 endOfLine(); 285 continue; // skip white space 286 287 case '\n': 288 p++; 289 endOfLine(); 290 continue; // skip white space 291 292 case '0': case '1': case '2': case '3': case '4': 293 case '5': case '6': case '7': case '8': case '9': 294 t->value = number(t); 295 return; 296 297 case '\'': 298 t->value = charConstant(t); 299 return; 300 301 case 'r': 302 if (p[1] != '"') 303 goto case_ident; 304 p++; 305 /* fall through */ 306 case '`': 307 t->value = wysiwygStringConstant(t, *p); 308 return; 309 310 case 'x': 311 if (p[1] != '"') 312 goto case_ident; 313 p++; 314 t->value = hexStringConstant(t); 315 return; 316 317 case 'q': 318 if (p[1] == '"') 319 { 320 p++; 321 t->value = delimitedStringConstant(t); 322 return; 323 } 324 else if (p[1] == '{') 325 { 326 p++; 327 t->value = tokenStringConstant(t); 328 return; 329 } 330 else 331 goto case_ident; 332 333 case '"': 334 t->value = escapeStringConstant(t); 335 return; 336 337 case 'a': case 'b': case 'c': case 'd': case 'e': 338 case 'f': case 'g': case 'h': case 'i': case 'j': 339 case 'k': case 'l': case 'm': case 'n': case 'o': 340 case 'p': /*case 'q': case 'r':*/ case 's': case 't': 341 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': 342 case 'z': 343 case 'A': case 'B': case 'C': case 'D': case 'E': 344 case 'F': case 'G': case 'H': case 'I': case 'J': 345 case 'K': case 'L': case 'M': case 'N': case 'O': 346 case 'P': case 'Q': case 'R': case 'S': case 'T': 347 case 'U': case 'V': case 'W': case 'X': case 'Y': 348 case 'Z': 349 case '_': 350 case_ident: 351 { utf8_t c; 352 353 while (1) 354 { 355 c = *++p; 356 if (isidchar(c)) 357 continue; 358 else if (c & 0x80) 359 { const utf8_t *s = p; 360 unsigned u = decodeUTF(); 361 if (isUniAlpha(u)) 362 continue; 363 error("char 0x%04x not allowed in identifier", u); 364 p = s; 365 } 366 break; 367 } 368 369 Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr); 370 t->ident = id; 371 t->value = (TOK) id->getValue(); 372 anyToken = 1; 373 if (*t->ptr == '_') // if special identifier token 374 { 375 static bool initdone = false; 376 static char date[11+1]; 377 static char time[8+1]; 378 static char timestamp[24+1]; 379 380 if (!initdone) // lazy evaluation 381 { 382 initdone = true; 383 time_t ct; 384 ::time(&ct); 385 char *p = ctime(&ct); 386 assert(p); 387 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20); 388 sprintf(&time[0], "%.8s", p + 11); 389 sprintf(×tamp[0], "%.24s", p); 390 } 391 392 if (id == Id::DATE) 393 { 394 t->ustring = (utf8_t *)date; 395 goto Lstr; 396 } 397 else if (id == Id::TIME) 398 { 399 t->ustring = (utf8_t *)time; 400 goto Lstr; 401 } 402 else if (id == Id::VENDOR) 403 { 404 t->ustring = (utf8_t *)const_cast<char *>(global.vendor); 405 goto Lstr; 406 } 407 else if (id == Id::TIMESTAMP) 408 { 409 t->ustring = (utf8_t *)timestamp; 410 Lstr: 411 t->value = TOKstring; 412 t->postfix = 0; 413 t->len = (unsigned)strlen((char *)t->ustring); 414 } 415 else if (id == Id::VERSIONX) 416 { unsigned major = 0; 417 unsigned minor = 0; 418 bool point = false; 419 420 for (const char *p = global.version + 1; 1; p++) 421 { 422 c = *p; 423 if (isdigit((utf8_t)c)) 424 minor = minor * 10 + c - '0'; 425 else if (c == '.') 426 { 427 if (point) 428 break; // ignore everything after second '.' 429 point = true; 430 major = minor; 431 minor = 0; 432 } 433 else 434 break; 435 } 436 t->value = TOKint64v; 437 t->uns64value = major * 1000 + minor; 438 } 439 else if (id == Id::EOFX) 440 { 441 t->value = TOKeof; 442 // Advance scanner to end of file 443 while (!(*p == 0 || *p == 0x1A)) 444 p++; 445 } 446 } 447 //printf("t->value = %d\n",t->value); 448 return; 449 } 450 451 case '/': 452 p++; 453 switch (*p) 454 { 455 case '=': 456 p++; 457 t->value = TOKdivass; 458 return; 459 460 case '*': 461 p++; 462 startLoc = loc(); 463 while (1) 464 { 465 while (1) 466 { utf8_t c = *p; 467 switch (c) 468 { 469 case '/': 470 break; 471 472 case '\n': 473 endOfLine(); 474 p++; 475 continue; 476 477 case '\r': 478 p++; 479 if (*p != '\n') 480 endOfLine(); 481 continue; 482 483 case 0: 484 case 0x1A: 485 error("unterminated /* */ comment"); 486 p = end; 487 t->loc = loc(); 488 t->value = TOKeof; 489 return; 490 491 default: 492 if (c & 0x80) 493 { unsigned u = decodeUTF(); 494 if (u == PS || u == LS) 495 endOfLine(); 496 } 497 p++; 498 continue; 499 } 500 break; 501 } 502 p++; 503 if (p[-2] == '*' && p - 3 != t->ptr) 504 break; 505 } 506 if (commentToken) 507 { 508 t->loc = startLoc; 509 t->value = TOKcomment; 510 return; 511 } 512 else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr) 513 { // if /** but not /**/ 514 getDocComment(t, lastLine == startLoc.linnum); 515 } 516 continue; 517 518 case '/': // do // style comments 519 startLoc = loc(); 520 while (1) 521 { utf8_t c = *++p; 522 switch (c) 523 { 524 case '\n': 525 break; 526 527 case '\r': 528 if (p[1] == '\n') 529 p++; 530 break; 531 532 case 0: 533 case 0x1A: 534 if (commentToken) 535 { 536 p = end; 537 t->loc = startLoc; 538 t->value = TOKcomment; 539 return; 540 } 541 if (doDocComment && t->ptr[2] == '/') 542 getDocComment(t, lastLine == startLoc.linnum); 543 p = end; 544 t->loc = loc(); 545 t->value = TOKeof; 546 return; 547 548 default: 549 if (c & 0x80) 550 { unsigned u = decodeUTF(); 551 if (u == PS || u == LS) 552 break; 553 } 554 continue; 555 } 556 break; 557 } 558 559 if (commentToken) 560 { 561 p++; 562 endOfLine(); 563 t->loc = startLoc; 564 t->value = TOKcomment; 565 return; 566 } 567 if (doDocComment && t->ptr[2] == '/') 568 getDocComment(t, lastLine == startLoc.linnum); 569 570 p++; 571 endOfLine(); 572 continue; 573 574 case '+': 575 { int nest; 576 577 startLoc = loc(); 578 p++; 579 nest = 1; 580 while (1) 581 { utf8_t c = *p; 582 switch (c) 583 { 584 case '/': 585 p++; 586 if (*p == '+') 587 { 588 p++; 589 nest++; 590 } 591 continue; 592 593 case '+': 594 p++; 595 if (*p == '/') 596 { 597 p++; 598 if (--nest == 0) 599 break; 600 } 601 continue; 602 603 case '\r': 604 p++; 605 if (*p != '\n') 606 endOfLine(); 607 continue; 608 609 case '\n': 610 endOfLine(); 611 p++; 612 continue; 613 614 case 0: 615 case 0x1A: 616 error("unterminated /+ +/ comment"); 617 p = end; 618 t->loc = loc(); 619 t->value = TOKeof; 620 return; 621 622 default: 623 if (c & 0x80) 624 { unsigned u = decodeUTF(); 625 if (u == PS || u == LS) 626 endOfLine(); 627 } 628 p++; 629 continue; 630 } 631 break; 632 } 633 if (commentToken) 634 { 635 t->loc = startLoc; 636 t->value = TOKcomment; 637 return; 638 } 639 if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr) 640 { // if /++ but not /++/ 641 getDocComment(t, lastLine == startLoc.linnum); 642 } 643 continue; 644 } 645 default: 646 break; 647 } 648 t->value = TOKdiv; 649 return; 650 651 case '.': 652 p++; 653 if (isdigit(*p)) 654 { /* Note that we don't allow ._1 and ._ as being 655 * valid floating point numbers. 656 */ 657 p--; 658 t->value = inreal(t); 659 } 660 else if (p[0] == '.') 661 { 662 if (p[1] == '.') 663 { p += 2; 664 t->value = TOKdotdotdot; 665 } 666 else 667 { p++; 668 t->value = TOKslice; 669 } 670 } 671 else 672 t->value = TOKdot; 673 return; 674 675 case '&': 676 p++; 677 if (*p == '=') 678 { p++; 679 t->value = TOKandass; 680 } 681 else if (*p == '&') 682 { p++; 683 t->value = TOKandand; 684 } 685 else 686 t->value = TOKand; 687 return; 688 689 case '|': 690 p++; 691 if (*p == '=') 692 { p++; 693 t->value = TOKorass; 694 } 695 else if (*p == '|') 696 { p++; 697 t->value = TOKoror; 698 } 699 else 700 t->value = TOKor; 701 return; 702 703 case '-': 704 p++; 705 if (*p == '=') 706 { p++; 707 t->value = TOKminass; 708 } 709 else if (*p == '-') 710 { p++; 711 t->value = TOKminusminus; 712 } 713 else 714 t->value = TOKmin; 715 return; 716 717 case '+': 718 p++; 719 if (*p == '=') 720 { p++; 721 t->value = TOKaddass; 722 } 723 else if (*p == '+') 724 { p++; 725 t->value = TOKplusplus; 726 } 727 else 728 t->value = TOKadd; 729 return; 730 731 case '<': 732 p++; 733 if (*p == '=') 734 { p++; 735 t->value = TOKle; // <= 736 } 737 else if (*p == '<') 738 { p++; 739 if (*p == '=') 740 { p++; 741 t->value = TOKshlass; // <<= 742 } 743 else 744 t->value = TOKshl; // << 745 } 746 else if (*p == '>') 747 { p++; 748 if (*p == '=') 749 { p++; 750 t->value = TOKleg; // <>= 751 } 752 else 753 t->value = TOKlg; // <> 754 } 755 else 756 t->value = TOKlt; // < 757 return; 758 759 case '>': 760 p++; 761 if (*p == '=') 762 { p++; 763 t->value = TOKge; // >= 764 } 765 else if (*p == '>') 766 { p++; 767 if (*p == '=') 768 { p++; 769 t->value = TOKshrass; // >>= 770 } 771 else if (*p == '>') 772 { p++; 773 if (*p == '=') 774 { p++; 775 t->value = TOKushrass; // >>>= 776 } 777 else 778 t->value = TOKushr; // >>> 779 } 780 else 781 t->value = TOKshr; // >> 782 } 783 else 784 t->value = TOKgt; // > 785 return; 786 787 case '!': 788 p++; 789 if (*p == '=') 790 { p++; 791 t->value = TOKnotequal; // != 792 } 793 else if (*p == '<') 794 { p++; 795 if (*p == '>') 796 { p++; 797 if (*p == '=') 798 { p++; 799 t->value = TOKunord; // !<>= 800 } 801 else 802 t->value = TOKue; // !<> 803 } 804 else if (*p == '=') 805 { p++; 806 t->value = TOKug; // !<= 807 } 808 else 809 t->value = TOKuge; // !< 810 } 811 else if (*p == '>') 812 { p++; 813 if (*p == '=') 814 { p++; 815 t->value = TOKul; // !>= 816 } 817 else 818 t->value = TOKule; // !> 819 } 820 else 821 t->value = TOKnot; // ! 822 return; 823 824 case '=': 825 p++; 826 if (*p == '=') 827 { p++; 828 t->value = TOKequal; // == 829 } 830 else if (*p == '>') 831 { p++; 832 t->value = TOKgoesto; // => 833 } 834 else 835 t->value = TOKassign; // = 836 return; 837 838 case '~': 839 p++; 840 if (*p == '=') 841 { p++; 842 t->value = TOKcatass; // ~= 843 } 844 else 845 t->value = TOKtilde; // ~ 846 return; 847 848 case '^': 849 p++; 850 if (*p == '^') 851 { p++; 852 if (*p == '=') 853 { p++; 854 t->value = TOKpowass; // ^^= 855 } 856 else 857 t->value = TOKpow; // ^^ 858 } 859 else if (*p == '=') 860 { p++; 861 t->value = TOKxorass; // ^= 862 } 863 else 864 t->value = TOKxor; // ^ 865 return; 866 867 case '(': p++; t->value = TOKlparen; return; 868 case ')': p++; t->value = TOKrparen; return; 869 case '[': p++; t->value = TOKlbracket; return; 870 case ']': p++; t->value = TOKrbracket; return; 871 case '{': p++; t->value = TOKlcurly; return; 872 case '}': p++; t->value = TOKrcurly; return; 873 case '?': p++; t->value = TOKquestion; return; 874 case ',': p++; t->value = TOKcomma; return; 875 case ';': p++; t->value = TOKsemicolon; return; 876 case ':': p++; t->value = TOKcolon; return; 877 case '$': p++; t->value = TOKdollar; return; 878 case '@': p++; t->value = TOKat; return; 879 880 case '*': 881 p++; 882 if (*p == '=') 883 { p++; 884 t->value = TOKmulass; 885 } 886 else 887 t->value = TOKmul; 888 return; 889 case '%': 890 p++; 891 if (*p == '=') 892 { p++; 893 t->value = TOKmodass; 894 } 895 else 896 t->value = TOKmod; 897 return; 898 899 case '#': 900 { 901 p++; 902 Token n; 903 scan(&n); 904 if (n.value == TOKidentifier) 905 { 906 if (n.ident == Id::line) 907 { 908 poundLine(); 909 continue; 910 } 911 else 912 { 913 const Loc locx = loc(); 914 warning(locx, "C preprocessor directive `#%s` is not supported", n.ident->toChars()); 915 } 916 } 917 else if (n.value == TOKif) 918 { 919 error("C preprocessor directive `#if` is not supported, use `version` or `static if`"); 920 } 921 t->value = TOKpound; 922 return; 923 } 924 925 default: 926 { unsigned c = *p; 927 928 if (c & 0x80) 929 { c = decodeUTF(); 930 931 // Check for start of unicode identifier 932 if (isUniAlpha(c)) 933 goto case_ident; 934 935 if (c == PS || c == LS) 936 { 937 endOfLine(); 938 p++; 939 continue; 940 } 941 } 942 if (c < 0x80 && isprint(c)) 943 error("character '%c' is not a valid token", c); 944 else 945 error("character 0x%02x is not a valid token", c); 946 p++; 947 continue; 948 } 949 } 950 } 951 } 952 953 /******************************************* 954 * Parse escape sequence. 955 */ 956 957 unsigned Lexer::escapeSequence() 958 { unsigned c = *p; 959 960 int n; 961 int ndigits; 962 963 switch (c) 964 { 965 case '\'': 966 case '"': 967 case '?': 968 case '\\': 969 Lconsume: 970 p++; 971 break; 972 973 case 'a': c = 7; goto Lconsume; 974 case 'b': c = 8; goto Lconsume; 975 case 'f': c = 12; goto Lconsume; 976 case 'n': c = 10; goto Lconsume; 977 case 'r': c = 13; goto Lconsume; 978 case 't': c = 9; goto Lconsume; 979 case 'v': c = 11; goto Lconsume; 980 981 case 'u': 982 ndigits = 4; 983 goto Lhex; 984 case 'U': 985 ndigits = 8; 986 goto Lhex; 987 case 'x': 988 ndigits = 2; 989 Lhex: 990 p++; 991 c = *p; 992 if (ishex((utf8_t)c)) 993 { unsigned v; 994 995 n = 0; 996 v = 0; 997 while (1) 998 { 999 if (isdigit((utf8_t)c)) 1000 c -= '0'; 1001 else if (islower(c)) 1002 c -= 'a' - 10; 1003 else 1004 c -= 'A' - 10; 1005 v = v * 16 + c; 1006 c = *++p; 1007 if (++n == ndigits) 1008 break; 1009 if (!ishex((utf8_t)c)) 1010 { error("escape hex sequence has %d hex digits instead of %d", n, ndigits); 1011 break; 1012 } 1013 } 1014 if (ndigits != 2 && !utf_isValidDchar(v)) 1015 { error("invalid UTF character \\U%08x", v); 1016 v = '?'; // recover with valid UTF character 1017 } 1018 c = v; 1019 } 1020 else 1021 error("undefined escape hex sequence \\%c",c); 1022 break; 1023 1024 case '&': // named character entity 1025 for (const utf8_t *idstart = ++p; 1; p++) 1026 { 1027 switch (*p) 1028 { 1029 case ';': 1030 c = HtmlNamedEntity(idstart, p - idstart); 1031 if (c == ~0U) 1032 { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart); 1033 c = ' '; 1034 } 1035 p++; 1036 break; 1037 1038 default: 1039 if (isalpha(*p) || 1040 (p != idstart && isdigit(*p))) 1041 continue; 1042 error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart); 1043 break; 1044 } 1045 break; 1046 } 1047 break; 1048 1049 case 0: 1050 case 0x1A: // end of file 1051 c = '\\'; 1052 break; 1053 1054 default: 1055 if (isoctal((utf8_t)c)) 1056 { unsigned v; 1057 1058 n = 0; 1059 v = 0; 1060 do 1061 { 1062 v = v * 8 + (c - '0'); 1063 c = *++p; 1064 } while (++n < 3 && isoctal((utf8_t)c)); 1065 c = v; 1066 if (c > 0xFF) 1067 error("escape octal sequence \\%03o is larger than \\377", c); 1068 } 1069 else 1070 error("undefined escape sequence \\%c",c); 1071 break; 1072 } 1073 return c; 1074 } 1075 1076 /************************************** 1077 */ 1078 1079 TOK Lexer::wysiwygStringConstant(Token *t, int tc) 1080 { 1081 int c; 1082 Loc start = loc(); 1083 1084 p++; 1085 stringbuffer.reset(); 1086 while (1) 1087 { 1088 c = *p++; 1089 switch (c) 1090 { 1091 case '\n': 1092 endOfLine(); 1093 break; 1094 1095 case '\r': 1096 if (*p == '\n') 1097 continue; // ignore 1098 c = '\n'; // treat EndOfLine as \n character 1099 endOfLine(); 1100 break; 1101 1102 case 0: 1103 case 0x1A: 1104 error("unterminated string constant starting at %s", start.toChars()); 1105 t->ustring = (utf8_t *)const_cast<char *>(""); 1106 t->len = 0; 1107 t->postfix = 0; 1108 return TOKstring; 1109 1110 case '"': 1111 case '`': 1112 if (c == tc) 1113 { 1114 t->len = (unsigned)stringbuffer.offset; 1115 stringbuffer.writeByte(0); 1116 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); 1117 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); 1118 stringPostfix(t); 1119 return TOKstring; 1120 } 1121 break; 1122 1123 default: 1124 if (c & 0x80) 1125 { p--; 1126 unsigned u = decodeUTF(); 1127 p++; 1128 if (u == PS || u == LS) 1129 endOfLine(); 1130 stringbuffer.writeUTF8(u); 1131 continue; 1132 } 1133 break; 1134 } 1135 stringbuffer.writeByte(c); 1136 } 1137 } 1138 1139 /************************************** 1140 * Lex hex strings: 1141 * x"0A ae 34FE BD" 1142 */ 1143 1144 TOK Lexer::hexStringConstant(Token *t) 1145 { 1146 unsigned c; 1147 Loc start = loc(); 1148 unsigned n = 0; 1149 unsigned v = ~0; // dead assignment, needed to suppress warning 1150 1151 p++; 1152 stringbuffer.reset(); 1153 while (1) 1154 { 1155 c = *p++; 1156 switch (c) 1157 { 1158 case ' ': 1159 case '\t': 1160 case '\v': 1161 case '\f': 1162 continue; // skip white space 1163 1164 case '\r': 1165 if (*p == '\n') 1166 continue; // ignore 1167 // Treat isolated '\r' as if it were a '\n' 1168 /* fall through */ 1169 case '\n': 1170 endOfLine(); 1171 continue; 1172 1173 case 0: 1174 case 0x1A: 1175 error("unterminated string constant starting at %s", start.toChars()); 1176 t->ustring = (utf8_t *)const_cast<char *>(""); 1177 t->len = 0; 1178 t->postfix = 0; 1179 return TOKxstring; 1180 1181 case '"': 1182 if (n & 1) 1183 { error("odd number (%d) of hex characters in hex string", n); 1184 stringbuffer.writeByte(v); 1185 } 1186 t->len = (unsigned)stringbuffer.offset; 1187 stringbuffer.writeByte(0); 1188 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); 1189 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); 1190 stringPostfix(t); 1191 return TOKxstring; 1192 1193 default: 1194 if (c >= '0' && c <= '9') 1195 c -= '0'; 1196 else if (c >= 'a' && c <= 'f') 1197 c -= 'a' - 10; 1198 else if (c >= 'A' && c <= 'F') 1199 c -= 'A' - 10; 1200 else if (c & 0x80) 1201 { p--; 1202 unsigned u = decodeUTF(); 1203 p++; 1204 if (u == PS || u == LS) 1205 endOfLine(); 1206 else 1207 error("non-hex character \\u%04x in hex string", u); 1208 } 1209 else 1210 error("non-hex character '%c' in hex string", c); 1211 if (n & 1) 1212 { v = (v << 4) | c; 1213 stringbuffer.writeByte(v); 1214 } 1215 else 1216 v = c; 1217 n++; 1218 break; 1219 } 1220 } 1221 } 1222 1223 1224 /************************************** 1225 * Lex delimited strings: 1226 * q"(foo(xxx))" // "foo(xxx)" 1227 * q"[foo(]" // "foo(" 1228 * q"/foo]/" // "foo]" 1229 * q"HERE 1230 * foo 1231 * HERE" // "foo\n" 1232 * Input: 1233 * p is on the " 1234 */ 1235 1236 TOK Lexer::delimitedStringConstant(Token *t) 1237 { 1238 unsigned c; 1239 Loc start = loc(); 1240 unsigned delimleft = 0; 1241 unsigned delimright = 0; 1242 unsigned nest = 1; 1243 unsigned nestcount = ~0; // dead assignment, needed to suppress warning 1244 Identifier *hereid = NULL; 1245 unsigned blankrol = 0; 1246 unsigned startline = 0; 1247 1248 p++; 1249 stringbuffer.reset(); 1250 while (1) 1251 { 1252 c = *p++; 1253 //printf("c = '%c'\n", c); 1254 switch (c) 1255 { 1256 case '\n': 1257 Lnextline: 1258 endOfLine(); 1259 startline = 1; 1260 if (blankrol) 1261 { blankrol = 0; 1262 continue; 1263 } 1264 if (hereid) 1265 { 1266 stringbuffer.writeUTF8(c); 1267 continue; 1268 } 1269 break; 1270 1271 case '\r': 1272 if (*p == '\n') 1273 continue; // ignore 1274 c = '\n'; // treat EndOfLine as \n character 1275 goto Lnextline; 1276 1277 case 0: 1278 case 0x1A: 1279 error("unterminated delimited string constant starting at %s", start.toChars()); 1280 t->ustring = (utf8_t *)const_cast<char *>(""); 1281 t->len = 0; 1282 t->postfix = 0; 1283 return TOKstring; 1284 1285 default: 1286 if (c & 0x80) 1287 { p--; 1288 c = decodeUTF(); 1289 p++; 1290 if (c == PS || c == LS) 1291 goto Lnextline; 1292 } 1293 break; 1294 } 1295 if (delimleft == 0) 1296 { delimleft = c; 1297 nest = 1; 1298 nestcount = 1; 1299 if (c == '(') 1300 delimright = ')'; 1301 else if (c == '{') 1302 delimright = '}'; 1303 else if (c == '[') 1304 delimright = ']'; 1305 else if (c == '<') 1306 delimright = '>'; 1307 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) 1308 { // Start of identifier; must be a heredoc 1309 Token tok; 1310 p--; 1311 scan(&tok); // read in heredoc identifier 1312 if (tok.value != TOKidentifier) 1313 { error("identifier expected for heredoc, not %s", tok.toChars()); 1314 delimright = c; 1315 } 1316 else 1317 { hereid = tok.ident; 1318 //printf("hereid = '%s'\n", hereid->toChars()); 1319 blankrol = 1; 1320 } 1321 nest = 0; 1322 } 1323 else 1324 { delimright = c; 1325 nest = 0; 1326 if (isspace(c)) 1327 error("delimiter cannot be whitespace"); 1328 } 1329 } 1330 else 1331 { 1332 if (blankrol) 1333 { error("heredoc rest of line should be blank"); 1334 blankrol = 0; 1335 continue; 1336 } 1337 if (nest == 1) 1338 { 1339 if (c == delimleft) 1340 nestcount++; 1341 else if (c == delimright) 1342 { nestcount--; 1343 if (nestcount == 0) 1344 goto Ldone; 1345 } 1346 } 1347 else if (c == delimright) 1348 goto Ldone; 1349 if (startline && isalpha(c) && hereid) 1350 { Token tok; 1351 const utf8_t *psave = p; 1352 p--; 1353 scan(&tok); // read in possible heredoc identifier 1354 //printf("endid = '%s'\n", tok.ident->toChars()); 1355 if (tok.value == TOKidentifier && tok.ident->equals(hereid)) 1356 { /* should check that rest of line is blank 1357 */ 1358 goto Ldone; 1359 } 1360 p = psave; 1361 } 1362 stringbuffer.writeUTF8(c); 1363 startline = 0; 1364 } 1365 } 1366 1367 Ldone: 1368 if (*p == '"') 1369 p++; 1370 else if (hereid) 1371 error("delimited string must end in %s\"", hereid->toChars()); 1372 else 1373 error("delimited string must end in %c\"", delimright); 1374 t->len = (unsigned)stringbuffer.offset; 1375 stringbuffer.writeByte(0); 1376 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); 1377 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); 1378 stringPostfix(t); 1379 return TOKstring; 1380 } 1381 1382 /************************************** 1383 * Lex delimited strings: 1384 * q{ foo(xxx) } // " foo(xxx) " 1385 * q{foo(} // "foo(" 1386 * q{{foo}"}"} // "{foo}"}"" 1387 * Input: 1388 * p is on the q 1389 */ 1390 1391 TOK Lexer::tokenStringConstant(Token *t) 1392 { 1393 unsigned nest = 1; 1394 Loc start = loc(); 1395 const utf8_t *pstart = ++p; 1396 1397 while (1) 1398 { Token tok; 1399 1400 scan(&tok); 1401 switch (tok.value) 1402 { 1403 case TOKlcurly: 1404 nest++; 1405 continue; 1406 1407 case TOKrcurly: 1408 if (--nest == 0) 1409 { 1410 t->len = (unsigned)(p - 1 - pstart); 1411 t->ustring = (utf8_t *)mem.xmalloc(t->len + 1); 1412 memcpy(t->ustring, pstart, t->len); 1413 t->ustring[t->len] = 0; 1414 stringPostfix(t); 1415 return TOKstring; 1416 } 1417 continue; 1418 1419 case TOKeof: 1420 error("unterminated token string constant starting at %s", start.toChars()); 1421 t->ustring = (utf8_t *)const_cast<char *>(""); 1422 t->len = 0; 1423 t->postfix = 0; 1424 return TOKstring; 1425 1426 default: 1427 continue; 1428 } 1429 } 1430 } 1431 1432 1433 1434 /************************************** 1435 */ 1436 1437 TOK Lexer::escapeStringConstant(Token *t) 1438 { 1439 unsigned c; 1440 Loc start = loc(); 1441 1442 p++; 1443 stringbuffer.reset(); 1444 while (1) 1445 { 1446 c = *p++; 1447 switch (c) 1448 { 1449 case '\\': 1450 switch (*p) 1451 { 1452 case 'u': 1453 case 'U': 1454 case '&': 1455 c = escapeSequence(); 1456 stringbuffer.writeUTF8(c); 1457 continue; 1458 1459 default: 1460 c = escapeSequence(); 1461 break; 1462 } 1463 break; 1464 case '\n': 1465 endOfLine(); 1466 break; 1467 1468 case '\r': 1469 if (*p == '\n') 1470 continue; // ignore 1471 c = '\n'; // treat EndOfLine as \n character 1472 endOfLine(); 1473 break; 1474 1475 case '"': 1476 t->len = (unsigned)stringbuffer.offset; 1477 stringbuffer.writeByte(0); 1478 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); 1479 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); 1480 stringPostfix(t); 1481 return TOKstring; 1482 1483 case 0: 1484 case 0x1A: 1485 p--; 1486 error("unterminated string constant starting at %s", start.toChars()); 1487 t->ustring = (utf8_t *)const_cast<char *>(""); 1488 t->len = 0; 1489 t->postfix = 0; 1490 return TOKstring; 1491 1492 default: 1493 if (c & 0x80) 1494 { 1495 p--; 1496 c = decodeUTF(); 1497 if (c == LS || c == PS) 1498 { c = '\n'; 1499 endOfLine(); 1500 } 1501 p++; 1502 stringbuffer.writeUTF8(c); 1503 continue; 1504 } 1505 break; 1506 } 1507 stringbuffer.writeByte(c); 1508 } 1509 } 1510 1511 /************************************** 1512 */ 1513 1514 TOK Lexer::charConstant(Token *t) 1515 { 1516 unsigned c; 1517 TOK tk = TOKcharv; 1518 1519 //printf("Lexer::charConstant\n"); 1520 p++; 1521 c = *p++; 1522 switch (c) 1523 { 1524 case '\\': 1525 switch (*p) 1526 { 1527 case 'u': 1528 t->uns64value = escapeSequence(); 1529 tk = TOKwcharv; 1530 break; 1531 1532 case 'U': 1533 case '&': 1534 t->uns64value = escapeSequence(); 1535 tk = TOKdcharv; 1536 break; 1537 1538 default: 1539 t->uns64value = escapeSequence(); 1540 break; 1541 } 1542 break; 1543 case '\n': 1544 L1: 1545 endOfLine(); 1546 /* fall through */ 1547 case '\r': 1548 case 0: 1549 case 0x1A: 1550 case '\'': 1551 error("unterminated character constant"); 1552 t->uns64value = '?'; 1553 return tk; 1554 1555 default: 1556 if (c & 0x80) 1557 { 1558 p--; 1559 c = decodeUTF(); 1560 p++; 1561 if (c == LS || c == PS) 1562 goto L1; 1563 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) 1564 tk = TOKwcharv; 1565 else 1566 tk = TOKdcharv; 1567 } 1568 t->uns64value = c; 1569 break; 1570 } 1571 1572 if (*p != '\'') 1573 { 1574 error("unterminated character constant"); 1575 t->uns64value = '?'; 1576 return tk; 1577 } 1578 p++; 1579 return tk; 1580 } 1581 1582 /*************************************** 1583 * Get postfix of string literal. 1584 */ 1585 1586 void Lexer::stringPostfix(Token *t) 1587 { 1588 switch (*p) 1589 { 1590 case 'c': 1591 case 'w': 1592 case 'd': 1593 t->postfix = *p; 1594 p++; 1595 break; 1596 1597 default: 1598 t->postfix = 0; 1599 break; 1600 } 1601 } 1602 1603 /************************************** 1604 * Read in a number. 1605 * If it's an integer, store it in tok.TKutok.Vlong. 1606 * integers can be decimal, octal or hex 1607 * Handle the suffixes U, UL, LU, L, etc. 1608 * If it's double, store it in tok.TKutok.Vdouble. 1609 * Returns: 1610 * TKnum 1611 * TKdouble,... 1612 */ 1613 1614 TOK Lexer::number(Token *t) 1615 { 1616 int base = 10; 1617 const utf8_t *start = p; 1618 unsigned c; 1619 uinteger_t n = 0; // unsigned >=64 bit integer type 1620 int d; 1621 bool err = false; 1622 bool overflow = false; 1623 1624 c = *p; 1625 if (c == '0') 1626 { 1627 ++p; 1628 c = *p; 1629 switch (c) 1630 { 1631 case '0': case '1': case '2': case '3': 1632 case '4': case '5': case '6': case '7': 1633 n = c - '0'; 1634 ++p; 1635 base = 8; 1636 break; 1637 1638 case 'x': 1639 case 'X': 1640 ++p; 1641 base = 16; 1642 break; 1643 1644 case 'b': 1645 case 'B': 1646 ++p; 1647 base = 2; 1648 break; 1649 1650 case '.': 1651 if (p[1] == '.') 1652 goto Ldone; // if ".." 1653 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) 1654 goto Ldone; // if ".identifier" or ".unicode" 1655 goto Lreal; // '.' is part of current token 1656 1657 case 'i': 1658 case 'f': 1659 case 'F': 1660 goto Lreal; 1661 1662 case '_': 1663 ++p; 1664 base = 8; 1665 break; 1666 1667 case 'L': 1668 if (p[1] == 'i') 1669 goto Lreal; 1670 break; 1671 1672 default: 1673 break; 1674 } 1675 } 1676 1677 while (1) 1678 { 1679 c = *p; 1680 switch (c) 1681 { 1682 case '0': case '1': 1683 ++p; 1684 d = c - '0'; 1685 break; 1686 1687 case '2': case '3': 1688 case '4': case '5': case '6': case '7': 1689 if (base == 2 && !err) 1690 { 1691 error("binary digit expected"); 1692 err = true; 1693 } 1694 ++p; 1695 d = c - '0'; 1696 break; 1697 1698 case '8': case '9': 1699 ++p; 1700 if (base < 10 && !err) 1701 { 1702 error("radix %d digit expected, not '%c'", base, c); 1703 err = true; 1704 } 1705 d = c - '0'; 1706 break; 1707 1708 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1709 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1710 ++p; 1711 if (base != 16) 1712 { 1713 if (c == 'e' || c == 'E' || c == 'f' || c == 'F') 1714 goto Lreal; 1715 if (!err) 1716 { 1717 error("radix %d digit expected, not '%c'", base, c); 1718 err = true; 1719 } 1720 } 1721 if (c >= 'a') 1722 d = c + 10 - 'a'; 1723 else 1724 d = c + 10 - 'A'; 1725 break; 1726 1727 case 'L': 1728 if (p[1] == 'i') 1729 goto Lreal; 1730 goto Ldone; 1731 1732 case '.': 1733 if (p[1] == '.') 1734 goto Ldone; // if ".." 1735 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) 1736 goto Ldone; // if ".identifier" or ".unicode" 1737 goto Lreal; // otherwise as part of a floating point literal 1738 1739 case 'p': 1740 case 'P': 1741 case 'i': 1742 Lreal: 1743 p = start; 1744 return inreal(t); 1745 1746 case '_': 1747 ++p; 1748 continue; 1749 1750 default: 1751 goto Ldone; 1752 } 1753 1754 uinteger_t n2 = n * base; 1755 if ((n2 / base != n || n2 + d < n)) 1756 { 1757 overflow = true; 1758 } 1759 n = n2 + d; 1760 1761 // if n needs more than 64 bits 1762 if (sizeof(n) > 8 && 1763 n > 0xFFFFFFFFFFFFFFFFULL) 1764 { 1765 overflow = true; 1766 } 1767 } 1768 1769 Ldone: 1770 1771 if (overflow && !err) 1772 { 1773 error("integer overflow"); 1774 err = true; 1775 } 1776 1777 enum FLAGS 1778 { 1779 FLAGS_none = 0, 1780 FLAGS_decimal = 1, // decimal 1781 FLAGS_unsigned = 2, // u or U suffix 1782 FLAGS_long = 4, // L suffix 1783 }; 1784 1785 unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none; 1786 1787 // Parse trailing 'u', 'U', 'l' or 'L' in any combination 1788 const utf8_t *psuffix = p; 1789 while (1) 1790 { 1791 utf8_t f; 1792 switch (*p) 1793 { 1794 case 'U': 1795 case 'u': 1796 f = FLAGS_unsigned; 1797 goto L1; 1798 1799 case 'l': 1800 f = FLAGS_long; 1801 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead"); 1802 goto L1; 1803 1804 case 'L': 1805 f = FLAGS_long; 1806 L1: 1807 p++; 1808 if ((flags & f) && !err) 1809 { 1810 error("unrecognized token"); 1811 err = true; 1812 } 1813 flags = (FLAGS) (flags | f); 1814 continue; 1815 default: 1816 break; 1817 } 1818 break; 1819 } 1820 1821 if (base == 8 && n >= 8) 1822 error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", 1823 n, p - psuffix, psuffix, n, p - psuffix, psuffix); 1824 1825 TOK result; 1826 switch (flags) 1827 { 1828 case FLAGS_none: 1829 /* Octal or Hexadecimal constant. 1830 * First that fits: int, uint, long, ulong 1831 */ 1832 if (n & 0x8000000000000000LL) 1833 result = TOKuns64v; 1834 else if (n & 0xFFFFFFFF00000000LL) 1835 result = TOKint64v; 1836 else if (n & 0x80000000) 1837 result = TOKuns32v; 1838 else 1839 result = TOKint32v; 1840 break; 1841 1842 case FLAGS_decimal: 1843 /* First that fits: int, long, long long 1844 */ 1845 if (n & 0x8000000000000000LL) 1846 { 1847 if (!err) 1848 { 1849 error("signed integer overflow"); 1850 err = true; 1851 } 1852 result = TOKuns64v; 1853 } 1854 else if (n & 0xFFFFFFFF80000000LL) 1855 result = TOKint64v; 1856 else 1857 result = TOKint32v; 1858 break; 1859 1860 case FLAGS_unsigned: 1861 case FLAGS_decimal | FLAGS_unsigned: 1862 /* First that fits: uint, ulong 1863 */ 1864 if (n & 0xFFFFFFFF00000000LL) 1865 result = TOKuns64v; 1866 else 1867 result = TOKuns32v; 1868 break; 1869 1870 case FLAGS_decimal | FLAGS_long: 1871 if (n & 0x8000000000000000LL) 1872 { 1873 if (!err) 1874 { 1875 error("signed integer overflow"); 1876 err = true; 1877 } 1878 result = TOKuns64v; 1879 } 1880 else 1881 result = TOKint64v; 1882 break; 1883 1884 case FLAGS_long: 1885 if (n & 0x8000000000000000LL) 1886 result = TOKuns64v; 1887 else 1888 result = TOKint64v; 1889 break; 1890 1891 case FLAGS_unsigned | FLAGS_long: 1892 case FLAGS_decimal | FLAGS_unsigned | FLAGS_long: 1893 result = TOKuns64v; 1894 break; 1895 1896 default: 1897 assert(0); 1898 } 1899 t->uns64value = n; 1900 return result; 1901 } 1902 1903 /************************************** 1904 * Read in characters, converting them to real. 1905 * Bugs: 1906 * Exponent overflow not detected. 1907 * Too much requested precision is not detected. 1908 */ 1909 1910 TOK Lexer::inreal(Token *t) 1911 { 1912 //printf("Lexer::inreal()\n"); 1913 bool isWellformedString = true; 1914 stringbuffer.reset(); 1915 const utf8_t *pstart = p; 1916 char hex = 0; 1917 unsigned c = *p++; 1918 1919 // Leading '0x' 1920 if (c == '0') 1921 { 1922 c = *p++; 1923 if (c == 'x' || c == 'X') 1924 { 1925 hex = true; 1926 c = *p++; 1927 } 1928 } 1929 1930 // Digits to left of '.' 1931 while (1) 1932 { 1933 if (c == '.') 1934 { 1935 c = *p++; 1936 break; 1937 } 1938 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 1939 { 1940 c = *p++; 1941 continue; 1942 } 1943 break; 1944 } 1945 1946 // Digits to right of '.' 1947 while (1) 1948 { 1949 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 1950 { 1951 c = *p++; 1952 continue; 1953 } 1954 break; 1955 } 1956 1957 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) 1958 { 1959 c = *p++; 1960 if (c == '-' || c == '+') 1961 { 1962 c = *p++; 1963 } 1964 bool anyexp = false; 1965 while (1) 1966 { 1967 if (isdigit(c)) 1968 { 1969 anyexp = true; 1970 c = *p++; 1971 continue; 1972 } 1973 if (c == '_') 1974 { 1975 c = *p++; 1976 continue; 1977 } 1978 if (!anyexp) 1979 { 1980 error("missing exponent"); 1981 isWellformedString = false; 1982 } 1983 break; 1984 } 1985 } 1986 else if (hex) 1987 { 1988 error("exponent required for hex float"); 1989 isWellformedString = false; 1990 } 1991 --p; 1992 while (pstart < p) 1993 { 1994 if (*pstart != '_') 1995 stringbuffer.writeByte(*pstart); 1996 ++pstart; 1997 } 1998 1999 stringbuffer.writeByte(0); 2000 const char *sbufptr = (char *)stringbuffer.data; 2001 TOK result; 2002 bool isOutOfRange = false; 2003 t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero); 2004 errno = 0; 2005 switch (*p) 2006 { 2007 case 'F': 2008 case 'f': 2009 if (isWellformedString && !isOutOfRange) 2010 isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr); 2011 result = TOKfloat32v; 2012 p++; 2013 break; 2014 2015 default: 2016 if (isWellformedString && !isOutOfRange) 2017 isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr); 2018 result = TOKfloat64v; 2019 break; 2020 2021 case 'l': 2022 error("use 'L' suffix instead of 'l'"); 2023 /* fall through */ 2024 case 'L': 2025 result = TOKfloat80v; 2026 p++; 2027 break; 2028 } 2029 if (*p == 'i' || *p == 'I') 2030 { 2031 if (*p == 'I') 2032 error("use 'i' suffix instead of 'I'"); 2033 p++; 2034 switch (result) 2035 { 2036 case TOKfloat32v: 2037 result = TOKimaginary32v; 2038 break; 2039 case TOKfloat64v: 2040 result = TOKimaginary64v; 2041 break; 2042 case TOKfloat80v: 2043 result = TOKimaginary80v; 2044 break; 2045 default: break; 2046 } 2047 } 2048 const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v); 2049 if (isOutOfRange && !isLong) 2050 { 2051 const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : ""; 2052 error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix); 2053 } 2054 return result; 2055 } 2056 2057 /********************************************* 2058 * parse: 2059 * #line linnum [filespec] 2060 * also allow __LINE__ for linnum, and __FILE__ for filespec 2061 */ 2062 2063 void Lexer::poundLine() 2064 { 2065 Token tok; 2066 int linnum = this->scanloc.linnum; 2067 char *filespec = NULL; 2068 Loc loc = this->loc(); 2069 2070 scan(&tok); 2071 if (tok.value == TOKint32v || tok.value == TOKint64v) 2072 { 2073 int lin = (int)(tok.uns64value - 1); 2074 if ((unsigned)lin != tok.uns64value - 1) 2075 error("line number %lld out of range", (unsigned long long)tok.uns64value); 2076 else 2077 linnum = lin; 2078 } 2079 else if (tok.value == TOKline) 2080 { 2081 } 2082 else 2083 goto Lerr; 2084 2085 while (1) 2086 { 2087 switch (*p) 2088 { 2089 case 0: 2090 case 0x1A: 2091 case '\n': 2092 Lnewline: 2093 this->scanloc.linnum = linnum; 2094 if (filespec) 2095 this->scanloc.filename = filespec; 2096 return; 2097 2098 case '\r': 2099 p++; 2100 if (*p != '\n') 2101 { p--; 2102 goto Lnewline; 2103 } 2104 continue; 2105 2106 case ' ': 2107 case '\t': 2108 case '\v': 2109 case '\f': 2110 p++; 2111 continue; // skip white space 2112 2113 case '_': 2114 if (memcmp(p, "__FILE__", 8) == 0) 2115 { 2116 p += 8; 2117 filespec = mem.xstrdup(scanloc.filename); 2118 continue; 2119 } 2120 goto Lerr; 2121 2122 case '"': 2123 if (filespec) 2124 goto Lerr; 2125 stringbuffer.reset(); 2126 p++; 2127 while (1) 2128 { unsigned c; 2129 2130 c = *p; 2131 switch (c) 2132 { 2133 case '\n': 2134 case '\r': 2135 case 0: 2136 case 0x1A: 2137 goto Lerr; 2138 2139 case '"': 2140 stringbuffer.writeByte(0); 2141 filespec = mem.xstrdup((char *)stringbuffer.data); 2142 p++; 2143 break; 2144 2145 default: 2146 if (c & 0x80) 2147 { unsigned u = decodeUTF(); 2148 if (u == PS || u == LS) 2149 goto Lerr; 2150 } 2151 stringbuffer.writeByte(c); 2152 p++; 2153 continue; 2154 } 2155 break; 2156 } 2157 continue; 2158 2159 default: 2160 if (*p & 0x80) 2161 { unsigned u = decodeUTF(); 2162 if (u == PS || u == LS) 2163 goto Lnewline; 2164 } 2165 goto Lerr; 2166 } 2167 } 2168 2169 Lerr: 2170 error(loc, "#line integer [\"filespec\"]\\n expected"); 2171 } 2172 2173 2174 /******************************************** 2175 * Decode UTF character. 2176 * Issue error messages for invalid sequences. 2177 * Return decoded character, advance p to last character in UTF sequence. 2178 */ 2179 2180 unsigned Lexer::decodeUTF() 2181 { 2182 dchar_t u; 2183 utf8_t c; 2184 const utf8_t *s = p; 2185 size_t len; 2186 size_t idx; 2187 const char *msg; 2188 2189 c = *s; 2190 assert(c & 0x80); 2191 2192 // Check length of remaining string up to 6 UTF-8 characters 2193 for (len = 1; len < 6 && s[len]; len++) 2194 ; 2195 2196 idx = 0; 2197 msg = utf_decodeChar(s, len, &idx, &u); 2198 p += idx - 1; 2199 if (msg) 2200 { 2201 error("%s", msg); 2202 } 2203 return u; 2204 } 2205 2206 2207 /*************************************************** 2208 * Parse doc comment embedded between t->ptr and p. 2209 * Remove trailing blanks and tabs from lines. 2210 * Replace all newlines with \n. 2211 * Remove leading comment character from each line. 2212 * Decide if it's a lineComment or a blockComment. 2213 * Append to previous one for this token. 2214 */ 2215 2216 void Lexer::getDocComment(Token *t, unsigned lineComment) 2217 { 2218 /* ct tells us which kind of comment it is: '/', '*', or '+' 2219 */ 2220 utf8_t ct = t->ptr[2]; 2221 2222 /* Start of comment text skips over / * *, / + +, or / / / 2223 */ 2224 const utf8_t *q = t->ptr + 3; // start of comment text 2225 2226 const utf8_t *qend = p; 2227 if (ct == '*' || ct == '+') 2228 qend -= 2; 2229 2230 /* Scan over initial row of ****'s or ++++'s or ////'s 2231 */ 2232 for (; q < qend; q++) 2233 { 2234 if (*q != ct) 2235 break; 2236 } 2237 2238 /* Remove leading spaces until start of the comment 2239 */ 2240 int linestart = 0; 2241 if (ct == '/') 2242 { 2243 while (q < qend && (*q == ' ' || *q == '\t')) 2244 ++q; 2245 } 2246 else if (q < qend) 2247 { 2248 if (*q == '\r') 2249 { 2250 ++q; 2251 if (q < qend && *q == '\n') 2252 ++q; 2253 linestart = 1; 2254 } 2255 else if (*q == '\n') 2256 { 2257 ++q; 2258 linestart = 1; 2259 } 2260 } 2261 2262 /* Remove trailing row of ****'s or ++++'s 2263 */ 2264 if (ct != '/') 2265 { 2266 for (; q < qend; qend--) 2267 { 2268 if (qend[-1] != ct) 2269 break; 2270 } 2271 } 2272 2273 /* Comment is now [q .. qend]. 2274 * Canonicalize it into buf[]. 2275 */ 2276 OutBuffer buf; 2277 2278 for (; q < qend; q++) 2279 { 2280 utf8_t c = *q; 2281 2282 switch (c) 2283 { 2284 case '*': 2285 case '+': 2286 if (linestart && c == ct) 2287 { linestart = 0; 2288 /* Trim preceding whitespace up to preceding \n 2289 */ 2290 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2291 buf.offset--; 2292 continue; 2293 } 2294 break; 2295 2296 case ' ': 2297 case '\t': 2298 break; 2299 2300 case '\r': 2301 if (q[1] == '\n') 2302 continue; // skip the \r 2303 goto Lnewline; 2304 2305 default: 2306 if (c == 226) 2307 { 2308 // If LS or PS 2309 if (q[1] == 128 && 2310 (q[2] == 168 || q[2] == 169)) 2311 { 2312 q += 2; 2313 goto Lnewline; 2314 } 2315 } 2316 linestart = 0; 2317 break; 2318 2319 Lnewline: 2320 c = '\n'; // replace all newlines with \n 2321 /* fall through */ 2322 case '\n': 2323 linestart = 1; 2324 2325 /* Trim trailing whitespace 2326 */ 2327 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2328 buf.offset--; 2329 2330 break; 2331 } 2332 buf.writeByte(c); 2333 } 2334 2335 /* Trim trailing whitespace (if the last line does not have newline) 2336 */ 2337 if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2338 { 2339 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2340 buf.offset--; 2341 } 2342 2343 // Always end with a newline 2344 if (!buf.offset || buf.data[buf.offset - 1] != '\n') 2345 buf.writeByte('\n'); 2346 2347 buf.writeByte(0); 2348 2349 // It's a line comment if the start of the doc comment comes 2350 // after other non-whitespace on the same line. 2351 const utf8_t** dc = (lineComment && anyToken) 2352 ? &t->lineComment 2353 : &t->blockComment; 2354 2355 // Combine with previous doc comment, if any 2356 if (*dc) 2357 *dc = combineComments(*dc, (utf8_t *)buf.data); 2358 else 2359 *dc = (utf8_t *)buf.extractData(); 2360 } 2361 2362 /******************************************** 2363 * Combine two document comments into one, 2364 * separated by a newline. 2365 */ 2366 2367 const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2) 2368 { 2369 //printf("Lexer::combineComments('%s', '%s')\n", c1, c2); 2370 2371 const utf8_t *c = c2; 2372 2373 if (c1) 2374 { 2375 c = c1; 2376 if (c2) 2377 { 2378 size_t len1 = strlen((const char *)c1); 2379 size_t len2 = strlen((const char *)c2); 2380 2381 int insertNewLine = 0; 2382 if (len1 && c1[len1 - 1] != '\n') 2383 { 2384 ++len1; 2385 insertNewLine = 1; 2386 } 2387 2388 utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1); 2389 memcpy(p, c1, len1 - insertNewLine); 2390 if (insertNewLine) 2391 p[len1 - 1] = '\n'; 2392 2393 p[len1] = '\n'; 2394 2395 memcpy(p + len1 + 1, c2, len2); 2396 p[len1 + 1 + len2] = 0; 2397 c = p; 2398 } 2399 } 2400 return c; 2401 } 2402