1 /* $NetBSD: lex.c,v 1.154 2023/02/19 12:00:15 rillig Exp $ */ 2 3 /* 4 * Copyright (c) 1996 Christopher G. Demetriou. All Rights Reserved. 5 * Copyright (c) 1994, 1995 Jochen Pohl 6 * All Rights Reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by Jochen Pohl for 19 * The NetBSD Project. 20 * 4. The name of the author may not be used to endorse or promote products 21 * derived from this software without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 24 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 25 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 26 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 28 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 32 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #if HAVE_NBTOOL_CONFIG_H 36 #include "nbtool_config.h" 37 #endif 38 39 #include <sys/cdefs.h> 40 #if defined(__RCSID) 41 __RCSID("$NetBSD: lex.c,v 1.154 2023/02/19 12:00:15 rillig Exp $"); 42 #endif 43 44 #include <ctype.h> 45 #include <errno.h> 46 #include <float.h> 47 #include <limits.h> 48 #include <math.h> 49 #include <stdlib.h> 50 #include <string.h> 51 52 #include "lint1.h" 53 #include "cgram.h" 54 55 #define CHAR_MASK ((1U << CHAR_SIZE) - 1) 56 57 58 /* Current position (it's also updated when an included file is parsed) */ 59 pos_t curr_pos = { "", 1, 0 }; 60 61 /* 62 * Current position in C source (not updated when an included file is 63 * parsed). 64 */ 65 pos_t csrc_pos = { "", 1, 0 }; 66 67 bool in_gcc_attribute; 68 bool in_system_header; 69 70 /* 71 * Valid values for 'since' are 78, 90, 99, 11. 72 * 73 * The C11 keywords are added in C99 mode as well, to provide good error 74 * messages instead of a simple parse error. If the keyword '_Generic' were 75 * not defined, it would be interpreted as an implicit function call, leading 76 * to a parse error. 77 */ 78 #define kwdef(name, token, scl, tspec, tqual, since, gcc, deco) \ 79 { \ 80 name, token, scl, tspec, tqual, \ 81 (since) == 90, \ 82 /* CONSTCOND */ (since) == 99 || (since) == 11, \ 83 (gcc) > 0, \ 84 ((deco) & 1) != 0, ((deco) & 2) != 0, ((deco) & 4) != 0, \ 85 } 86 #define kwdef_token(name, token, since, gcc, deco) \ 87 kwdef(name, token, 0, 0, 0, since, gcc, deco) 88 #define kwdef_sclass(name, sclass, since, gcc, deco) \ 89 kwdef(name, T_SCLASS, sclass, 0, 0, since, gcc, deco) 90 #define kwdef_type(name, tspec, since) \ 91 kwdef(name, T_TYPE, 0, tspec, 0, since, 0, 1) 92 #define kwdef_tqual(name, tqual, since, gcc, deco) \ 93 kwdef(name, T_QUAL, 0, 0, tqual, since, gcc, deco) 94 #define kwdef_keyword(name, token) \ 95 kwdef(name, token, 0, 0, 0, 78, 0, 1) 96 97 /* During initialization, these keywords are written to the symbol table. */ 98 static const struct keyword { 99 const char *kw_name; 100 int kw_token; /* token returned by yylex() */ 101 scl_t kw_scl; /* storage class if kw_token is T_SCLASS */ 102 tspec_t kw_tspec; /* type specifier if kw_token is T_TYPE or 103 * T_STRUCT_OR_UNION */ 104 tqual_t kw_tqual; /* type qualifier if kw_token is T_QUAL */ 105 bool kw_c90:1; /* available in C90 mode */ 106 bool kw_c99_or_c11:1; /* available in C99 or C11 mode */ 107 bool kw_gcc:1; /* available in GCC mode */ 108 bool kw_plain:1; /* 'name' */ 109 bool kw_leading:1; /* '__name' */ 110 bool kw_both:1; /* '__name__' */ 111 } keywords[] = { 112 kwdef_keyword( "_Alignas", T_ALIGNAS), 113 kwdef_keyword( "_Alignof", T_ALIGNOF), 114 kwdef_token( "alignof", T_ALIGNOF, 78,0,6), 115 kwdef_token( "asm", T_ASM, 78,1,7), 116 kwdef_token( "_Atomic", T_ATOMIC, 11,0,1), 117 kwdef_token( "attribute", T_ATTRIBUTE, 78,1,6), 118 kwdef_sclass( "auto", AUTO, 78,0,1), 119 kwdef_type( "_Bool", BOOL, 99), 120 kwdef_keyword( "break", T_BREAK), 121 kwdef_token( "__builtin_offsetof", T_BUILTIN_OFFSETOF, 78,1,1), 122 kwdef_keyword( "case", T_CASE), 123 kwdef_type( "char", CHAR, 78), 124 kwdef_type( "_Complex", COMPLEX, 99), 125 kwdef_tqual( "const", CONST, 90,0,7), 126 kwdef_keyword( "continue", T_CONTINUE), 127 kwdef_keyword( "default", T_DEFAULT), 128 kwdef_keyword( "do", T_DO), 129 kwdef_type( "double", DOUBLE, 78), 130 kwdef_keyword( "else", T_ELSE), 131 kwdef_keyword( "enum", T_ENUM), 132 kwdef_token( "__extension__",T_EXTENSION, 78,1,1), 133 kwdef_sclass( "extern", EXTERN, 78,0,1), 134 kwdef_type( "float", FLOAT, 78), 135 kwdef_keyword( "for", T_FOR), 136 kwdef_token( "_Generic", T_GENERIC, 11,0,1), 137 kwdef_keyword( "goto", T_GOTO), 138 kwdef_keyword( "if", T_IF), 139 kwdef_token( "__imag__", T_IMAG, 78,1,1), 140 kwdef_sclass( "inline", INLINE, 99,0,7), 141 kwdef_type( "int", INT, 78), 142 #ifdef INT128_SIZE 143 kwdef_type( "__int128_t", INT128, 99), 144 #endif 145 kwdef_type( "long", LONG, 78), 146 kwdef_token( "_Noreturn", T_NORETURN, 11,0,1), 147 kwdef_token( "__packed", T_PACKED, 78,0,1), 148 kwdef_token( "__real__", T_REAL, 78,1,1), 149 kwdef_sclass( "register", REG, 78,0,1), 150 kwdef_tqual( "restrict", RESTRICT, 99,0,7), 151 kwdef_keyword( "return", T_RETURN), 152 kwdef_type( "short", SHORT, 78), 153 kwdef( "signed", T_TYPE, 0, SIGNED, 0, 90,0,3), 154 kwdef_keyword( "sizeof", T_SIZEOF), 155 kwdef_sclass( "static", STATIC, 78,0,1), 156 kwdef_keyword( "_Static_assert", T_STATIC_ASSERT), 157 kwdef("struct", T_STRUCT_OR_UNION, 0, STRUCT, 0, 78,0,1), 158 kwdef_keyword( "switch", T_SWITCH), 159 kwdef_token( "__symbolrename", T_SYMBOLRENAME, 78,0,1), 160 kwdef_tqual( "__thread", THREAD, 78,1,1), 161 /* XXX: _Thread_local is a storage-class-specifier, not tqual. */ 162 kwdef_tqual( "_Thread_local", THREAD, 11,0,1), 163 kwdef_sclass( "typedef", TYPEDEF, 78,0,1), 164 kwdef_token( "typeof", T_TYPEOF, 78,1,7), 165 #ifdef INT128_SIZE 166 kwdef_type( "__uint128_t", UINT128, 99), 167 #endif 168 kwdef("union", T_STRUCT_OR_UNION, 0, UNION, 0, 78,0,1), 169 kwdef_type( "unsigned", UNSIGN, 78), 170 kwdef_type( "void", VOID, 78), 171 kwdef_tqual( "volatile", VOLATILE, 90,0,7), 172 kwdef_keyword( "while", T_WHILE), 173 #undef kwdef 174 #undef kwdef_token 175 #undef kwdef_sclass 176 #undef kwdef_type 177 #undef kwdef_tqual 178 #undef kwdef_keyword 179 }; 180 181 /* 182 * The symbol table containing all keywords, identifiers and labels. The hash 183 * entries are linked via sym_t.s_symtab_next. 184 */ 185 static sym_t *symtab[HSHSIZ1]; 186 187 /* 188 * The kind of the next expected symbol, to distinguish the namespaces of 189 * members, labels, type tags and other identifiers. 190 */ 191 symt_t symtyp; 192 193 194 static unsigned int 195 hash(const char *s) 196 { 197 unsigned int v; 198 const char *p; 199 200 v = 0; 201 for (p = s; *p != '\0'; p++) { 202 v = (v << 4) + (unsigned char)*p; 203 v ^= v >> 28; 204 } 205 return v % HSHSIZ1; 206 } 207 208 static void 209 symtab_add(sym_t *sym) 210 { 211 unsigned int h; 212 213 h = hash(sym->s_name); 214 if ((sym->s_symtab_next = symtab[h]) != NULL) 215 symtab[h]->s_symtab_ref = &sym->s_symtab_next; 216 sym->s_symtab_ref = &symtab[h]; 217 symtab[h] = sym; 218 } 219 220 static sym_t * 221 symtab_search(const char *name) 222 { 223 224 unsigned int h = hash(name); 225 for (sym_t *sym = symtab[h]; sym != NULL; sym = sym->s_symtab_next) { 226 if (strcmp(sym->s_name, name) != 0) 227 continue; 228 if (sym->s_keyword != NULL || 229 sym->s_kind == symtyp || 230 in_gcc_attribute) 231 return sym; 232 } 233 234 return NULL; 235 } 236 237 static void 238 symtab_remove(sym_t *sym) 239 { 240 241 if ((*sym->s_symtab_ref = sym->s_symtab_next) != NULL) 242 sym->s_symtab_next->s_symtab_ref = sym->s_symtab_ref; 243 sym->s_symtab_next = NULL; 244 } 245 246 static void 247 symtab_remove_locals(void) 248 { 249 250 for (size_t i = 0; i < HSHSIZ1; i++) { 251 for (sym_t *sym = symtab[i]; sym != NULL; ) { 252 sym_t *next = sym->s_symtab_next; 253 if (sym->s_block_level >= 1) 254 symtab_remove(sym); 255 sym = next; 256 } 257 } 258 } 259 260 #ifdef DEBUG 261 static int 262 sym_by_name(const void *va, const void *vb) 263 { 264 const sym_t *a = *(const sym_t *const *)va; 265 const sym_t *b = *(const sym_t *const *)vb; 266 267 return strcmp(a->s_name, b->s_name); 268 } 269 270 struct syms { 271 const sym_t **items; 272 size_t len; 273 size_t cap; 274 }; 275 276 static void 277 syms_add(struct syms *syms, const sym_t *sym) 278 { 279 if (syms->len >= syms->cap) { 280 syms->cap *= 2; 281 syms->items = xrealloc(syms->items, 282 syms->cap * sizeof(syms->items[0])); 283 } 284 syms->items[syms->len++] = sym; 285 } 286 287 void 288 debug_symtab(void) 289 { 290 struct syms syms = { xcalloc(64, sizeof(syms.items[0])), 0, 64 }; 291 292 for (int level = -1;; level++) { 293 bool more = false; 294 size_t n = sizeof(symtab) / sizeof(symtab[0]); 295 296 syms.len = 0; 297 for (size_t i = 0; i < n; i++) { 298 for (sym_t *sym = symtab[i]; sym != NULL;) { 299 if (sym->s_block_level == level && 300 sym->s_keyword == NULL) 301 syms_add(&syms, sym); 302 if (sym->s_block_level > level) 303 more = true; 304 sym = sym->s_symtab_next; 305 } 306 } 307 308 if (syms.len > 0) { 309 debug_printf("symbol table level %d\n", level); 310 debug_indent_inc(); 311 qsort(syms.items, syms.len, sizeof(syms.items[0]), 312 sym_by_name); 313 for (size_t i = 0; i < syms.len; i++) 314 debug_sym("", syms.items[i], "\n"); 315 debug_indent_dec(); 316 317 lint_assert(level != -1); 318 } 319 320 if (!more) 321 break; 322 } 323 324 free(syms.items); 325 } 326 #endif 327 328 static void 329 add_keyword(const struct keyword *kw, bool leading, bool trailing) 330 { 331 332 const char *name; 333 if (!leading && !trailing) { 334 name = kw->kw_name; 335 } else { 336 char buf[256]; 337 (void)snprintf(buf, sizeof(buf), "%s%s%s", 338 leading ? "__" : "", kw->kw_name, trailing ? "__" : ""); 339 name = xstrdup(buf); 340 } 341 342 sym_t *sym = block_zero_alloc(sizeof(*sym)); 343 sym->s_name = name; 344 sym->s_keyword = kw; 345 int tok = kw->kw_token; 346 sym->u.s_keyword.sk_token = tok; 347 if (tok == T_TYPE || tok == T_STRUCT_OR_UNION) 348 sym->u.s_keyword.sk_tspec = kw->kw_tspec; 349 if (tok == T_SCLASS) 350 sym->s_scl = kw->kw_scl; 351 if (tok == T_QUAL) 352 sym->u.s_keyword.sk_qualifier = kw->kw_tqual; 353 354 symtab_add(sym); 355 } 356 357 static bool 358 is_keyword_known(const struct keyword *kw) 359 { 360 361 if ((kw->kw_c90 || kw->kw_c99_or_c11) && !allow_c90) 362 return false; 363 364 /* 365 * In the 1990s, GCC defined several keywords that were later 366 * incorporated into C99, therefore in GCC mode, all C99 keywords are 367 * made available. The C11 keywords are made available as well, but 368 * there are so few that they don't matter practically. 369 */ 370 if (allow_gcc) 371 return true; 372 if (kw->kw_gcc) 373 return false; 374 375 if (kw->kw_c99_or_c11 && !allow_c99) 376 return false; 377 return true; 378 } 379 380 /* Write all keywords to the symbol table. */ 381 void 382 initscan(void) 383 { 384 385 size_t n = sizeof(keywords) / sizeof(keywords[0]); 386 for (size_t i = 0; i < n; i++) { 387 const struct keyword *kw = keywords + i; 388 if (!is_keyword_known(kw)) 389 continue; 390 if (kw->kw_plain) 391 add_keyword(kw, false, false); 392 if (kw->kw_leading) 393 add_keyword(kw, true, false); 394 if (kw->kw_both) 395 add_keyword(kw, true, true); 396 } 397 } 398 399 /* 400 * When scanning the remainder of a long token (see lex_input), read a byte 401 * and return it as an unsigned char or as EOF. 402 * 403 * Increment the line counts if necessary. 404 */ 405 static int 406 read_byte(void) 407 { 408 int c; 409 410 if ((c = lex_input()) == EOF) 411 return c; 412 if (c == '\0') 413 return EOF; /* lex returns 0 on EOF. */ 414 if (c == '\n') 415 lex_next_line(); 416 return c; 417 } 418 419 static int 420 lex_keyword(sym_t *sym) 421 { 422 int tok = sym->u.s_keyword.sk_token; 423 424 if (tok == T_SCLASS) 425 yylval.y_scl = sym->s_scl; 426 if (tok == T_TYPE || tok == T_STRUCT_OR_UNION) 427 yylval.y_tspec = sym->u.s_keyword.sk_tspec; 428 if (tok == T_QUAL) 429 yylval.y_tqual = sym->u.s_keyword.sk_qualifier; 430 return tok; 431 } 432 433 /* 434 * Look up the definition of a name in the symbol table. This symbol must 435 * either be a keyword or a symbol of the type required by symtyp (label, 436 * member, tag, ...). 437 */ 438 extern int 439 lex_name(const char *yytext, size_t yyleng) 440 { 441 442 sym_t *sym = symtab_search(yytext); 443 if (sym != NULL && sym->s_keyword != NULL) 444 return lex_keyword(sym); 445 446 sbuf_t *sb = xmalloc(sizeof(*sb)); 447 sb->sb_len = yyleng; 448 sb->sb_sym = sym; 449 yylval.y_name = sb; 450 451 if (sym != NULL) { 452 lint_assert(block_level >= sym->s_block_level); 453 sb->sb_name = sym->s_name; 454 return sym->s_scl == TYPEDEF ? T_TYPENAME : T_NAME; 455 } 456 457 char *name = block_zero_alloc(yyleng + 1); 458 (void)memcpy(name, yytext, yyleng + 1); 459 sb->sb_name = name; 460 return T_NAME; 461 } 462 463 int 464 lex_integer_constant(const char *yytext, size_t yyleng, int base) 465 { 466 /* C11 6.4.4.1p5 */ 467 static const tspec_t suffix_type[2][3] = { 468 { INT, LONG, QUAD, }, 469 { UINT, ULONG, UQUAD, } 470 }; 471 472 const char *cp = yytext; 473 size_t len = yyleng; 474 475 /* skip 0[xX] or 0[bB] */ 476 if (base == 16 || base == 2) { 477 cp += 2; 478 len -= 2; 479 } 480 481 /* read suffixes */ 482 unsigned l_suffix = 0, u_suffix = 0; 483 for (;; len--) { 484 char c = cp[len - 1]; 485 if (c == 'l' || c == 'L') 486 l_suffix++; 487 else if (c == 'u' || c == 'U') 488 u_suffix++; 489 else 490 break; 491 } 492 if (l_suffix > 2 || u_suffix > 1) { 493 /* malformed integer constant */ 494 warning(251); 495 if (l_suffix > 2) 496 l_suffix = 2; 497 if (u_suffix > 1) 498 u_suffix = 1; 499 } 500 if (!allow_c90 && u_suffix > 0) { 501 /* suffix U is illegal in traditional C */ 502 warning(97); 503 } 504 tspec_t typ = suffix_type[u_suffix][l_suffix]; 505 506 bool warned = false; 507 errno = 0; 508 char *eptr; 509 uint64_t uq = (uint64_t)strtoull(cp, &eptr, base); 510 lint_assert(eptr == cp + len); 511 if (errno != 0) { 512 /* integer constant out of range */ 513 warning(252); 514 warned = true; 515 } 516 517 /* 518 * If the value is too big for the current type, we must choose 519 * another type. 520 */ 521 bool ansiu = false; 522 switch (typ) { 523 case INT: 524 if (uq <= TARG_INT_MAX) { 525 /* ok */ 526 } else if (uq <= TARG_UINT_MAX && base != 10) { 527 typ = UINT; 528 } else if (uq <= TARG_LONG_MAX) { 529 typ = LONG; 530 } else { 531 typ = ULONG; 532 if (uq > TARG_ULONG_MAX && !warned) { 533 /* integer constant out of range */ 534 warning(252); 535 } 536 } 537 if (typ == UINT || typ == ULONG) { 538 if (!allow_c90) { 539 typ = LONG; 540 } else if (allow_trad) { 541 /* 542 * Remember that the constant is unsigned 543 * only in ANSI C. 544 */ 545 ansiu = true; 546 } 547 } 548 break; 549 case UINT: 550 if (uq > TARG_UINT_MAX) { 551 typ = ULONG; 552 if (uq > TARG_ULONG_MAX && !warned) { 553 /* integer constant out of range */ 554 warning(252); 555 } 556 } 557 break; 558 case LONG: 559 if (uq > TARG_LONG_MAX && allow_c90) { 560 typ = ULONG; 561 if (allow_trad) 562 ansiu = true; 563 if (uq > TARG_ULONG_MAX && !warned) { 564 /* integer constant out of range */ 565 warning(252); 566 } 567 } 568 break; 569 case ULONG: 570 if (uq > TARG_ULONG_MAX && !warned) { 571 /* integer constant out of range */ 572 warning(252); 573 } 574 break; 575 case QUAD: 576 if (uq > TARG_QUAD_MAX && allow_c90) 577 typ = UQUAD; 578 break; 579 case UQUAD: 580 if (uq > TARG_UQUAD_MAX && !warned) { 581 /* integer constant out of range */ 582 warning(252); 583 } 584 break; 585 default: 586 break; 587 } 588 589 uq = (uint64_t)convert_integer((int64_t)uq, typ, 0); 590 591 yylval.y_val = xcalloc(1, sizeof(*yylval.y_val)); 592 yylval.y_val->v_tspec = typ; 593 yylval.y_val->v_unsigned_since_c90 = ansiu; 594 yylval.y_val->v_quad = (int64_t)uq; 595 596 return T_CON; 597 } 598 599 /* 600 * Extend or truncate q to match t. If t is signed, sign-extend. 601 * 602 * len is the number of significant bits. If len is 0, len is set 603 * to the width of type t. 604 */ 605 int64_t 606 convert_integer(int64_t q, tspec_t t, unsigned int len) 607 { 608 609 if (len == 0) 610 len = size_in_bits(t); 611 612 uint64_t vbits = value_bits(len); 613 return t == PTR || is_uinteger(t) || ((q & bit(len - 1)) == 0) 614 ? (int64_t)(q & vbits) 615 : (int64_t)(q | ~vbits); 616 } 617 618 int 619 lex_floating_constant(const char *yytext, size_t yyleng) 620 { 621 const char *cp = yytext; 622 size_t len = yyleng; 623 624 if (cp[len - 1] == 'i') 625 len--; /* imaginary, do nothing for now */ 626 627 char c = cp[len - 1]; 628 tspec_t typ; 629 if (c == 'f' || c == 'F') { 630 typ = FLOAT; 631 len--; 632 } else if (c == 'l' || c == 'L') { 633 typ = LDOUBLE; 634 len--; 635 } else 636 typ = DOUBLE; 637 638 if (!allow_c90 && typ != DOUBLE) { 639 /* suffixes F and L are illegal in traditional C */ 640 warning(98); 641 } 642 643 errno = 0; 644 char *eptr; 645 long double ld = strtold(cp, &eptr); 646 lint_assert(eptr == cp + len); 647 if (errno != 0) 648 /* floating-point constant out of range */ 649 warning(248); 650 651 if (typ == FLOAT) { 652 ld = (float)ld; 653 if (isfinite(ld) == 0) { 654 /* floating-point constant out of range */ 655 warning(248); 656 ld = ld > 0 ? FLT_MAX : -FLT_MAX; 657 } 658 } else if (typ == DOUBLE) { 659 ld = (double)ld; 660 if (isfinite(ld) == 0) { 661 /* floating-point constant out of range */ 662 warning(248); 663 ld = ld > 0 ? DBL_MAX : -DBL_MAX; 664 } 665 } 666 667 yylval.y_val = xcalloc(1, sizeof(*yylval.y_val)); 668 yylval.y_val->v_tspec = typ; 669 yylval.y_val->v_ldbl = ld; 670 671 return T_CON; 672 } 673 674 int 675 lex_operator(int t, op_t o) 676 { 677 678 yylval.y_op = o; 679 return t; 680 } 681 682 static int prev_byte = -1; 683 684 static int 685 read_escaped_oct(int c) 686 { 687 int n = 3; 688 int value = 0; 689 do { 690 value = (value << 3) + (c - '0'); 691 c = read_byte(); 692 } while (--n > 0 && '0' <= c && c <= '7'); 693 prev_byte = c; 694 if (value > TARG_UCHAR_MAX) { 695 /* character escape does not fit in character */ 696 warning(76); 697 value &= CHAR_MASK; 698 } 699 return value; 700 } 701 702 static unsigned int 703 read_escaped_hex(int c) 704 { 705 if (!allow_c90) 706 /* \x undefined in traditional C */ 707 warning(82); 708 unsigned int value = 0; 709 int state = 0; /* 0 = no digits, 1 = OK, 2 = overflow */ 710 while (c = read_byte(), isxdigit(c)) { 711 c = isdigit(c) ? c - '0' : toupper(c) - 'A' + 10; 712 value = (value << 4) + c; 713 if (state == 2) 714 continue; 715 if ((value & ~CHAR_MASK) != 0) { 716 /* overflow in hex escape */ 717 warning(75); 718 state = 2; 719 } else { 720 state = 1; 721 } 722 } 723 prev_byte = c; 724 if (state == 0) { 725 /* no hex digits follow \x */ 726 error(74); 727 } 728 if (state == 2) 729 value &= CHAR_MASK; 730 return value; 731 } 732 733 static int 734 read_escaped_backslash(int delim) 735 { 736 int c; 737 738 switch (c = read_byte()) { 739 case '"': 740 if (!allow_c90 && delim == '\'') 741 /* \" inside character constants undef... */ 742 warning(262); 743 return '"'; 744 case '\'': 745 return '\''; 746 case '?': 747 if (!allow_c90) 748 /* \? undefined in traditional C */ 749 warning(263); 750 return '?'; 751 case '\\': 752 return '\\'; 753 case 'a': 754 if (!allow_c90) 755 /* \a undefined in traditional C */ 756 warning(81); 757 return '\a'; 758 case 'b': 759 return '\b'; 760 case 'f': 761 return '\f'; 762 case 'n': 763 return '\n'; 764 case 'r': 765 return '\r'; 766 case 't': 767 return '\t'; 768 case 'v': 769 if (!allow_c90) 770 /* \v undefined in traditional C */ 771 warning(264); 772 return '\v'; 773 case '8': case '9': 774 /* bad octal digit %c */ 775 warning(77, c); 776 /* FALLTHROUGH */ 777 case '0': case '1': case '2': case '3': 778 case '4': case '5': case '6': case '7': 779 return read_escaped_oct(c); 780 case 'x': 781 return (int)read_escaped_hex(c); 782 case '\n': 783 return -3; 784 case EOF: 785 return -2; 786 default: 787 if (isprint(c)) { 788 /* dubious escape \%c */ 789 warning(79, c); 790 } else { 791 /* dubious escape \%o */ 792 warning(80, c); 793 } 794 return c; 795 } 796 } 797 798 /* 799 * Read a character which is part of a character constant or of a string 800 * and handle escapes. 801 * 802 * 'delim' is '\'' for character constants and '"' for string literals. 803 * 804 * Returns -1 if the end of the character constant or string is reached, 805 * -2 if the EOF is reached, and the character otherwise. 806 */ 807 static int 808 get_escaped_char(int delim) 809 { 810 811 int c = prev_byte; 812 if (c != -1) 813 prev_byte = -1; 814 else 815 c = read_byte(); 816 817 if (c == delim) 818 return -1; 819 switch (c) { 820 case '\n': 821 if (!allow_c90) { 822 /* newline in string or char constant */ 823 error(254); 824 return -2; 825 } 826 return c; 827 case '\0': 828 /* syntax error '%s' */ 829 error(249, "EOF or null byte in literal"); 830 return -2; 831 case EOF: 832 return -2; 833 case '\\': 834 c = read_escaped_backslash(delim); 835 if (c == -3) 836 return get_escaped_char(delim); 837 } 838 return c; 839 } 840 841 /* Called if lex found a leading "'". */ 842 int 843 lex_character_constant(void) 844 { 845 size_t n; 846 int val, c; 847 848 n = 0; 849 val = 0; 850 while ((c = get_escaped_char('\'')) >= 0) { 851 val = (int)((unsigned int)val << CHAR_SIZE) + c; 852 n++; 853 } 854 if (c == -2) { 855 /* unterminated character constant */ 856 error(253); 857 } else if (n > sizeof(int) || (n > 1 && (pflag || hflag))) { 858 /* 859 * XXX: ^^ should rather be sizeof(TARG_INT). Luckily, 860 * sizeof(int) is the same on all supported platforms. 861 */ 862 /* too many characters in character constant */ 863 error(71); 864 } else if (n > 1) { 865 /* multi-character character constant */ 866 warning(294); 867 } else if (n == 0) { 868 /* empty character constant */ 869 error(73); 870 } 871 if (n == 1) 872 val = (int)convert_integer(val, CHAR, CHAR_SIZE); 873 874 yylval.y_val = xcalloc(1, sizeof(*yylval.y_val)); 875 yylval.y_val->v_tspec = INT; 876 yylval.y_val->v_quad = val; 877 878 return T_CON; 879 } 880 881 /* 882 * Called if lex found a leading L\' 883 */ 884 int 885 lex_wide_character_constant(void) 886 { 887 static char buf[MB_LEN_MAX + 1]; 888 size_t n, nmax; 889 int c; 890 wchar_t wc; 891 892 nmax = MB_CUR_MAX; 893 894 n = 0; 895 while ((c = get_escaped_char('\'')) >= 0) { 896 if (n < nmax) 897 buf[n] = (char)c; 898 n++; 899 } 900 901 wc = 0; 902 903 if (c == -2) { 904 /* unterminated character constant */ 905 error(253); 906 } else if (n == 0) { 907 /* empty character constant */ 908 error(73); 909 } else if (n > nmax) { 910 n = nmax; 911 /* too many characters in character constant */ 912 error(71); 913 } else { 914 buf[n] = '\0'; 915 (void)mbtowc(NULL, NULL, 0); 916 if (mbtowc(&wc, buf, nmax) < 0) 917 /* invalid multibyte character */ 918 error(291); 919 } 920 921 yylval.y_val = xcalloc(1, sizeof(*yylval.y_val)); 922 yylval.y_val->v_tspec = WCHAR; 923 yylval.y_val->v_quad = wc; 924 925 return T_CON; 926 } 927 928 /* See https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html */ 929 static void 930 parse_line_directive_flags(const char *p, 931 bool *is_begin, bool *is_end, bool *is_system) 932 { 933 934 *is_begin = false; 935 *is_end = false; 936 *is_system = false; 937 938 while (*p != '\0') { 939 const char *word_start, *word_end; 940 941 while (ch_isspace(*p)) 942 p++; 943 944 word_start = p; 945 while (*p != '\0' && !ch_isspace(*p)) 946 p++; 947 word_end = p; 948 949 if (word_end - word_start == 1 && word_start[0] == '1') 950 *is_begin = true; 951 if (word_end - word_start == 1 && word_start[0] == '2') 952 *is_end = true; 953 if (word_end - word_start == 1 && word_start[0] == '3') 954 *is_system = true; 955 /* Flag '4' is only interesting for C++. */ 956 } 957 } 958 959 /* 960 * Called for preprocessor directives. Currently implemented are: 961 * # pragma [argument...] 962 * # lineno 963 * # lineno "filename" 964 * # lineno "filename" GCC-flag... 965 */ 966 void 967 lex_directive(const char *yytext) 968 { 969 const char *cp, *fn; 970 char c, *eptr; 971 size_t fnl; 972 long ln; 973 bool is_begin, is_end, is_system; 974 975 static bool first = true; 976 977 /* Go to first non-whitespace after # */ 978 for (cp = yytext + 1; (c = *cp) == ' ' || c == '\t'; cp++) 979 continue; 980 981 if (!ch_isdigit(c)) { 982 if (strncmp(cp, "pragma", 6) == 0 && ch_isspace(cp[6])) 983 return; 984 error: 985 /* undefined or invalid # directive */ 986 warning(255); 987 return; 988 } 989 ln = strtol(--cp, &eptr, 10); 990 if (eptr == cp) 991 goto error; 992 if ((c = *(cp = eptr)) != ' ' && c != '\t' && c != '\0') 993 goto error; 994 while ((c = *cp++) == ' ' || c == '\t') 995 continue; 996 if (c != '\0') { 997 if (c != '"') 998 goto error; 999 fn = cp; 1000 while ((c = *cp) != '"' && c != '\0') 1001 cp++; 1002 if (c != '"') 1003 goto error; 1004 if ((fnl = cp++ - fn) > PATH_MAX) 1005 goto error; 1006 /* empty string means stdin */ 1007 if (fnl == 0) { 1008 fn = "{standard input}"; 1009 fnl = 16; /* strlen (fn) */ 1010 } 1011 curr_pos.p_file = record_filename(fn, fnl); 1012 /* 1013 * If this is the first directive, the name is the name 1014 * of the C source file as specified at the command line. 1015 * It is written to the output file. 1016 */ 1017 if (first) { 1018 csrc_pos.p_file = curr_pos.p_file; 1019 outsrc(transform_filename(curr_pos.p_file, 1020 strlen(curr_pos.p_file))); 1021 first = false; 1022 } 1023 1024 parse_line_directive_flags(cp, &is_begin, &is_end, &is_system); 1025 update_location(curr_pos.p_file, (int)ln, is_begin, is_end); 1026 in_system_header = is_system; 1027 } 1028 curr_pos.p_line = (int)ln - 1; 1029 curr_pos.p_uniq = 0; 1030 if (curr_pos.p_file == csrc_pos.p_file) { 1031 csrc_pos.p_line = (int)ln - 1; 1032 csrc_pos.p_uniq = 0; 1033 } 1034 } 1035 1036 /* 1037 * Handle lint comments such as ARGSUSED. 1038 * 1039 * If one of these comments is recognized, the argument, if any, is 1040 * parsed and a function which handles this comment is called. 1041 */ 1042 void 1043 lex_comment(void) 1044 { 1045 int c; 1046 static const struct { 1047 const char *keywd; 1048 bool arg; 1049 void (*func)(int); 1050 } keywtab[] = { 1051 { "ARGSUSED", true, argsused }, 1052 { "BITFIELDTYPE", false, bitfieldtype }, 1053 { "CONSTCOND", false, constcond }, 1054 { "CONSTANTCOND", false, constcond }, 1055 { "CONSTANTCONDITION", false, constcond }, 1056 { "FALLTHRU", false, fallthru }, 1057 { "FALLTHROUGH", false, fallthru }, 1058 { "FALL THROUGH", false, fallthru }, 1059 { "fallthrough", false, fallthru }, 1060 { "LINTLIBRARY", false, lintlib }, 1061 { "LINTED", true, linted }, 1062 { "LONGLONG", false, longlong }, 1063 { "NOSTRICT", true, linted }, 1064 { "NOTREACHED", false, not_reached }, 1065 { "PRINTFLIKE", true, printflike }, 1066 { "PROTOLIB", true, protolib }, 1067 { "SCANFLIKE", true, scanflike }, 1068 { "VARARGS", true, varargs }, 1069 }; 1070 char keywd[32]; 1071 char arg[32]; 1072 size_t l, i; 1073 int a; 1074 1075 bool seen_end_of_comment = false; 1076 1077 /* Skip whitespace after the start of the comment */ 1078 while (c = read_byte(), isspace(c)) 1079 continue; 1080 1081 /* Read the potential keyword to keywd */ 1082 l = 0; 1083 while (c != EOF && l < sizeof(keywd) - 1 && 1084 (isalpha(c) || isspace(c))) { 1085 if (islower(c) && l > 0 && ch_isupper(keywd[0])) 1086 break; 1087 keywd[l++] = (char)c; 1088 c = read_byte(); 1089 } 1090 while (l > 0 && ch_isspace(keywd[l - 1])) 1091 l--; 1092 keywd[l] = '\0'; 1093 1094 /* look for the keyword */ 1095 for (i = 0; i < sizeof(keywtab) / sizeof(keywtab[0]); i++) { 1096 if (strcmp(keywtab[i].keywd, keywd) == 0) 1097 break; 1098 } 1099 if (i == sizeof(keywtab) / sizeof(keywtab[0])) 1100 goto skip_rest; 1101 1102 /* skip whitespace after the keyword */ 1103 while (isspace(c)) 1104 c = read_byte(); 1105 1106 /* read the argument, if the keyword accepts one and there is one */ 1107 l = 0; 1108 if (keywtab[i].arg) { 1109 while (isdigit(c) && l < sizeof(arg) - 1) { 1110 arg[l++] = (char)c; 1111 c = read_byte(); 1112 } 1113 } 1114 arg[l] = '\0'; 1115 a = l != 0 ? atoi(arg) : -1; 1116 1117 /* skip whitespace after the argument */ 1118 while (isspace(c)) 1119 c = read_byte(); 1120 1121 seen_end_of_comment = c == '*' && (c = read_byte()) == '/'; 1122 if (!seen_end_of_comment && keywtab[i].func != linted) 1123 /* extra characters in lint comment */ 1124 warning(257); 1125 1126 if (keywtab[i].func != NULL) 1127 keywtab[i].func(a); 1128 1129 skip_rest: 1130 while (!seen_end_of_comment) { 1131 int lc = c; 1132 if ((c = read_byte()) == EOF) { 1133 /* unterminated comment */ 1134 error(256); 1135 break; 1136 } 1137 if (lc == '*' && c == '/') 1138 seen_end_of_comment = true; 1139 } 1140 } 1141 1142 void 1143 lex_slash_slash_comment(void) 1144 { 1145 int c; 1146 1147 if (!allow_c99 && !allow_gcc) 1148 /* %s does not support // comments */ 1149 gnuism(312, allow_c90 ? "C90" : "traditional C"); 1150 1151 while ((c = read_byte()) != EOF && c != '\n') 1152 continue; 1153 } 1154 1155 /* 1156 * Clear flags for lint comments LINTED, LONGLONG and CONSTCOND. 1157 * clear_warn_flags is called after function definitions and global and 1158 * local declarations and definitions. It is also called between 1159 * the controlling expression and the body of control statements 1160 * (if, switch, for, while). 1161 */ 1162 void 1163 clear_warn_flags(void) 1164 { 1165 1166 lwarn = LWARN_ALL; 1167 quadflg = false; 1168 constcond_flag = false; 1169 } 1170 1171 int 1172 lex_string(void) 1173 { 1174 unsigned char *s; 1175 int c; 1176 size_t len, max; 1177 1178 s = xmalloc(max = 64); 1179 1180 len = 0; 1181 while ((c = get_escaped_char('"')) >= 0) { 1182 /* +1 to reserve space for a trailing NUL character */ 1183 if (len + 1 == max) 1184 s = xrealloc(s, max *= 2); 1185 s[len++] = (char)c; 1186 } 1187 s[len] = '\0'; 1188 if (c == -2) 1189 /* unterminated string constant */ 1190 error(258); 1191 1192 strg_t *strg = xcalloc(1, sizeof(*strg)); 1193 strg->st_char = true; 1194 strg->st_len = len; 1195 strg->st_mem = s; 1196 1197 yylval.y_string = strg; 1198 return T_STRING; 1199 } 1200 1201 int 1202 lex_wide_string(void) 1203 { 1204 int c, n; 1205 1206 size_t len = 0, max = 64; 1207 char *s = xmalloc(max); 1208 while ((c = get_escaped_char('"')) >= 0) { 1209 /* +1 to save space for a trailing NUL character */ 1210 if (len + 1 >= max) 1211 s = xrealloc(s, max *= 2); 1212 s[len++] = (char)c; 1213 } 1214 s[len] = '\0'; 1215 if (c == -2) 1216 /* unterminated string constant */ 1217 error(258); 1218 1219 /* get length of wide-character string */ 1220 (void)mblen(NULL, 0); 1221 size_t wlen = 0; 1222 for (size_t i = 0; i < len; i += n, wlen++) { 1223 if ((n = mblen(&s[i], MB_CUR_MAX)) == -1) { 1224 /* invalid multibyte character */ 1225 error(291); 1226 break; 1227 } 1228 if (n == 0) 1229 n = 1; 1230 } 1231 1232 wchar_t *ws = xmalloc((wlen + 1) * sizeof(*ws)); 1233 size_t wi = 0; 1234 /* convert from multibyte to wide char */ 1235 (void)mbtowc(NULL, NULL, 0); 1236 for (size_t i = 0; i < len; i += n, wi++) { 1237 if ((n = mbtowc(&ws[wi], &s[i], MB_CUR_MAX)) == -1) 1238 break; 1239 if (n == 0) 1240 n = 1; 1241 } 1242 ws[wi] = 0; 1243 free(s); 1244 1245 strg_t *strg = xcalloc(1, sizeof(*strg)); 1246 strg->st_char = false; 1247 strg->st_len = wlen; 1248 strg->st_mem = ws; 1249 1250 yylval.y_string = strg; 1251 return T_STRING; 1252 } 1253 1254 void 1255 lex_next_line(void) 1256 { 1257 curr_pos.p_line++; 1258 curr_pos.p_uniq = 0; 1259 debug_step("parsing %s:%d", curr_pos.p_file, curr_pos.p_line); 1260 if (curr_pos.p_file == csrc_pos.p_file) { 1261 csrc_pos.p_line++; 1262 csrc_pos.p_uniq = 0; 1263 } 1264 } 1265 1266 void 1267 lex_unknown_character(int c) 1268 { 1269 1270 /* unknown character \%o */ 1271 error(250, c); 1272 } 1273 1274 /* 1275 * The scanner does not create new symbol table entries for symbols it cannot 1276 * find in the symbol table. This is to avoid putting undeclared symbols into 1277 * the symbol table if a syntax error occurs. 1278 * 1279 * getsym is called as soon as it is probably ok to put the symbol in the 1280 * symbol table. It is still possible that symbols are put in the symbol 1281 * table that are not completely declared due to syntax errors. To avoid too 1282 * many problems in this case, symbols get type 'int' in getsym. 1283 * 1284 * XXX calls to getsym should be delayed until declare_1_* is called. 1285 */ 1286 sym_t * 1287 getsym(sbuf_t *sb) 1288 { 1289 1290 sym_t *sym = sb->sb_sym; 1291 1292 /* 1293 * During member declaration it is possible that name() looked 1294 * for symbols of type FVFT, although it should have looked for 1295 * symbols of type FTAG. Same can happen for labels. Both cases 1296 * are compensated here. 1297 */ 1298 if (symtyp == FMEMBER || symtyp == FLABEL) { 1299 if (sym == NULL || sym->s_kind == FVFT) 1300 sym = symtab_search(sb->sb_name); 1301 } 1302 1303 if (sym != NULL) { 1304 lint_assert(sym->s_kind == symtyp); 1305 symtyp = FVFT; 1306 free(sb); 1307 return sym; 1308 } 1309 1310 /* create a new symbol table entry */ 1311 1312 /* labels must always be allocated at level 1 (outermost block) */ 1313 dinfo_t *di; 1314 if (symtyp == FLABEL) { 1315 sym = level_zero_alloc(1, sizeof(*sym)); 1316 char *s = level_zero_alloc(1, sb->sb_len + 1); 1317 (void)memcpy(s, sb->sb_name, sb->sb_len + 1); 1318 sym->s_name = s; 1319 sym->s_block_level = 1; 1320 di = dcs; 1321 while (di->d_enclosing != NULL && 1322 di->d_enclosing->d_enclosing != NULL) 1323 di = di->d_enclosing; 1324 lint_assert(di->d_kind == DK_AUTO); 1325 } else { 1326 sym = block_zero_alloc(sizeof(*sym)); 1327 sym->s_name = sb->sb_name; 1328 sym->s_block_level = block_level; 1329 di = dcs; 1330 } 1331 1332 UNIQUE_CURR_POS(sym->s_def_pos); 1333 if ((sym->s_kind = symtyp) != FLABEL) 1334 sym->s_type = gettyp(INT); 1335 1336 symtyp = FVFT; 1337 1338 if (!in_gcc_attribute) { 1339 symtab_add(sym); 1340 1341 *di->d_ldlsym = sym; 1342 di->d_ldlsym = &sym->s_level_next; 1343 } 1344 1345 free(sb); 1346 return sym; 1347 } 1348 1349 /* 1350 * Construct a temporary symbol. The symbol name starts with a digit to avoid 1351 * name clashes with other identifiers. 1352 */ 1353 sym_t * 1354 mktempsym(type_t *tp) 1355 { 1356 static unsigned n = 0; 1357 char *s = level_zero_alloc((size_t)block_level, 64); 1358 sym_t *sym = block_zero_alloc(sizeof(*sym)); 1359 scl_t scl; 1360 1361 (void)snprintf(s, 64, "%.8u_tmp", n++); 1362 1363 scl = dcs->d_scl; 1364 if (scl == NOSCL) 1365 scl = block_level > 0 ? AUTO : EXTERN; 1366 1367 sym->s_name = s; 1368 sym->s_type = tp; 1369 sym->s_block_level = block_level; 1370 sym->s_scl = scl; 1371 sym->s_kind = FVFT; 1372 sym->s_used = true; 1373 sym->s_set = true; 1374 1375 symtab_add(sym); 1376 1377 *dcs->d_ldlsym = sym; 1378 dcs->d_ldlsym = &sym->s_level_next; 1379 1380 return sym; 1381 } 1382 1383 /* Remove a symbol forever from the symbol table. */ 1384 void 1385 rmsym(sym_t *sym) 1386 { 1387 1388 debug_step("rmsym '%s' %s '%s'", 1389 sym->s_name, symt_name(sym->s_kind), type_name(sym->s_type)); 1390 symtab_remove(sym); 1391 1392 /* avoid that the symbol will later be put back to the symbol table */ 1393 sym->s_block_level = -1; 1394 } 1395 1396 /* 1397 * Remove all symbols from the symbol table that have the same level as the 1398 * given symbol. 1399 */ 1400 void 1401 rmsyms(sym_t *syms) 1402 { 1403 sym_t *sym; 1404 1405 /* Note the use of s_level_next instead of s_symtab_next. */ 1406 for (sym = syms; sym != NULL; sym = sym->s_level_next) { 1407 if (sym->s_block_level != -1) { 1408 debug_step("rmsyms '%s' %s '%s'", 1409 sym->s_name, symt_name(sym->s_kind), 1410 type_name(sym->s_type)); 1411 symtab_remove(sym); 1412 sym->s_symtab_ref = NULL; 1413 } 1414 } 1415 } 1416 1417 /* Put a symbol into the symbol table. */ 1418 void 1419 inssym(int level, sym_t *sym) 1420 { 1421 1422 debug_step("inssym '%s' %s '%s'", 1423 sym->s_name, symt_name(sym->s_kind), type_name(sym->s_type)); 1424 symtab_add(sym); 1425 sym->s_block_level = level; 1426 1427 /* 1428 * Placing the inner symbols to the beginning of the list ensures 1429 * that these symbols are preferred over symbols from the outer 1430 * blocks that happen to have the same name. 1431 */ 1432 const sym_t *next = sym->s_symtab_next; 1433 if (next != NULL) 1434 lint_assert(sym->s_block_level >= next->s_block_level); 1435 } 1436 1437 /* Called at level 0 after syntax errors. */ 1438 void 1439 clean_up_after_error(void) 1440 { 1441 1442 symtab_remove_locals(); 1443 1444 while (mem_block_level > 0) 1445 level_free_all(mem_block_level--); 1446 } 1447 1448 /* Create a new symbol with the same name as an existing symbol. */ 1449 sym_t * 1450 pushdown(const sym_t *sym) 1451 { 1452 sym_t *nsym; 1453 1454 debug_step("pushdown '%s' %s '%s'", 1455 sym->s_name, symt_name(sym->s_kind), type_name(sym->s_type)); 1456 nsym = block_zero_alloc(sizeof(*nsym)); 1457 lint_assert(sym->s_block_level <= block_level); 1458 nsym->s_name = sym->s_name; 1459 UNIQUE_CURR_POS(nsym->s_def_pos); 1460 nsym->s_kind = sym->s_kind; 1461 nsym->s_block_level = block_level; 1462 1463 symtab_add(nsym); 1464 1465 *dcs->d_ldlsym = nsym; 1466 dcs->d_ldlsym = &nsym->s_level_next; 1467 1468 return nsym; 1469 } 1470 1471 /* 1472 * Free any dynamically allocated memory referenced by 1473 * the value stack or yylval. 1474 * The type of information in yylval is described by tok. 1475 */ 1476 void 1477 freeyyv(void *sp, int tok) 1478 { 1479 if (tok == T_NAME || tok == T_TYPENAME) { 1480 sbuf_t *sb = *(sbuf_t **)sp; 1481 free(sb); 1482 } else if (tok == T_CON) { 1483 val_t *val = *(val_t **)sp; 1484 free(val); 1485 } else if (tok == T_STRING) { 1486 strg_t *strg = *(strg_t **)sp; 1487 free(strg->st_mem); 1488 free(strg); 1489 } 1490 } 1491