1 /* $NetBSD: lexi.c,v 1.242 2023/12/03 21:44:42 rillig Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-4-Clause 5 * 6 * Copyright (c) 1985 Sun Microsystems, Inc. 7 * Copyright (c) 1980, 1993 8 * The Regents of the University of California. All rights reserved. 9 * All rights reserved. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 */ 39 40 #include <sys/cdefs.h> 41 __RCSID("$NetBSD: lexi.c,v 1.242 2023/12/03 21:44:42 rillig Exp $"); 42 43 #include <stdlib.h> 44 #include <string.h> 45 46 #include "indent.h" 47 48 /* must be sorted alphabetically, is used in binary search */ 49 static const struct keyword { 50 const char name[12]; 51 lexer_symbol lsym; 52 } keywords[] = { 53 {"_Bool", lsym_type}, 54 {"_Complex", lsym_modifier}, 55 {"_Imaginary", lsym_modifier}, 56 {"auto", lsym_modifier}, 57 {"bool", lsym_type}, 58 {"break", lsym_word}, 59 {"case", lsym_case}, 60 {"char", lsym_type}, 61 {"complex", lsym_modifier}, 62 {"const", lsym_modifier}, 63 {"continue", lsym_word}, 64 {"default", lsym_default}, 65 {"do", lsym_do}, 66 {"double", lsym_type}, 67 {"else", lsym_else}, 68 {"enum", lsym_tag}, 69 {"extern", lsym_modifier}, 70 {"float", lsym_type}, 71 {"for", lsym_for}, 72 {"goto", lsym_word}, 73 {"if", lsym_if}, 74 {"imaginary", lsym_modifier}, 75 {"inline", lsym_modifier}, 76 {"int", lsym_type}, 77 {"long", lsym_type}, 78 {"offsetof", lsym_offsetof}, 79 {"register", lsym_modifier}, 80 {"restrict", lsym_word}, 81 {"return", lsym_return}, 82 {"short", lsym_type}, 83 {"signed", lsym_type}, 84 {"sizeof", lsym_sizeof}, 85 {"static", lsym_modifier}, 86 {"struct", lsym_tag}, 87 {"switch", lsym_switch}, 88 {"typedef", lsym_typedef}, 89 {"union", lsym_tag}, 90 {"unsigned", lsym_type}, 91 {"void", lsym_type}, 92 {"volatile", lsym_modifier}, 93 {"while", lsym_while} 94 }; 95 96 static struct { 97 const char **items; 98 unsigned int len; 99 unsigned int cap; 100 } typenames; 101 102 /*- 103 * The transition table below was rewritten by hand from lx's output, given 104 * the following definitions. lx is Katherine Flavel's lexer generator. 105 * 106 * O = /[0-7]/; D = /[0-9]/; NZ = /[1-9]/; 107 * H = /[a-f0-9]/i; B = /[0-1]/; HP = /0x/i; 108 * BP = /0b/i; E = /e[+\-]?/i D+; P = /p[+\-]?/i D+; 109 * FS = /[fl]/i; IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?; 110 * 111 * D+ E FS? -> $float; 112 * D* "." D+ E? FS? -> $float; 113 * D+ "." E? FS? -> $float; HP H+ IS? -> $int; 114 * HP H+ P FS? -> $float; NZ D* IS? -> $int; 115 * HP H* "." H+ P FS? -> $float; "0" O* IS? -> $int; 116 * HP H+ "." P FS -> $float; BP B+ IS? -> $int; 117 */ 118 /* INDENT OFF */ 119 static const unsigned char lex_number_state[][26] = { 120 /* examples: 121 00 122 s 0xx 123 t 00xaa 124 a 11 101100xxa.. 125 r 11ee0001101lbuuxx.a.pp 126 t.01.e+008bLuxll0Ll.aa.p+0 127 states: ABCDEFGHIJKLMNOPQRSTUVWXYZ */ 128 [0] = "uuiifuufiuuiiuiiiiiuiuuuuu", /* (other) */ 129 [1] = "CEIDEHHHIJQ U Q VUVVZZZ", /* 0 */ 130 [2] = "DEIDEHHHIJQ U Q VUVVZZZ", /* 1 */ 131 [3] = "DEIDEHHHIJ U VUVVZZZ", /* 2 3 4 5 6 7 */ 132 [4] = "DEJDEHHHJJ U VUVVZZZ", /* 8 9 */ 133 [5] = " U VUVV ", /* A a C c D d */ 134 [6] = " K U VUVV ", /* B b */ 135 [7] = " FFF FF U VUVV ", /* E e */ 136 [8] = " f f U VUVV f", /* F f */ 137 [9] = " LLf fL PR Li L f", /* L */ 138 [10] = " OOf fO S P O i O f", /* l */ 139 [11] = " FFX ", /* P p */ 140 [12] = " MM M i iiM M ", /* U u */ 141 [13] = " N ", /* X x */ 142 [14] = " G Y ", /* + - */ 143 [15] = "B EE EE T W ", /* . */ 144 /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */ 145 }; 146 /* INDENT ON */ 147 148 static const unsigned char lex_number_row[] = { 149 ['0'] = 1, 150 ['1'] = 2, 151 ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3, 152 ['8'] = 4, ['9'] = 4, 153 ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5, 154 ['B'] = 6, ['b'] = 6, 155 ['E'] = 7, ['e'] = 7, 156 ['F'] = 8, ['f'] = 8, 157 ['L'] = 9, 158 ['l'] = 10, 159 ['P'] = 11, ['p'] = 11, 160 ['U'] = 12, ['u'] = 12, 161 ['X'] = 13, ['x'] = 13, 162 ['+'] = 14, ['-'] = 14, 163 ['.'] = 15, 164 }; 165 166 167 static bool 168 is_identifier_start(char ch) 169 { 170 return ch_isalpha(ch) || ch == '_' || ch == '$'; 171 } 172 173 static bool 174 is_identifier_part(char ch) 175 { 176 return ch_isalnum(ch) || ch == '_' || ch == '$'; 177 } 178 179 static void 180 token_add_char(char ch) 181 { 182 buf_add_char(&token, ch); 183 } 184 185 static bool 186 skip_line_continuation(void) 187 { 188 if (in.p[0] == '\\' && in.p[1] == '\n') { 189 in.p++; 190 inp_skip(); 191 in.token_end_line++; 192 return true; 193 } 194 return false; 195 } 196 197 static void 198 lex_number(void) 199 { 200 for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) { 201 unsigned char ch = (unsigned char)*in.p; 202 if (skip_line_continuation()) 203 continue; 204 if (ch >= array_length(lex_number_row) 205 || lex_number_row[ch] == 0) 206 break; 207 208 unsigned char row = lex_number_row[ch]; 209 if (lex_number_state[row][s - 'A'] == ' ') { 210 // lex_number_state[0][s - 'A'] now indicates the type: 211 // f = floating, i = integer, u = unknown 212 return; 213 } 214 215 s = lex_number_state[row][s - 'A']; 216 token_add_char(inp_next()); 217 } 218 } 219 220 static void 221 lex_word(void) 222 { 223 for (;;) { 224 if (is_identifier_part(*in.p)) 225 token_add_char(*in.p++); 226 else if (skip_line_continuation()) 227 continue; 228 else 229 return; 230 } 231 } 232 233 static void 234 lex_char_or_string(void) 235 { 236 for (char delim = token.s[token.len - 1];;) { 237 if (*in.p == '\n') { 238 diag(1, "Unterminated literal"); 239 return; 240 } 241 242 token_add_char(*in.p++); 243 if (token.s[token.len - 1] == delim) 244 return; 245 246 if (token.s[token.len - 1] == '\\') { 247 if (*in.p == '\n') 248 in.token_end_line++; 249 token_add_char(inp_next()); 250 } 251 } 252 } 253 254 /* Guess whether the current token is a declared type. */ 255 static bool 256 probably_typename(void) 257 { 258 if (ps.prev_lsym == lsym_modifier) 259 return true; 260 if (ps.in_init) 261 return false; 262 if (ps.in_stmt_or_decl) /* XXX: this condition looks incorrect */ 263 return false; 264 if (ps.prev_lsym == lsym_semicolon 265 || ps.prev_lsym == lsym_lbrace 266 || ps.prev_lsym == lsym_rbrace) { 267 if (in.p[0] == '*' && in.p[1] != '=') 268 return true; 269 /* XXX: is_identifier_start */ 270 if (ch_isalpha(in.p[0])) 271 return true; 272 } 273 return false; 274 } 275 276 static int 277 bsearch_typenames(const char *key) 278 { 279 const char **arr = typenames.items; 280 unsigned lo = 0; 281 unsigned hi = typenames.len; 282 283 while (lo < hi) { 284 unsigned mid = (lo + hi) / 2; 285 int cmp = strcmp(arr[mid], key); 286 if (cmp < 0) 287 lo = mid + 1; 288 else if (cmp > 0) 289 hi = mid; 290 else 291 return (int)mid; 292 } 293 return -1 - (int)lo; 294 } 295 296 static bool 297 is_typename(void) 298 { 299 if (ps.prev_lsym == lsym_tag) 300 return true; 301 if (opt.auto_typedefs && 302 token.len >= 2 && memcmp(token.s + token.len - 2, "_t", 2) == 0) 303 return true; 304 305 return bsearch_typenames(token.s) >= 0; 306 } 307 308 void 309 register_typename(const char *name) 310 { 311 if (typenames.len >= typenames.cap) { 312 typenames.cap = 16 + 2 * typenames.cap; 313 typenames.items = nonnull(realloc(typenames.items, 314 sizeof(typenames.items[0]) * typenames.cap)); 315 } 316 317 int pos = bsearch_typenames(name); 318 if (pos >= 0) 319 return; /* already in the list */ 320 321 pos = -1 - pos; 322 memmove(typenames.items + pos + 1, typenames.items + pos, 323 sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos)); 324 typenames.items[pos] = nonnull(strdup(name)); 325 } 326 327 static int 328 cmp_keyword_by_name(const void *key, const void *elem) 329 { 330 return strcmp(key, ((const struct keyword *)elem)->name); 331 } 332 333 /* 334 * Looking at the '(', guess whether this starts a function definition or a 335 * function declaration. 336 */ 337 static bool 338 probably_function_definition(const char *p) 339 { 340 // TODO: Don't look at characters in comments, see lsym_funcname.c. 341 int paren_level = 0; 342 for (; *p != '\n'; p++) { 343 if (*p == '(') 344 paren_level++; 345 if (*p == ')' && --paren_level == 0) { 346 p++; 347 348 while (*p != '\n' 349 && (ch_isspace(*p) || is_identifier_part(*p))) 350 p++; /* '__dead' or '__unused' */ 351 352 if (*p == '\n') /* func(...) */ 353 break; 354 if (*p == ';') /* func(...); */ 355 return false; 356 if (*p == ',') /* double abs(), pi; */ 357 return false; 358 if (*p == '(') /* func(...) __attribute__((...)) */ 359 paren_level++; /* func(...) __printflike(...) 360 */ 361 else 362 break; /* func(...) { ... */ 363 } 364 365 if (paren_level == 1 && p[0] == '*' && p[1] == ',') 366 return false; 367 } 368 369 /* 370 * To further reduce the cases where indent wrongly treats an 371 * incomplete function declaration as a function definition, thus 372 * adding a newline before the function name, it may be worth looking 373 * for parameter names, as these are often omitted in function 374 * declarations and only included in function definitions. Or just 375 * increase the lookahead to more than just the current line of input, 376 * until the next '{'. 377 */ 378 return true; 379 } 380 381 static lexer_symbol 382 lexi_alnum(void) 383 { 384 if (ch_isdigit(in.p[0]) || 385 (in.p[0] == '.' && ch_isdigit(in.p[1]))) { 386 lex_number(); 387 } else if (is_identifier_start(in.p[0])) { 388 lex_word(); 389 390 if (token.len == 1 && token.s[0] == 'L' && 391 (in.p[0] == '"' || in.p[0] == '\'')) { 392 token_add_char(*in.p++); 393 lex_char_or_string(); 394 ps.next_unary = false; 395 return lsym_word; 396 } 397 } else 398 return lsym_eof; /* just as a placeholder */ 399 400 while (ch_isblank(*in.p)) 401 in.p++; 402 403 ps.next_unary = ps.prev_lsym == lsym_tag 404 || ps.prev_lsym == lsym_typedef 405 || (ps.prev_lsym == lsym_modifier && *in.p == '*'); 406 407 if (ps.prev_lsym == lsym_tag && ps.paren.len == 0) 408 return lsym_type; 409 if (ps.spaced_expr_psym == psym_for_exprs 410 && ps.prev_lsym == lsym_lparen && ps.paren.len == 1 411 && *in.p == '*') { 412 ps.next_unary = true; 413 return lsym_type; 414 } 415 416 token_add_char('\0'); // Terminate in non-debug mode as well. 417 token.len--; 418 const struct keyword *kw = bsearch(token.s, keywords, 419 array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name); 420 lexer_symbol lsym = lsym_word; 421 if (kw != NULL) { 422 lsym = kw->lsym; 423 ps.next_unary = true; 424 if (lsym == lsym_tag || lsym == lsym_type) 425 goto found_typename; 426 return lsym; 427 } 428 429 if (is_typename()) { 430 lsym = lsym_type; 431 ps.next_unary = true; 432 found_typename: 433 if (ps.prev_lsym != lsym_period 434 && ps.prev_lsym != lsym_unary_op) { 435 if (lsym == lsym_tag) 436 return lsym_tag; 437 if (ps.paren.len == 0) 438 return lsym_type; 439 } 440 } 441 442 const char *p = in.p; 443 if (*p == ')') 444 p++; 445 if (*p == '(' && ps.psyms.len < 3 && ps.ind_level == 0 && 446 !ps.in_func_def_params && !ps.in_init) { 447 448 bool maybe_function_definition = *in.p == ')' 449 ? ps.paren.len == 1 && ps.prev_lsym != lsym_unary_op 450 : ps.paren.len == 0; 451 if (maybe_function_definition 452 && probably_function_definition(p)) { 453 ps.line_has_func_def = true; 454 if (ps.in_decl) 455 ps.in_func_def_params = true; 456 return lsym_funcname; 457 } 458 459 } else if (ps.paren.len == 0 && probably_typename()) { 460 ps.next_unary = true; 461 return lsym_type; 462 } 463 464 return lsym; 465 } 466 467 static void 468 check_parenthesized_function_definition(void) 469 { 470 const char *p = in.p; 471 while (ch_isblank(*p)) 472 p++; 473 if (is_identifier_start(*p)) 474 while (is_identifier_part(*p)) 475 p++; 476 while (ch_isblank(*p)) 477 p++; 478 if (*p == ')') { 479 p++; 480 while (ch_isblank(*p)) 481 p++; 482 if (*p == '(' && probably_function_definition(p)) 483 ps.line_has_func_def = true; 484 } 485 } 486 487 static bool 488 is_asterisk_unary(void) 489 { 490 const char *p = in.p; 491 while (*p == '*' || ch_isblank(*p)) 492 p++; 493 if (*p == ')') 494 return true; 495 if (ps.next_unary || ps.in_func_def_params) 496 return true; 497 if (ps.prev_lsym == lsym_word || 498 ps.prev_lsym == lsym_rparen || 499 ps.prev_lsym == lsym_rbracket) 500 return false; 501 return ps.in_decl && ps.paren.len > 0; 502 } 503 504 static bool 505 probably_in_function_definition(void) 506 { 507 for (const char *p = in.p; *p != '\n';) { 508 if (ch_isspace(*p)) 509 p++; 510 else if (is_identifier_start(*p)) { 511 p++; 512 while (is_identifier_part(*p)) 513 p++; 514 } else 515 return *p == '('; 516 } 517 return false; 518 } 519 520 static void 521 lex_asterisk_unary(void) 522 { 523 while (*in.p == '*' || ch_isspace(*in.p)) { 524 if (*in.p == '*') 525 token_add_char('*'); 526 if (*in.p == '\n') 527 in.token_end_line++; 528 inp_skip(); 529 } 530 531 if (ps.in_decl && probably_in_function_definition()) 532 ps.line_has_func_def = true; 533 } 534 535 static bool 536 skip(const char **pp, const char *s) 537 { 538 size_t len = strlen(s); 539 while (ch_isblank(**pp)) 540 (*pp)++; 541 if (strncmp(*pp, s, len) == 0) { 542 *pp += len; 543 return true; 544 } 545 return false; 546 } 547 548 static void 549 lex_indent_comment(void) 550 { 551 const char *p = in.line.s; 552 if (skip(&p, "/*") && skip(&p, "INDENT")) { 553 enum indent_enabled enabled; 554 if (skip(&p, "ON") || *p == '*') 555 enabled = indent_last_off_line; 556 else if (skip(&p, "OFF")) 557 enabled = indent_off; 558 else 559 return; 560 if (skip(&p, "*/\n")) { 561 if (lab.len > 0 || code.len > 0 || com.len > 0) 562 output_line(); 563 indent_enabled = enabled; 564 } 565 } 566 } 567 568 /* Reads the next token, placing it in the global variable "token". */ 569 lexer_symbol 570 lexi(void) 571 { 572 buf_clear(&token); 573 574 for (;;) { 575 if (ch_isblank(*in.p)) 576 in.p++; 577 else if (skip_line_continuation()) 578 continue; 579 else 580 break; 581 } 582 in.token_start_line = in.token_end_line; 583 584 lexer_symbol alnum_lsym = lexi_alnum(); 585 if (alnum_lsym != lsym_eof) 586 return alnum_lsym; 587 588 /* Scan a non-alphanumeric token */ 589 590 token_add_char(inp_next()); 591 592 lexer_symbol lsym; 593 bool next_unary; 594 595 switch (token.s[token.len - 1]) { 596 597 case '#': 598 lsym = lsym_preprocessing; 599 next_unary = ps.next_unary; 600 break; 601 602 case '\n': 603 /* if data has been exhausted, the '\n' is a dummy. */ 604 lsym = had_eof ? lsym_eof : lsym_newline; 605 next_unary = ps.next_unary; 606 break; 607 608 /* INDENT OFF */ 609 case ')': lsym = lsym_rparen; next_unary = false; break; 610 case '[': lsym = lsym_lbracket; next_unary = true; break; 611 case ']': lsym = lsym_rbracket; next_unary = false; break; 612 case '{': lsym = lsym_lbrace; next_unary = true; break; 613 case '}': lsym = lsym_rbrace; next_unary = true; break; 614 case '.': lsym = lsym_period; next_unary = false; break; 615 case '?': lsym = lsym_question; next_unary = true; break; 616 case ',': lsym = lsym_comma; next_unary = true; break; 617 case ';': lsym = lsym_semicolon; next_unary = true; break; 618 /* INDENT ON */ 619 620 case '(': 621 if (in.p == in.line.s + 1) 622 check_parenthesized_function_definition(); 623 lsym = lsym_lparen; 624 next_unary = true; 625 break; 626 627 case '+': 628 case '-': 629 lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op; 630 next_unary = true; 631 632 /* '++' or '--' */ 633 if (*in.p == token.s[token.len - 1]) { 634 token_add_char(*in.p++); 635 if (ps.prev_lsym == lsym_word || 636 ps.prev_lsym == lsym_rparen || 637 ps.prev_lsym == lsym_rbracket) { 638 lsym = ps.next_unary 639 ? lsym_unary_op : lsym_postfix_op; 640 next_unary = false; 641 } 642 643 } else if (*in.p == '=') { /* '+=' or '-=' */ 644 token_add_char(*in.p++); 645 646 } else if (*in.p == '>') { /* '->' */ 647 token_add_char(*in.p++); 648 lsym = lsym_unary_op; 649 next_unary = false; 650 ps.want_blank = false; 651 } 652 break; 653 654 case ':': 655 lsym = ps.quest_level > 0 656 ? (ps.quest_level--, lsym_question_colon) 657 : ps.in_var_decl ? lsym_other_colon : lsym_label_colon; 658 next_unary = true; 659 break; 660 661 case '*': 662 if (*in.p == '=') { 663 token_add_char(*in.p++); 664 lsym = lsym_binary_op; 665 } else if (is_asterisk_unary()) { 666 lex_asterisk_unary(); 667 lsym = lsym_unary_op; 668 } else 669 lsym = lsym_binary_op; 670 next_unary = true; 671 break; 672 673 case '=': 674 if (ps.in_var_decl) 675 ps.in_init = true; 676 if (*in.p == '=') 677 token_add_char(*in.p++); 678 lsym = lsym_binary_op; 679 next_unary = true; 680 break; 681 682 case '>': 683 case '<': 684 case '!': /* ops like <, <<, <=, !=, etc. */ 685 if (*in.p == '>' || *in.p == '<' || *in.p == '=') 686 token_add_char(*in.p++); 687 if (*in.p == '=') 688 token_add_char(*in.p++); 689 lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op; 690 next_unary = true; 691 break; 692 693 case '\'': 694 case '"': 695 lex_char_or_string(); 696 lsym = lsym_word; 697 next_unary = false; 698 break; 699 700 default: 701 if (token.s[token.len - 1] == '/' 702 && (*in.p == '*' || *in.p == '/')) { 703 enum indent_enabled prev = indent_enabled; 704 lex_indent_comment(); 705 if (prev == indent_on && indent_enabled == indent_off) 706 buf_clear(&out.indent_off_text); 707 token_add_char(*in.p++); 708 lsym = lsym_comment; 709 next_unary = ps.next_unary; 710 break; 711 } 712 713 /* punctuation like '%', '&&', '/', '^', '||', '~' */ 714 lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op; 715 if (*in.p == token.s[token.len - 1]) 716 token_add_char(*in.p++), lsym = lsym_binary_op; 717 if (*in.p == '=') 718 token_add_char(*in.p++), lsym = lsym_binary_op; 719 720 next_unary = true; 721 } 722 723 ps.next_unary = next_unary; 724 725 return lsym; 726 } 727