1 /* $OpenBSD: lexi.c,v 1.16 2013/11/26 13:21:17 deraadt Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 7 * Copyright (c) 1985 Sun Microsystems, Inc. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Here we have the token scanner for indent. It scans off one token and puts 37 * it in the global variable "token". It returns a code, indicating the type 38 * of token scanned. 39 */ 40 41 #include <stdio.h> 42 #include <ctype.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <err.h> 46 #include "indent_globs.h" 47 #include "indent_codes.h" 48 49 #define alphanum 1 50 #define opchar 3 51 52 struct templ { 53 char *rwd; 54 int rwcode; 55 }; 56 57 struct templ specialsinit[] = { 58 { "switch", 1 }, 59 { "case", 2 }, 60 { "break", 0 }, 61 { "struct", 3 }, 62 { "union", 3 }, 63 { "enum", 3 }, 64 { "default", 2 }, 65 { "int", 4 }, 66 { "char", 4 }, 67 { "float", 4 }, 68 { "double", 4 }, 69 { "long", 4 }, 70 { "short", 4 }, 71 { "typdef", 4 }, 72 { "unsigned", 4 }, 73 { "register", 4 }, 74 { "static", 4 }, 75 { "global", 4 }, 76 { "extern", 4 }, 77 { "void", 4 }, 78 { "goto", 0 }, 79 { "return", 0 }, 80 { "if", 5 }, 81 { "while", 5 }, 82 { "for", 5 }, 83 { "else", 6 }, 84 { "do", 6 }, 85 { "sizeof", 7 }, 86 }; 87 88 struct templ *specials = specialsinit; 89 int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]); 90 int maxspecials; 91 92 char chartype[128] = 93 { /* this is used to facilitate the decision of 94 * what type (alphanumeric, operator) each 95 * character is */ 96 0, 0, 0, 0, 0, 0, 0, 0, 97 0, 0, 0, 0, 0, 0, 0, 0, 98 0, 0, 0, 0, 0, 0, 0, 0, 99 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 3, 0, 0, 1, 3, 3, 0, 101 0, 0, 3, 3, 0, 3, 0, 3, 102 1, 1, 1, 1, 1, 1, 1, 1, 103 1, 1, 0, 0, 3, 3, 3, 3, 104 0, 1, 1, 1, 1, 1, 1, 1, 105 1, 1, 1, 1, 1, 1, 1, 1, 106 1, 1, 1, 1, 1, 1, 1, 1, 107 1, 1, 1, 0, 0, 0, 3, 1, 108 0, 1, 1, 1, 1, 1, 1, 1, 109 1, 1, 1, 1, 1, 1, 1, 1, 110 1, 1, 1, 1, 1, 1, 1, 1, 111 1, 1, 1, 0, 3, 0, 3, 0 112 }; 113 114 115 116 117 int 118 lexi(void) 119 { 120 int unary_delim; /* this is set to 1 if the current token 121 * forces a following operator to be unary */ 122 static int last_code; /* the last token type returned */ 123 static int l_struct; /* set to 1 if the last token was 'struct' */ 124 int code; /* internal code to be returned */ 125 char qchar; /* the delimiter character for a string */ 126 int i; 127 128 e_token = s_token; /* point to start of place to save token */ 129 unary_delim = false; 130 ps.col_1 = ps.last_nl; /* tell world that this token started in 131 * column 1 iff the last thing scanned was nl */ 132 ps.last_nl = false; 133 134 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 135 ps.col_1 = false; /* leading blanks imply token is not in column 136 * 1 */ 137 if (++buf_ptr >= buf_end) 138 fill_buffer(); 139 } 140 141 /* Scan an alphanumeric token */ 142 if (chartype[(int)*buf_ptr] == alphanum || 143 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 144 /* 145 * we have a character or number 146 */ 147 char *j; /* used for searching thru list of 148 * reserved words */ 149 if (isdigit((unsigned char)*buf_ptr) || 150 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 151 int seendot = 0, 152 seenexp = 0, 153 seensfx = 0; 154 if (*buf_ptr == '0' && 155 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 156 *e_token++ = *buf_ptr++; 157 *e_token++ = *buf_ptr++; 158 while (isxdigit(*buf_ptr)) { 159 CHECK_SIZE_TOKEN; 160 *e_token++ = *buf_ptr++; 161 } 162 } 163 else 164 while (1) { 165 if (*buf_ptr == '.') { 166 if (seendot) 167 break; 168 else 169 seendot++; 170 } 171 CHECK_SIZE_TOKEN; 172 *e_token++ = *buf_ptr++; 173 if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') { 174 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 175 break; 176 else { 177 seenexp++; 178 seendot++; 179 CHECK_SIZE_TOKEN; 180 *e_token++ = *buf_ptr++; 181 if (*buf_ptr == '+' || *buf_ptr == '-') 182 *e_token++ = *buf_ptr++; 183 } 184 } 185 } 186 while (1) { 187 if (!(seensfx & 1) && 188 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 189 CHECK_SIZE_TOKEN; 190 *e_token++ = *buf_ptr++; 191 seensfx |= 1; 192 continue; 193 } 194 if (!(seensfx & 2) && 195 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 196 CHECK_SIZE_TOKEN; 197 if (buf_ptr[1] == buf_ptr[0]) 198 *e_token++ = *buf_ptr++; 199 *e_token++ = *buf_ptr++; 200 seensfx |= 2; 201 continue; 202 } 203 break; 204 } 205 } 206 else 207 while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */ 208 CHECK_SIZE_TOKEN; 209 *e_token++ = *buf_ptr++; 210 if (buf_ptr >= buf_end) 211 fill_buffer(); 212 } 213 *e_token++ = '\0'; 214 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 215 if (++buf_ptr >= buf_end) 216 fill_buffer(); 217 } 218 ps.its_a_keyword = false; 219 ps.sizeof_keyword = false; 220 if (l_struct) { /* if last token was 'struct', then this token 221 * should be treated as a declaration */ 222 l_struct = false; 223 last_code = ident; 224 ps.last_u_d = true; 225 return (decl); 226 } 227 ps.last_u_d = false; /* Operator after identifier is binary */ 228 last_code = ident; /* Remember that this is the code we will 229 * return */ 230 231 /* 232 * This loop will check if the token is a keyword. 233 */ 234 for (i = 0; i < nspecials; i++) { 235 char *p = s_token; /* point at scanned token */ 236 j = specials[i].rwd; 237 if (*j++ != *p++ || *j++ != *p++) 238 continue; /* This test depends on the fact that 239 * identifiers are always at least 1 character 240 * long (ie. the first two bytes of the 241 * identifier are always meaningful) */ 242 if (p[-1] == 0) 243 break; /* If its a one-character identifier */ 244 while (*p++ == *j) 245 if (*j++ == 0) 246 goto found_keyword; /* I wish that C had a multi-level 247 * break... */ 248 } 249 if (i < nspecials) { /* we have a keyword */ 250 found_keyword: 251 ps.its_a_keyword = true; 252 ps.last_u_d = true; 253 switch (specials[i].rwcode) { 254 case 1: /* it is a switch */ 255 return (swstmt); 256 case 2: /* a case or default */ 257 return (casestmt); 258 259 case 3: /* a "struct" */ 260 if (ps.p_l_follow) 261 break; /* inside parens: cast */ 262 l_struct = true; 263 264 /* 265 * Next time around, we will want to know that we have had a 266 * 'struct' 267 */ 268 case 4: /* one of the declaration keywords */ 269 if (ps.p_l_follow) { 270 ps.cast_mask |= 1 << ps.p_l_follow; 271 break; /* inside parens: cast */ 272 } 273 last_code = decl; 274 return (decl); 275 276 case 5: /* if, while, for */ 277 return (sp_paren); 278 279 case 6: /* do, else */ 280 return (sp_nparen); 281 282 case 7: 283 ps.sizeof_keyword = true; 284 default: /* all others are treated like any other 285 * identifier */ 286 return (ident); 287 } /* end of switch */ 288 } /* end of if (found_it) */ 289 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 290 char *tp = buf_ptr; 291 while (tp < buf_end) 292 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 293 goto not_proc; 294 strlcpy(ps.procname, token, sizeof ps.procname); 295 ps.in_parameter_declaration = 1; 296 rparen_count = 1; 297 not_proc:; 298 } 299 /* 300 * The following hack attempts to guess whether or not the current 301 * token is in fact a declaration keyword -- one that has been 302 * typedefd 303 */ 304 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || 305 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_') 306 && !ps.p_l_follow 307 && !ps.block_init 308 && (ps.last_token == rparen || ps.last_token == semicolon || 309 ps.last_token == decl || 310 ps.last_token == lbrace || ps.last_token == rbrace)) { 311 ps.its_a_keyword = true; 312 ps.last_u_d = true; 313 last_code = decl; 314 return decl; 315 } 316 if (last_code == decl) /* if this is a declared variable, then 317 * following sign is unary */ 318 ps.last_u_d = true; /* will make "int a -1" work */ 319 last_code = ident; 320 return (ident); /* the ident is not in the list */ 321 } /* end of procesing for alpanum character */ 322 323 /* Scan a non-alphanumeric token */ 324 325 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 326 * moved here */ 327 *e_token = '\0'; 328 if (++buf_ptr >= buf_end) 329 fill_buffer(); 330 331 switch (*token) { 332 case '\n': 333 unary_delim = ps.last_u_d; 334 ps.last_nl = true; /* remember that we just had a newline */ 335 code = (had_eof ? 0 : newline); 336 337 /* 338 * if data has been exausted, the newline is a dummy, and we should 339 * return code to stop 340 */ 341 break; 342 343 case '\'': /* start of quoted character */ 344 case '"': /* start of string */ 345 qchar = *token; 346 if (troff) { 347 e_token[-1] = '`'; 348 if (qchar == '"') 349 *e_token++ = '`'; 350 e_token = chfont(&bodyf, &stringf, e_token); 351 } 352 do { /* copy the string */ 353 while (1) { /* move one character or [/<char>]<char> */ 354 if (*buf_ptr == '\n') { 355 printf("%d: Unterminated literal\n", line_no); 356 goto stop_lit; 357 } 358 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 359 * since CHECK_SIZE guarantees that there 360 * are at least 5 entries left */ 361 *e_token = *buf_ptr++; 362 if (buf_ptr >= buf_end) 363 fill_buffer(); 364 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 365 if (*buf_ptr == '\n') /* check for escaped newline */ 366 ++line_no; 367 if (troff) { 368 *++e_token = BACKSLASH; 369 if (*buf_ptr == BACKSLASH) 370 *++e_token = BACKSLASH; 371 } 372 *++e_token = *buf_ptr++; 373 ++e_token; /* we must increment this again because we 374 * copied two chars */ 375 if (buf_ptr >= buf_end) 376 fill_buffer(); 377 } 378 else 379 break; /* we copied one character */ 380 } /* end of while (1) */ 381 } while (*e_token++ != qchar); 382 if (troff) { 383 e_token = chfont(&stringf, &bodyf, e_token - 1); 384 if (qchar == '"') 385 *e_token++ = '\''; 386 } 387 stop_lit: 388 code = ident; 389 break; 390 391 case ('('): 392 case ('['): 393 unary_delim = true; 394 code = lparen; 395 break; 396 397 case (')'): 398 case (']'): 399 code = rparen; 400 break; 401 402 case '#': 403 unary_delim = ps.last_u_d; 404 code = preesc; 405 break; 406 407 case '?': 408 unary_delim = true; 409 code = question; 410 break; 411 412 case (':'): 413 code = colon; 414 unary_delim = true; 415 break; 416 417 case (';'): 418 unary_delim = true; 419 code = semicolon; 420 break; 421 422 case ('{'): 423 unary_delim = true; 424 425 /* 426 * if (ps.in_or_st) ps.block_init = 1; 427 */ 428 /* ? code = ps.block_init ? lparen : lbrace; */ 429 code = lbrace; 430 break; 431 432 case ('}'): 433 unary_delim = true; 434 /* ? code = ps.block_init ? rparen : rbrace; */ 435 code = rbrace; 436 break; 437 438 case 014: /* a form feed */ 439 unary_delim = ps.last_u_d; 440 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 441 * right */ 442 code = form_feed; 443 break; 444 445 case (','): 446 unary_delim = true; 447 code = comma; 448 break; 449 450 case '.': 451 unary_delim = false; 452 code = period; 453 break; 454 455 case '-': 456 case '+': /* check for -, +, --, ++ */ 457 code = (ps.last_u_d ? unary_op : binary_op); 458 unary_delim = true; 459 460 if (*buf_ptr == token[0]) { 461 /* check for doubled character */ 462 *e_token++ = *buf_ptr++; 463 /* buffer overflow will be checked at end of loop */ 464 if (last_code == ident || last_code == rparen) { 465 code = (ps.last_u_d ? unary_op : postop); 466 /* check for following ++ or -- */ 467 unary_delim = false; 468 } 469 } 470 else if (*buf_ptr == '=') 471 /* check for operator += */ 472 *e_token++ = *buf_ptr++; 473 else if (*buf_ptr == '>') { 474 /* check for operator -> */ 475 *e_token++ = *buf_ptr++; 476 if (!pointer_as_binop) { 477 unary_delim = false; 478 code = unary_op; 479 ps.want_blank = false; 480 } 481 } 482 break; /* buffer overflow will be checked at end of 483 * switch */ 484 485 case '=': 486 if (ps.in_or_st) 487 ps.block_init = 1; 488 #ifdef undef 489 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 490 e_token[-1] = *buf_ptr++; 491 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 492 *e_token++ = *buf_ptr++; 493 *e_token++ = '='; /* Flip =+ to += */ 494 *e_token = 0; 495 } 496 #else 497 if (*buf_ptr == '=') {/* == */ 498 *e_token++ = '='; /* Flip =+ to += */ 499 buf_ptr++; 500 *e_token = 0; 501 } 502 #endif 503 code = binary_op; 504 unary_delim = true; 505 break; 506 /* can drop thru!!! */ 507 508 case '>': 509 case '<': 510 case '!': /* ops like <, <<, <=, !=, etc */ 511 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 512 *e_token++ = *buf_ptr; 513 if (++buf_ptr >= buf_end) 514 fill_buffer(); 515 } 516 if (*buf_ptr == '=') 517 *e_token++ = *buf_ptr++; 518 code = (ps.last_u_d ? unary_op : binary_op); 519 unary_delim = true; 520 break; 521 522 default: 523 if (token[0] == '/' && *buf_ptr == '*') { 524 /* it is start of comment */ 525 *e_token++ = '*'; 526 527 if (++buf_ptr >= buf_end) 528 fill_buffer(); 529 530 code = comment; 531 unary_delim = ps.last_u_d; 532 break; 533 } 534 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 535 /* 536 * handle ||, &&, etc, and also things as in int *****i 537 */ 538 *e_token++ = *buf_ptr; 539 if (++buf_ptr >= buf_end) 540 fill_buffer(); 541 } 542 code = (ps.last_u_d ? unary_op : binary_op); 543 unary_delim = true; 544 545 546 } /* end of switch */ 547 if (code != newline) { 548 l_struct = false; 549 last_code = code; 550 } 551 if (buf_ptr >= buf_end) /* check for input buffer empty */ 552 fill_buffer(); 553 ps.last_u_d = unary_delim; 554 *e_token = '\0'; /* null terminate the token */ 555 return (code); 556 } 557 558 /* 559 * Add the given keyword to the keyword table, using val as the keyword type 560 */ 561 void 562 addkey(char *key, int val) 563 { 564 struct templ *p; 565 int i; 566 567 for (i = 0; i < nspecials; i++) { 568 p = &specials[i]; 569 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 570 return; 571 } 572 573 if (specials == specialsinit) { 574 /* 575 * Whoa. Must reallocate special table. 576 */ 577 nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]); 578 maxspecials = nspecials + (nspecials >> 2); 579 specials = (struct templ *)calloc(maxspecials, sizeof specials[0]); 580 if (specials == NULL) 581 err(1, NULL); 582 memcpy(specials, specialsinit, sizeof specialsinit); 583 } else if (nspecials >= maxspecials) { 584 int newspecials = maxspecials + (maxspecials >> 2); 585 struct templ *specials2; 586 587 specials2 = realloc(specials, newspecials * sizeof specials[0]); 588 if (specials2 == NULL) 589 err(1, NULL); 590 specials = specials2; 591 maxspecials = newspecials; 592 } 593 594 p = &specials[nspecials]; 595 p->rwd = key; 596 p->rwcode = val; 597 nspecials++; 598 return; 599 } 600