1 /* $OpenBSD: lexi.c,v 1.14 2007/11/27 16:22:14 martynas Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 7 * Copyright (c) 1985 Sun Microsystems, Inc. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #ifndef lint 36 /*static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";*/ 37 static char rcsid[] = "$OpenBSD: lexi.c,v 1.14 2007/11/27 16:22:14 martynas Exp $"; 38 #endif /* not lint */ 39 40 /* 41 * Here we have the token scanner for indent. It scans off one token and puts 42 * it in the global variable "token". It returns a code, indicating the type 43 * of token scanned. 44 */ 45 46 #include <stdio.h> 47 #include <ctype.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <err.h> 51 #include "indent_globs.h" 52 #include "indent_codes.h" 53 54 #define alphanum 1 55 #define opchar 3 56 57 struct templ { 58 char *rwd; 59 int rwcode; 60 }; 61 62 struct templ specialsinit[] = { 63 { "switch", 1 }, 64 { "case", 2 }, 65 { "break", 0 }, 66 { "struct", 3 }, 67 { "union", 3 }, 68 { "enum", 3 }, 69 { "default", 2 }, 70 { "int", 4 }, 71 { "char", 4 }, 72 { "float", 4 }, 73 { "double", 4 }, 74 { "long", 4 }, 75 { "short", 4 }, 76 { "typdef", 4 }, 77 { "unsigned", 4 }, 78 { "register", 4 }, 79 { "static", 4 }, 80 { "global", 4 }, 81 { "extern", 4 }, 82 { "void", 4 }, 83 { "goto", 0 }, 84 { "return", 0 }, 85 { "if", 5 }, 86 { "while", 5 }, 87 { "for", 5 }, 88 { "else", 6 }, 89 { "do", 6 }, 90 { "sizeof", 7 }, 91 }; 92 93 struct templ *specials = specialsinit; 94 int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]); 95 int maxspecials; 96 97 char chartype[128] = 98 { /* this is used to facilitate the decision of 99 * what type (alphanumeric, operator) each 100 * character is */ 101 0, 0, 0, 0, 0, 0, 0, 0, 102 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 0, 0, 0, 0, 0, 0, 0, 105 0, 3, 0, 0, 1, 3, 3, 0, 106 0, 0, 3, 3, 0, 3, 0, 3, 107 1, 1, 1, 1, 1, 1, 1, 1, 108 1, 1, 0, 0, 3, 3, 3, 3, 109 0, 1, 1, 1, 1, 1, 1, 1, 110 1, 1, 1, 1, 1, 1, 1, 1, 111 1, 1, 1, 1, 1, 1, 1, 1, 112 1, 1, 1, 0, 0, 0, 3, 1, 113 0, 1, 1, 1, 1, 1, 1, 1, 114 1, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 0, 3, 0, 3, 0 117 }; 118 119 120 121 122 int 123 lexi(void) 124 { 125 int unary_delim; /* this is set to 1 if the current token 126 * forces a following operator to be unary */ 127 static int last_code; /* the last token type returned */ 128 static int l_struct; /* set to 1 if the last token was 'struct' */ 129 int code; /* internal code to be returned */ 130 char qchar; /* the delimiter character for a string */ 131 int i; 132 133 e_token = s_token; /* point to start of place to save token */ 134 unary_delim = false; 135 ps.col_1 = ps.last_nl; /* tell world that this token started in 136 * column 1 iff the last thing scanned was nl */ 137 ps.last_nl = false; 138 139 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 140 ps.col_1 = false; /* leading blanks imply token is not in column 141 * 1 */ 142 if (++buf_ptr >= buf_end) 143 fill_buffer(); 144 } 145 146 /* Scan an alphanumeric token */ 147 if (chartype[(int)*buf_ptr] == alphanum || 148 (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 149 /* 150 * we have a character or number 151 */ 152 char *j; /* used for searching thru list of 153 * reserved words */ 154 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 155 int seendot = 0, 156 seenexp = 0, 157 seensfx = 0; 158 if (*buf_ptr == '0' && 159 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 160 *e_token++ = *buf_ptr++; 161 *e_token++ = *buf_ptr++; 162 while (isxdigit(*buf_ptr)) { 163 CHECK_SIZE_TOKEN; 164 *e_token++ = *buf_ptr++; 165 } 166 } 167 else 168 while (1) { 169 if (*buf_ptr == '.') { 170 if (seendot) 171 break; 172 else 173 seendot++; 174 } 175 CHECK_SIZE_TOKEN; 176 *e_token++ = *buf_ptr++; 177 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 178 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 179 break; 180 else { 181 seenexp++; 182 seendot++; 183 CHECK_SIZE_TOKEN; 184 *e_token++ = *buf_ptr++; 185 if (*buf_ptr == '+' || *buf_ptr == '-') 186 *e_token++ = *buf_ptr++; 187 } 188 } 189 } 190 while (1) { 191 if (!(seensfx & 1) && 192 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 193 CHECK_SIZE_TOKEN; 194 *e_token++ = *buf_ptr++; 195 seensfx |= 1; 196 continue; 197 } 198 if (!(seensfx & 2) && 199 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 200 CHECK_SIZE_TOKEN; 201 if (buf_ptr[1] == buf_ptr[0]) 202 *e_token++ = *buf_ptr++; 203 *e_token++ = *buf_ptr++; 204 seensfx |= 2; 205 continue; 206 } 207 break; 208 } 209 } 210 else 211 while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */ 212 CHECK_SIZE_TOKEN; 213 *e_token++ = *buf_ptr++; 214 if (buf_ptr >= buf_end) 215 fill_buffer(); 216 } 217 *e_token++ = '\0'; 218 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 219 if (++buf_ptr >= buf_end) 220 fill_buffer(); 221 } 222 ps.its_a_keyword = false; 223 ps.sizeof_keyword = false; 224 if (l_struct) { /* if last token was 'struct', then this token 225 * should be treated as a declaration */ 226 l_struct = false; 227 last_code = ident; 228 ps.last_u_d = true; 229 return (decl); 230 } 231 ps.last_u_d = false; /* Operator after identifier is binary */ 232 last_code = ident; /* Remember that this is the code we will 233 * return */ 234 235 /* 236 * This loop will check if the token is a keyword. 237 */ 238 for (i = 0; i < nspecials; i++) { 239 char *p = s_token; /* point at scanned token */ 240 j = specials[i].rwd; 241 if (*j++ != *p++ || *j++ != *p++) 242 continue; /* This test depends on the fact that 243 * identifiers are always at least 1 character 244 * long (ie. the first two bytes of the 245 * identifier are always meaningful) */ 246 if (p[-1] == 0) 247 break; /* If its a one-character identifier */ 248 while (*p++ == *j) 249 if (*j++ == 0) 250 goto found_keyword; /* I wish that C had a multi-level 251 * break... */ 252 } 253 if (i < nspecials) { /* we have a keyword */ 254 found_keyword: 255 ps.its_a_keyword = true; 256 ps.last_u_d = true; 257 switch (specials[i].rwcode) { 258 case 1: /* it is a switch */ 259 return (swstmt); 260 case 2: /* a case or default */ 261 return (casestmt); 262 263 case 3: /* a "struct" */ 264 if (ps.p_l_follow) 265 break; /* inside parens: cast */ 266 l_struct = true; 267 268 /* 269 * Next time around, we will want to know that we have had a 270 * 'struct' 271 */ 272 case 4: /* one of the declaration keywords */ 273 if (ps.p_l_follow) { 274 ps.cast_mask |= 1 << ps.p_l_follow; 275 break; /* inside parens: cast */ 276 } 277 last_code = decl; 278 return (decl); 279 280 case 5: /* if, while, for */ 281 return (sp_paren); 282 283 case 6: /* do, else */ 284 return (sp_nparen); 285 286 case 7: 287 ps.sizeof_keyword = true; 288 default: /* all others are treated like any other 289 * identifier */ 290 return (ident); 291 } /* end of switch */ 292 } /* end of if (found_it) */ 293 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 294 char *tp = buf_ptr; 295 while (tp < buf_end) 296 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 297 goto not_proc; 298 strlcpy(ps.procname, token, sizeof ps.procname); 299 ps.in_parameter_declaration = 1; 300 rparen_count = 1; 301 not_proc:; 302 } 303 /* 304 * The following hack attempts to guess whether or not the current 305 * token is in fact a declaration keyword -- one that has been 306 * typedefd 307 */ 308 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 309 && !ps.p_l_follow 310 && !ps.block_init 311 && (ps.last_token == rparen || ps.last_token == semicolon || 312 ps.last_token == decl || 313 ps.last_token == lbrace || ps.last_token == rbrace)) { 314 ps.its_a_keyword = true; 315 ps.last_u_d = true; 316 last_code = decl; 317 return decl; 318 } 319 if (last_code == decl) /* if this is a declared variable, then 320 * following sign is unary */ 321 ps.last_u_d = true; /* will make "int a -1" work */ 322 last_code = ident; 323 return (ident); /* the ident is not in the list */ 324 } /* end of procesing for alpanum character */ 325 326 /* Scan a non-alphanumeric token */ 327 328 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 329 * moved here */ 330 *e_token = '\0'; 331 if (++buf_ptr >= buf_end) 332 fill_buffer(); 333 334 switch (*token) { 335 case '\n': 336 unary_delim = ps.last_u_d; 337 ps.last_nl = true; /* remember that we just had a newline */ 338 code = (had_eof ? 0 : newline); 339 340 /* 341 * if data has been exausted, the newline is a dummy, and we should 342 * return code to stop 343 */ 344 break; 345 346 case '\'': /* start of quoted character */ 347 case '"': /* start of string */ 348 qchar = *token; 349 if (troff) { 350 e_token[-1] = '`'; 351 if (qchar == '"') 352 *e_token++ = '`'; 353 e_token = chfont(&bodyf, &stringf, e_token); 354 } 355 do { /* copy the string */ 356 while (1) { /* move one character or [/<char>]<char> */ 357 if (*buf_ptr == '\n') { 358 printf("%d: Unterminated literal\n", line_no); 359 goto stop_lit; 360 } 361 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 362 * since CHECK_SIZE guarantees that there 363 * are at least 5 entries left */ 364 *e_token = *buf_ptr++; 365 if (buf_ptr >= buf_end) 366 fill_buffer(); 367 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 368 if (*buf_ptr == '\n') /* check for escaped newline */ 369 ++line_no; 370 if (troff) { 371 *++e_token = BACKSLASH; 372 if (*buf_ptr == BACKSLASH) 373 *++e_token = BACKSLASH; 374 } 375 *++e_token = *buf_ptr++; 376 ++e_token; /* we must increment this again because we 377 * copied two chars */ 378 if (buf_ptr >= buf_end) 379 fill_buffer(); 380 } 381 else 382 break; /* we copied one character */ 383 } /* end of while (1) */ 384 } while (*e_token++ != qchar); 385 if (troff) { 386 e_token = chfont(&stringf, &bodyf, e_token - 1); 387 if (qchar == '"') 388 *e_token++ = '\''; 389 } 390 stop_lit: 391 code = ident; 392 break; 393 394 case ('('): 395 case ('['): 396 unary_delim = true; 397 code = lparen; 398 break; 399 400 case (')'): 401 case (']'): 402 code = rparen; 403 break; 404 405 case '#': 406 unary_delim = ps.last_u_d; 407 code = preesc; 408 break; 409 410 case '?': 411 unary_delim = true; 412 code = question; 413 break; 414 415 case (':'): 416 code = colon; 417 unary_delim = true; 418 break; 419 420 case (';'): 421 unary_delim = true; 422 code = semicolon; 423 break; 424 425 case ('{'): 426 unary_delim = true; 427 428 /* 429 * if (ps.in_or_st) ps.block_init = 1; 430 */ 431 /* ? code = ps.block_init ? lparen : lbrace; */ 432 code = lbrace; 433 break; 434 435 case ('}'): 436 unary_delim = true; 437 /* ? code = ps.block_init ? rparen : rbrace; */ 438 code = rbrace; 439 break; 440 441 case 014: /* a form feed */ 442 unary_delim = ps.last_u_d; 443 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 444 * right */ 445 code = form_feed; 446 break; 447 448 case (','): 449 unary_delim = true; 450 code = comma; 451 break; 452 453 case '.': 454 unary_delim = false; 455 code = period; 456 break; 457 458 case '-': 459 case '+': /* check for -, +, --, ++ */ 460 code = (ps.last_u_d ? unary_op : binary_op); 461 unary_delim = true; 462 463 if (*buf_ptr == token[0]) { 464 /* check for doubled character */ 465 *e_token++ = *buf_ptr++; 466 /* buffer overflow will be checked at end of loop */ 467 if (last_code == ident || last_code == rparen) { 468 code = (ps.last_u_d ? unary_op : postop); 469 /* check for following ++ or -- */ 470 unary_delim = false; 471 } 472 } 473 else if (*buf_ptr == '=') 474 /* check for operator += */ 475 *e_token++ = *buf_ptr++; 476 else if (*buf_ptr == '>') { 477 /* check for operator -> */ 478 *e_token++ = *buf_ptr++; 479 if (!pointer_as_binop) { 480 unary_delim = false; 481 code = unary_op; 482 ps.want_blank = false; 483 } 484 } 485 break; /* buffer overflow will be checked at end of 486 * switch */ 487 488 case '=': 489 if (ps.in_or_st) 490 ps.block_init = 1; 491 #ifdef undef 492 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 493 e_token[-1] = *buf_ptr++; 494 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 495 *e_token++ = *buf_ptr++; 496 *e_token++ = '='; /* Flip =+ to += */ 497 *e_token = 0; 498 } 499 #else 500 if (*buf_ptr == '=') {/* == */ 501 *e_token++ = '='; /* Flip =+ to += */ 502 buf_ptr++; 503 *e_token = 0; 504 } 505 #endif 506 code = binary_op; 507 unary_delim = true; 508 break; 509 /* can drop thru!!! */ 510 511 case '>': 512 case '<': 513 case '!': /* ops like <, <<, <=, !=, etc */ 514 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 515 *e_token++ = *buf_ptr; 516 if (++buf_ptr >= buf_end) 517 fill_buffer(); 518 } 519 if (*buf_ptr == '=') 520 *e_token++ = *buf_ptr++; 521 code = (ps.last_u_d ? unary_op : binary_op); 522 unary_delim = true; 523 break; 524 525 default: 526 if (token[0] == '/' && *buf_ptr == '*') { 527 /* it is start of comment */ 528 *e_token++ = '*'; 529 530 if (++buf_ptr >= buf_end) 531 fill_buffer(); 532 533 code = comment; 534 unary_delim = ps.last_u_d; 535 break; 536 } 537 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 538 /* 539 * handle ||, &&, etc, and also things as in int *****i 540 */ 541 *e_token++ = *buf_ptr; 542 if (++buf_ptr >= buf_end) 543 fill_buffer(); 544 } 545 code = (ps.last_u_d ? unary_op : binary_op); 546 unary_delim = true; 547 548 549 } /* end of switch */ 550 if (code != newline) { 551 l_struct = false; 552 last_code = code; 553 } 554 if (buf_ptr >= buf_end) /* check for input buffer empty */ 555 fill_buffer(); 556 ps.last_u_d = unary_delim; 557 *e_token = '\0'; /* null terminate the token */ 558 return (code); 559 } 560 561 /* 562 * Add the given keyword to the keyword table, using val as the keyword type 563 */ 564 void 565 addkey(char *key, int val) 566 { 567 struct templ *p; 568 int i; 569 570 for (i = 0; i < nspecials; i++) { 571 p = &specials[i]; 572 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 573 return; 574 } 575 576 if (specials == specialsinit) { 577 /* 578 * Whoa. Must reallocate special table. 579 */ 580 nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]); 581 maxspecials = nspecials + (nspecials >> 2); 582 specials = (struct templ *)calloc(maxspecials, sizeof specials[0]); 583 if (specials == NULL) 584 err(1, NULL); 585 memcpy(specials, specialsinit, sizeof specialsinit); 586 } else if (nspecials >= maxspecials) { 587 int newspecials = maxspecials + (maxspecials >> 2); 588 struct templ *specials2; 589 590 specials2 = realloc(specials, newspecials * sizeof specials[0]); 591 if (specials2 == NULL) 592 err(1, NULL); 593 specials = specials2; 594 maxspecials = newspecials; 595 } 596 597 p = &specials[nspecials]; 598 p->rwd = key; 599 p->rwcode = val; 600 nspecials++; 601 return; 602 } 603