1 /* $OpenBSD: lexi.c,v 1.8 2001/06/25 04:58:31 pjanzen Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 7 * Copyright (c) 1985 Sun Microsystems, Inc. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 #ifndef lint 40 /*static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";*/ 41 static char rcsid[] = "$OpenBSD: lexi.c,v 1.8 2001/06/25 04:58:31 pjanzen Exp $"; 42 #endif /* not lint */ 43 44 /* 45 * Here we have the token scanner for indent. It scans off one token and puts 46 * it in the global variable "token". It returns a code, indicating the type 47 * of token scanned. 48 */ 49 50 #include <stdio.h> 51 #include <ctype.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #include <err.h> 55 #include "indent_globs.h" 56 #include "indent_codes.h" 57 58 #define alphanum 1 59 #define opchar 3 60 61 struct templ { 62 char *rwd; 63 int rwcode; 64 }; 65 66 struct templ specialsinit[] = { 67 { "switch", 1 }, 68 { "case", 2 }, 69 { "break", 0 }, 70 { "struct", 3 }, 71 { "union", 3 }, 72 { "enum", 3 }, 73 { "default", 2 }, 74 { "int", 4 }, 75 { "char", 4 }, 76 { "float", 4 }, 77 { "double", 4 }, 78 { "long", 4 }, 79 { "short", 4 }, 80 { "typdef", 4 }, 81 { "unsigned", 4 }, 82 { "register", 4 }, 83 { "static", 4 }, 84 { "global", 4 }, 85 { "extern", 4 }, 86 { "void", 4 }, 87 { "goto", 0 }, 88 { "return", 0 }, 89 { "if", 5 }, 90 { "while", 5 }, 91 { "for", 5 }, 92 { "else", 6 }, 93 { "do", 6 }, 94 { "sizeof", 7 }, 95 }; 96 97 struct templ *specials = specialsinit; 98 int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]); 99 int maxspecials; 100 101 char chartype[128] = 102 { /* this is used to facilitate the decision of 103 * what type (alphanumeric, operator) each 104 * character is */ 105 0, 0, 0, 0, 0, 0, 0, 0, 106 0, 0, 0, 0, 0, 0, 0, 0, 107 0, 0, 0, 0, 0, 0, 0, 0, 108 0, 0, 0, 0, 0, 0, 0, 0, 109 0, 3, 0, 0, 1, 3, 3, 0, 110 0, 0, 3, 3, 0, 3, 0, 3, 111 1, 1, 1, 1, 1, 1, 1, 1, 112 1, 1, 0, 0, 3, 3, 3, 3, 113 0, 1, 1, 1, 1, 1, 1, 1, 114 1, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 0, 0, 0, 3, 1, 117 0, 1, 1, 1, 1, 1, 1, 1, 118 1, 1, 1, 1, 1, 1, 1, 1, 119 1, 1, 1, 1, 1, 1, 1, 1, 120 1, 1, 1, 0, 3, 0, 3, 0 121 }; 122 123 124 125 126 int 127 lexi() 128 { 129 int unary_delim; /* this is set to 1 if the current token 130 * forces a following operator to be unary */ 131 static int last_code; /* the last token type returned */ 132 static int l_struct; /* set to 1 if the last token was 'struct' */ 133 int code; /* internal code to be returned */ 134 char qchar; /* the delimiter character for a string */ 135 int i; 136 137 e_token = s_token; /* point to start of place to save token */ 138 unary_delim = false; 139 ps.col_1 = ps.last_nl; /* tell world that this token started in 140 * column 1 iff the last thing scanned was nl */ 141 ps.last_nl = false; 142 143 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 144 ps.col_1 = false; /* leading blanks imply token is not in column 145 * 1 */ 146 if (++buf_ptr >= buf_end) 147 fill_buffer(); 148 } 149 150 /* Scan an alphanumeric token */ 151 if (chartype[(int)*buf_ptr] == alphanum || 152 (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 153 /* 154 * we have a character or number 155 */ 156 char *j; /* used for searching thru list of 157 * reserved words */ 158 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 159 int seendot = 0, 160 seenexp = 0, 161 seensfx = 0; 162 if (*buf_ptr == '0' && 163 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 164 *e_token++ = *buf_ptr++; 165 *e_token++ = *buf_ptr++; 166 while (isxdigit(*buf_ptr)) { 167 CHECK_SIZE_TOKEN; 168 *e_token++ = *buf_ptr++; 169 } 170 } 171 else 172 while (1) { 173 if (*buf_ptr == '.') { 174 if (seendot) 175 break; 176 else 177 seendot++; 178 } 179 CHECK_SIZE_TOKEN; 180 *e_token++ = *buf_ptr++; 181 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 182 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 183 break; 184 else { 185 seenexp++; 186 seendot++; 187 CHECK_SIZE_TOKEN; 188 *e_token++ = *buf_ptr++; 189 if (*buf_ptr == '+' || *buf_ptr == '-') 190 *e_token++ = *buf_ptr++; 191 } 192 } 193 } 194 while (1) { 195 if (!(seensfx & 1) && 196 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 197 CHECK_SIZE_TOKEN; 198 *e_token++ = *buf_ptr++; 199 seensfx |= 1; 200 continue; 201 } 202 if (!(seensfx & 2) && 203 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 204 CHECK_SIZE_TOKEN; 205 if (buf_ptr[1] == buf_ptr[0]) 206 *e_token++ = *buf_ptr++; 207 *e_token++ = *buf_ptr++; 208 seensfx |= 2; 209 continue; 210 } 211 break; 212 } 213 } 214 else 215 while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */ 216 CHECK_SIZE_TOKEN; 217 *e_token++ = *buf_ptr++; 218 if (buf_ptr >= buf_end) 219 fill_buffer(); 220 } 221 *e_token++ = '\0'; 222 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 223 if (++buf_ptr >= buf_end) 224 fill_buffer(); 225 } 226 ps.its_a_keyword = false; 227 ps.sizeof_keyword = false; 228 if (l_struct) { /* if last token was 'struct', then this token 229 * should be treated as a declaration */ 230 l_struct = false; 231 last_code = ident; 232 ps.last_u_d = true; 233 return (decl); 234 } 235 ps.last_u_d = false; /* Operator after indentifier is binary */ 236 last_code = ident; /* Remember that this is the code we will 237 * return */ 238 239 /* 240 * This loop will check if the token is a keyword. 241 */ 242 for (i = 0; i < nspecials; i++) { 243 char *p = s_token; /* point at scanned token */ 244 j = specials[i].rwd; 245 if (*j++ != *p++ || *j++ != *p++) 246 continue; /* This test depends on the fact that 247 * identifiers are always at least 1 character 248 * long (ie. the first two bytes of the 249 * identifier are always meaningful) */ 250 if (p[-1] == 0) 251 break; /* If its a one-character identifier */ 252 while (*p++ == *j) 253 if (*j++ == 0) 254 goto found_keyword; /* I wish that C had a multi-level 255 * break... */ 256 } 257 if (i < nspecials) { /* we have a keyword */ 258 found_keyword: 259 ps.its_a_keyword = true; 260 ps.last_u_d = true; 261 switch (specials[i].rwcode) { 262 case 1: /* it is a switch */ 263 return (swstmt); 264 case 2: /* a case or default */ 265 return (casestmt); 266 267 case 3: /* a "struct" */ 268 if (ps.p_l_follow) 269 break; /* inside parens: cast */ 270 l_struct = true; 271 272 /* 273 * Next time around, we will want to know that we have had a 274 * 'struct' 275 */ 276 case 4: /* one of the declaration keywords */ 277 if (ps.p_l_follow) { 278 ps.cast_mask |= 1 << ps.p_l_follow; 279 break; /* inside parens: cast */ 280 } 281 last_code = decl; 282 return (decl); 283 284 case 5: /* if, while, for */ 285 return (sp_paren); 286 287 case 6: /* do, else */ 288 return (sp_nparen); 289 290 case 7: 291 ps.sizeof_keyword = true; 292 default: /* all others are treated like any other 293 * identifier */ 294 return (ident); 295 } /* end of switch */ 296 } /* end of if (found_it) */ 297 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 298 char *tp = buf_ptr; 299 while (tp < buf_end) 300 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 301 goto not_proc; 302 strlcpy(ps.procname, token, sizeof ps.procname); 303 ps.in_parameter_declaration = 1; 304 rparen_count = 1; 305 not_proc:; 306 } 307 /* 308 * The following hack attempts to guess whether or not the current 309 * token is in fact a declaration keyword -- one that has been 310 * typedefd 311 */ 312 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 313 && !ps.p_l_follow 314 && !ps.block_init 315 && (ps.last_token == rparen || ps.last_token == semicolon || 316 ps.last_token == decl || 317 ps.last_token == lbrace || ps.last_token == rbrace)) { 318 ps.its_a_keyword = true; 319 ps.last_u_d = true; 320 last_code = decl; 321 return decl; 322 } 323 if (last_code == decl) /* if this is a declared variable, then 324 * following sign is unary */ 325 ps.last_u_d = true; /* will make "int a -1" work */ 326 last_code = ident; 327 return (ident); /* the ident is not in the list */ 328 } /* end of procesing for alpanum character */ 329 330 /* Scan a non-alphanumeric token */ 331 332 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 333 * moved here */ 334 *e_token = '\0'; 335 if (++buf_ptr >= buf_end) 336 fill_buffer(); 337 338 switch (*token) { 339 case '\n': 340 unary_delim = ps.last_u_d; 341 ps.last_nl = true; /* remember that we just had a newline */ 342 code = (had_eof ? 0 : newline); 343 344 /* 345 * if data has been exausted, the newline is a dummy, and we should 346 * return code to stop 347 */ 348 break; 349 350 case '\'': /* start of quoted character */ 351 case '"': /* start of string */ 352 qchar = *token; 353 if (troff) { 354 e_token[-1] = '`'; 355 if (qchar == '"') 356 *e_token++ = '`'; 357 e_token = chfont(&bodyf, &stringf, e_token); 358 } 359 do { /* copy the string */ 360 while (1) { /* move one character or [/<char>]<char> */ 361 if (*buf_ptr == '\n') { 362 printf("%d: Unterminated literal\n", line_no); 363 goto stop_lit; 364 } 365 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 366 * since CHECK_SIZE guarantees that there 367 * are at least 5 entries left */ 368 *e_token = *buf_ptr++; 369 if (buf_ptr >= buf_end) 370 fill_buffer(); 371 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 372 if (*buf_ptr == '\n') /* check for escaped newline */ 373 ++line_no; 374 if (troff) { 375 *++e_token = BACKSLASH; 376 if (*buf_ptr == BACKSLASH) 377 *++e_token = BACKSLASH; 378 } 379 *++e_token = *buf_ptr++; 380 ++e_token; /* we must increment this again because we 381 * copied two chars */ 382 if (buf_ptr >= buf_end) 383 fill_buffer(); 384 } 385 else 386 break; /* we copied one character */ 387 } /* end of while (1) */ 388 } while (*e_token++ != qchar); 389 if (troff) { 390 e_token = chfont(&stringf, &bodyf, e_token - 1); 391 if (qchar == '"') 392 *e_token++ = '\''; 393 } 394 stop_lit: 395 code = ident; 396 break; 397 398 case ('('): 399 case ('['): 400 unary_delim = true; 401 code = lparen; 402 break; 403 404 case (')'): 405 case (']'): 406 code = rparen; 407 break; 408 409 case '#': 410 unary_delim = ps.last_u_d; 411 code = preesc; 412 break; 413 414 case '?': 415 unary_delim = true; 416 code = question; 417 break; 418 419 case (':'): 420 code = colon; 421 unary_delim = true; 422 break; 423 424 case (';'): 425 unary_delim = true; 426 code = semicolon; 427 break; 428 429 case ('{'): 430 unary_delim = true; 431 432 /* 433 * if (ps.in_or_st) ps.block_init = 1; 434 */ 435 /* ? code = ps.block_init ? lparen : lbrace; */ 436 code = lbrace; 437 break; 438 439 case ('}'): 440 unary_delim = true; 441 /* ? code = ps.block_init ? rparen : rbrace; */ 442 code = rbrace; 443 break; 444 445 case 014: /* a form feed */ 446 unary_delim = ps.last_u_d; 447 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 448 * right */ 449 code = form_feed; 450 break; 451 452 case (','): 453 unary_delim = true; 454 code = comma; 455 break; 456 457 case '.': 458 unary_delim = false; 459 code = period; 460 break; 461 462 case '-': 463 case '+': /* check for -, +, --, ++ */ 464 code = (ps.last_u_d ? unary_op : binary_op); 465 unary_delim = true; 466 467 if (*buf_ptr == token[0]) { 468 /* check for doubled character */ 469 *e_token++ = *buf_ptr++; 470 /* buffer overflow will be checked at end of loop */ 471 if (last_code == ident || last_code == rparen) { 472 code = (ps.last_u_d ? unary_op : postop); 473 /* check for following ++ or -- */ 474 unary_delim = false; 475 } 476 } 477 else if (*buf_ptr == '=') 478 /* check for operator += */ 479 *e_token++ = *buf_ptr++; 480 else if (*buf_ptr == '>') { 481 /* check for operator -> */ 482 *e_token++ = *buf_ptr++; 483 if (!pointer_as_binop) { 484 unary_delim = false; 485 code = unary_op; 486 ps.want_blank = false; 487 } 488 } 489 break; /* buffer overflow will be checked at end of 490 * switch */ 491 492 case '=': 493 if (ps.in_or_st) 494 ps.block_init = 1; 495 #ifdef undef 496 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 497 e_token[-1] = *buf_ptr++; 498 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 499 *e_token++ = *buf_ptr++; 500 *e_token++ = '='; /* Flip =+ to += */ 501 *e_token = 0; 502 } 503 #else 504 if (*buf_ptr == '=') {/* == */ 505 *e_token++ = '='; /* Flip =+ to += */ 506 buf_ptr++; 507 *e_token = 0; 508 } 509 #endif 510 code = binary_op; 511 unary_delim = true; 512 break; 513 /* can drop thru!!! */ 514 515 case '>': 516 case '<': 517 case '!': /* ops like <, <<, <=, !=, etc */ 518 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 519 *e_token++ = *buf_ptr; 520 if (++buf_ptr >= buf_end) 521 fill_buffer(); 522 } 523 if (*buf_ptr == '=') 524 *e_token++ = *buf_ptr++; 525 code = (ps.last_u_d ? unary_op : binary_op); 526 unary_delim = true; 527 break; 528 529 default: 530 if (token[0] == '/' && *buf_ptr == '*') { 531 /* it is start of comment */ 532 *e_token++ = '*'; 533 534 if (++buf_ptr >= buf_end) 535 fill_buffer(); 536 537 code = comment; 538 unary_delim = ps.last_u_d; 539 break; 540 } 541 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 542 /* 543 * handle ||, &&, etc, and also things as in int *****i 544 */ 545 *e_token++ = *buf_ptr; 546 if (++buf_ptr >= buf_end) 547 fill_buffer(); 548 } 549 code = (ps.last_u_d ? unary_op : binary_op); 550 unary_delim = true; 551 552 553 } /* end of switch */ 554 if (code != newline) { 555 l_struct = false; 556 last_code = code; 557 } 558 if (buf_ptr >= buf_end) /* check for input buffer empty */ 559 fill_buffer(); 560 ps.last_u_d = unary_delim; 561 *e_token = '\0'; /* null terminate the token */ 562 return (code); 563 } 564 565 /* 566 * Add the given keyword to the keyword table, using val as the keyword type 567 */ 568 void 569 addkey(key, val) 570 char *key; 571 int val; 572 { 573 struct templ *p; 574 int i = 0; 575 576 while (i < nspecials) { 577 p = &specials[i]; 578 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 579 return; 580 else 581 i++; 582 } 583 584 if (specials == specialsinit) { 585 /* 586 * Whoa. Must reallocate special table. 587 */ 588 nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]); 589 maxspecials = nspecials; 590 maxspecials += maxspecials >> 2; 591 specials = (struct templ *)malloc(maxspecials * sizeof specials[0]); 592 if (specials == NULL) 593 err(1, NULL); 594 memmove(specials, specialsinit, sizeof specialsinit); 595 } else if (nspecials >= maxspecials) { 596 maxspecials += maxspecials >> 2; 597 specials = realloc(specials, maxspecials * sizeof specials[0]); 598 if (specials == NULL) 599 err(1, NULL); 600 } 601 602 p = &specials[i]; 603 p->rwd = key; 604 p->rwcode = val; 605 nspecials++; 606 return; 607 } 608