/*	$NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $	*/

/*
 * Copyright (c) 1980, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
 * Copyright (c) 1985 Sun Microsystems, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
64 */ 65 66 #include <sys/cdefs.h> 67 #ifndef lint 68 #if 0 69 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 70 #else 71 __RCSID("$NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $"); 72 #endif 73 #endif /* not lint */ 74 75 /* 76 * Here we have the token scanner for indent. It scans off one token and puts 77 * it in the global variable "token". It returns a code, indicating the type 78 * of token scanned. 79 */ 80 81 #include <stdio.h> 82 #include <ctype.h> 83 #include <stdlib.h> 84 #include <string.h> 85 #include "indent_globs.h" 86 #include "indent_codes.h" 87 88 #define alphanum 1 89 #define opchar 3 90 91 struct templ { 92 char *rwd; 93 int rwcode; 94 }; 95 96 struct templ specials[1000] = 97 { 98 {"switch", 1}, 99 {"case", 2}, 100 {"break", 0}, 101 {"struct", 3}, 102 {"union", 3}, 103 {"enum", 3}, 104 {"default", 2}, 105 {"int", 4}, 106 {"char", 4}, 107 {"float", 4}, 108 {"double", 4}, 109 {"long", 4}, 110 {"short", 4}, 111 {"typdef", 4}, 112 {"unsigned", 4}, 113 {"register", 4}, 114 {"static", 4}, 115 {"global", 4}, 116 {"extern", 4}, 117 {"void", 4}, 118 {"goto", 0}, 119 {"return", 0}, 120 {"if", 5}, 121 {"while", 5}, 122 {"for", 5}, 123 {"else", 6}, 124 {"do", 6}, 125 {"sizeof", 7}, 126 {0, 0} 127 }; 128 129 char chartype[128] = 130 { /* this is used to facilitate the decision of 131 * what type (alphanumeric, operator) each 132 * character is */ 133 0, 0, 0, 0, 0, 0, 0, 0, 134 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 0, 3, 0, 0, 1, 3, 3, 0, 138 0, 0, 3, 3, 0, 3, 0, 3, 139 1, 1, 1, 1, 1, 1, 1, 1, 140 1, 1, 0, 0, 3, 3, 3, 3, 141 0, 1, 1, 1, 1, 1, 1, 1, 142 1, 1, 1, 1, 1, 1, 1, 1, 143 1, 1, 1, 1, 1, 1, 1, 1, 144 1, 1, 1, 0, 0, 0, 3, 1, 145 0, 1, 1, 1, 1, 1, 1, 1, 146 1, 1, 1, 1, 1, 1, 1, 1, 147 1, 1, 1, 1, 1, 1, 1, 1, 148 1, 1, 1, 0, 3, 0, 3, 0 149 }; 150 151 152 153 154 int 155 lexi(void) 156 { 157 int unary_delim; /* this is set to 1 if the current token 158 * 159 * forces a following operator to be 
unary */ 160 static int last_code; /* the last token type returned */ 161 static int l_struct; /* set to 1 if the last token was 'struct' */ 162 int code; /* internal code to be returned */ 163 char qchar; /* the delimiter character for a string */ 164 165 e_token = s_token; /* point to start of place to save token */ 166 unary_delim = false; 167 ps.col_1 = ps.last_nl; /* tell world that this token started in 168 * column 1 iff the last thing scanned was nl */ 169 ps.last_nl = false; 170 171 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 172 ps.col_1 = false; /* leading blanks imply token is not 173 * in column 1 */ 174 if (++buf_ptr >= buf_end) 175 fill_buffer(); 176 } 177 178 /* Scan an alphanumeric token */ 179 if (chartype[(int) *buf_ptr] == alphanum || 180 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 181 /* 182 * we have a character or number 183 */ 184 char *j; /* used for searching thru list of 185 * 186 * reserved words */ 187 struct templ *p; 188 189 if (isdigit((unsigned char)*buf_ptr) || 190 (buf_ptr[0] == '.' 
&& isdigit((unsigned char)buf_ptr[1]))) { 191 int seendot = 0, seenexp = 0, seensfx = 0; 192 if (*buf_ptr == '0' && 193 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 194 *e_token++ = *buf_ptr++; 195 *e_token++ = *buf_ptr++; 196 while (isxdigit((unsigned char)*buf_ptr)) { 197 CHECK_SIZE_TOKEN; 198 *e_token++ = *buf_ptr++; 199 } 200 } else { 201 while (1) { 202 if (*buf_ptr == '.') { 203 if (seendot) 204 break; 205 else 206 seendot++; 207 } 208 CHECK_SIZE_TOKEN; 209 *e_token++ = *buf_ptr++; 210 if (!isdigit((unsigned char)*buf_ptr) 211 && *buf_ptr != '.') { 212 if ((*buf_ptr != 'E' 213 && *buf_ptr != 'e') || seenexp) 214 break; 215 else { 216 seenexp++; 217 seendot++; 218 CHECK_SIZE_TOKEN; 219 *e_token++ = *buf_ptr++; 220 if (*buf_ptr == '+' || *buf_ptr == '-') 221 *e_token++ = *buf_ptr++; 222 } 223 } 224 } 225 } 226 if (*buf_ptr == 'F' || *buf_ptr == 'f') { 227 /* float constant */ 228 *e_token++ = *buf_ptr++; 229 } else { 230 /* integer constant */ 231 while (1) { 232 if (!(seensfx & 1) && 233 (*buf_ptr == 'U' || 234 *buf_ptr == 'u')) { 235 CHECK_SIZE_TOKEN; 236 *e_token++ = *buf_ptr++; 237 seensfx |= 1; 238 continue; 239 } 240 if (!(seensfx & 2) && 241 (*buf_ptr == 'L' || 242 *buf_ptr == 'l')) { 243 CHECK_SIZE_TOKEN; 244 if (buf_ptr[1] == buf_ptr[0]) 245 *e_token++ = *buf_ptr++; 246 *e_token++ = *buf_ptr++; 247 seensfx |= 2; 248 continue; 249 } 250 break; 251 } 252 } 253 } else 254 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */ 255 CHECK_SIZE_TOKEN; 256 *e_token++ = *buf_ptr++; 257 if (buf_ptr >= buf_end) 258 fill_buffer(); 259 } 260 *e_token++ = '\0'; 261 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 262 if (++buf_ptr >= buf_end) 263 fill_buffer(); 264 } 265 ps.its_a_keyword = false; 266 ps.sizeof_keyword = false; 267 if (l_struct) { /* if last token was 'struct', then this token 268 * should be treated as a declaration */ 269 l_struct = false; 270 last_code = ident; 271 ps.last_u_d = true; 272 return (decl); 273 } 274 
ps.last_u_d = false; /* Operator after indentifier is 275 * binary */ 276 last_code = ident; /* Remember that this is the code we 277 * will return */ 278 279 /* 280 * This loop will check if the token is a keyword. 281 */ 282 for (p = specials; (j = p->rwd) != 0; p++) { 283 char *p = s_token; /* point at scanned token */ 284 if (*j++ != *p++ || *j++ != *p++) 285 continue; /* This test depends on the 286 * fact that identifiers are 287 * always at least 1 character 288 * long (ie. the first two 289 * bytes of the identifier are 290 * always meaningful) */ 291 if (p[-1] == 0) 292 break; /* If its a one-character identifier */ 293 while (*p++ == *j) 294 if (*j++ == 0) 295 goto found_keyword; /* I wish that C had a 296 * multi-level break... */ 297 } 298 if (p->rwd) { /* we have a keyword */ 299 found_keyword: 300 ps.its_a_keyword = true; 301 ps.last_u_d = true; 302 switch (p->rwcode) { 303 case 1:/* it is a switch */ 304 return (swstmt); 305 case 2:/* a case or default */ 306 return (casestmt); 307 308 case 3:/* a "struct" */ 309 if (ps.p_l_follow) 310 break; /* inside parens: cast */ 311 l_struct = true; 312 313 /* 314 * Next time around, we will want to know that we have had a 315 * 'struct' 316 */ 317 case 4:/* one of the declaration keywords */ 318 if (ps.p_l_follow) { 319 ps.cast_mask |= 1 << ps.p_l_follow; 320 break; /* inside parens: cast */ 321 } 322 last_code = decl; 323 return (decl); 324 325 case 5:/* if, while, for */ 326 return (sp_paren); 327 328 case 6:/* do, else */ 329 return (sp_nparen); 330 331 case 7: 332 ps.sizeof_keyword = true; 333 default: /* all others are treated like any 334 * other identifier */ 335 return (ident); 336 } /* end of switch */ 337 } /* end of if (found_it) */ 338 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 339 char *tp = buf_ptr; 340 while (tp < buf_end) 341 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 342 goto not_proc; 343 strncpy(ps.procname, token, sizeof ps.procname - 1); 344 ps.in_parameter_declaration 
= 1; 345 rparen_count = 1; 346 not_proc: ; 347 } 348 /* 349 * The following hack attempts to guess whether or not the current 350 * token is in fact a declaration keyword -- one that has been 351 * typedefd 352 */ 353 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || 354 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_') 355 && !ps.p_l_follow 356 && !ps.block_init 357 && (ps.last_token == rparen || ps.last_token == semicolon || 358 ps.last_token == decl || 359 ps.last_token == lbrace || ps.last_token == rbrace)) { 360 ps.its_a_keyword = true; 361 ps.last_u_d = true; 362 last_code = decl; 363 return decl; 364 } 365 if (last_code == decl) /* if this is a declared variable, 366 * then following sign is unary */ 367 ps.last_u_d = true; /* will make "int a -1" work */ 368 last_code = ident; 369 return (ident); /* the ident is not in the list */ 370 } /* end of procesing for alpanum character */ 371 /* Scan a non-alphanumeric token */ 372 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 373 * moved here */ 374 *e_token = '\0'; 375 if (++buf_ptr >= buf_end) 376 fill_buffer(); 377 378 switch (*token) { 379 case '\n': 380 unary_delim = ps.last_u_d; 381 ps.last_nl = true; /* remember that we just had a newline */ 382 code = (had_eof ? 
0 : newline); 383 384 /* 385 * if data has been exausted, the newline is a dummy, and we should 386 * return code to stop 387 */ 388 break; 389 390 case '\'': /* start of quoted character */ 391 case '"': /* start of string */ 392 qchar = *token; 393 if (troff) { 394 e_token[-1] = '`'; 395 if (qchar == '"') 396 *e_token++ = '`'; 397 e_token = chfont(&bodyf, &stringf, e_token); 398 } 399 do { /* copy the string */ 400 while (1) { /* move one character or 401 * [/<char>]<char> */ 402 if (*buf_ptr == '\n') { 403 printf("%d: Unterminated literal\n", line_no); 404 goto stop_lit; 405 } 406 CHECK_SIZE_TOKEN; /* Only have to do this 407 * once in this loop, 408 * since CHECK_SIZE 409 * guarantees that there 410 * are at least 5 411 * entries left */ 412 *e_token = *buf_ptr++; 413 if (buf_ptr >= buf_end) 414 fill_buffer(); 415 if (*e_token == BACKSLASH) { /* if escape, copy extra 416 * char */ 417 if (*buf_ptr == '\n') /* check for escaped 418 * newline */ 419 ++line_no; 420 if (troff) { 421 *++e_token = BACKSLASH; 422 if (*buf_ptr == BACKSLASH) 423 *++e_token = BACKSLASH; 424 } 425 *++e_token = *buf_ptr++; 426 ++e_token; /* we must increment 427 * this again because we 428 * copied two chars */ 429 if (buf_ptr >= buf_end) 430 fill_buffer(); 431 } else 432 break; /* we copied one character */ 433 } /* end of while (1) */ 434 } while (*e_token++ != qchar); 435 if (troff) { 436 e_token = chfont(&stringf, &bodyf, e_token - 1); 437 if (qchar == '"') 438 *e_token++ = '\''; 439 } 440 stop_lit: 441 code = ident; 442 break; 443 444 case ('('): 445 case ('['): 446 unary_delim = true; 447 code = lparen; 448 break; 449 450 case (')'): 451 case (']'): 452 code = rparen; 453 break; 454 455 case '#': 456 unary_delim = ps.last_u_d; 457 code = preesc; 458 break; 459 460 case '?': 461 unary_delim = true; 462 code = question; 463 break; 464 465 case (':'): 466 code = colon; 467 unary_delim = true; 468 break; 469 470 case (';'): 471 unary_delim = true; 472 code = semicolon; 473 break; 474 475 
case ('{'): 476 unary_delim = true; 477 478 /* 479 * if (ps.in_or_st) ps.block_init = 1; 480 */ 481 /* ? code = ps.block_init ? lparen : lbrace; */ 482 code = lbrace; 483 break; 484 485 case ('}'): 486 unary_delim = true; 487 /* ? code = ps.block_init ? rparen : rbrace; */ 488 code = rbrace; 489 break; 490 491 case 014: /* a form feed */ 492 unary_delim = ps.last_u_d; 493 ps.last_nl = true; /* remember this so we can set 494 * 'ps.col_1' right */ 495 code = form_feed; 496 break; 497 498 case (','): 499 unary_delim = true; 500 code = comma; 501 break; 502 503 case '.': 504 unary_delim = false; 505 code = period; 506 break; 507 508 case '-': 509 case '+': /* check for -, +, --, ++ */ 510 code = (ps.last_u_d ? unary_op : binary_op); 511 unary_delim = true; 512 513 if (*buf_ptr == token[0]) { 514 /* check for doubled character */ 515 *e_token++ = *buf_ptr++; 516 /* buffer overflow will be checked at end of loop */ 517 if (last_code == ident || last_code == rparen) { 518 code = (ps.last_u_d ? 
unary_op : postop); 519 /* check for following ++ or -- */ 520 unary_delim = false; 521 } 522 } else 523 if (*buf_ptr == '=') 524 /* check for operator += */ 525 *e_token++ = *buf_ptr++; 526 else 527 if (*buf_ptr == '>') { 528 /* check for operator -> */ 529 *e_token++ = *buf_ptr++; 530 if (!pointer_as_binop) { 531 unary_delim = false; 532 code = unary_op; 533 ps.want_blank = false; 534 } 535 } 536 break; /* buffer overflow will be checked at end of 537 * switch */ 538 539 case '=': 540 if (ps.in_or_st) 541 ps.block_init = 1; 542 #ifdef undef 543 if (chartype[*buf_ptr] == opchar) { /* we have two char 544 * assignment */ 545 e_token[-1] = *buf_ptr++; 546 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 547 *e_token++ = *buf_ptr++; 548 *e_token++ = '='; /* Flip =+ to += */ 549 *e_token = 0; 550 } 551 #else 552 if (*buf_ptr == '=') { /* == */ 553 *e_token++ = '='; /* Flip =+ to += */ 554 buf_ptr++; 555 *e_token = 0; 556 } 557 #endif 558 code = binary_op; 559 unary_delim = true; 560 break; 561 /* can drop thru!!! */ 562 563 case '>': 564 case '<': 565 case '!': /* ops like <, <<, <=, !=, etc */ 566 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 567 *e_token++ = *buf_ptr; 568 if (++buf_ptr >= buf_end) 569 fill_buffer(); 570 } 571 if (*buf_ptr == '=') 572 *e_token++ = *buf_ptr++; 573 code = (ps.last_u_d ? unary_op : binary_op); 574 unary_delim = true; 575 break; 576 577 default: 578 if (token[0] == '/' && *buf_ptr == '*') { 579 /* it is start of comment */ 580 *e_token++ = '*'; 581 582 if (++buf_ptr >= buf_end) 583 fill_buffer(); 584 585 code = comment; 586 unary_delim = ps.last_u_d; 587 break; 588 } 589 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 590 /* 591 * handle ||, &&, etc, and also things as in int *****i 592 */ 593 *e_token++ = *buf_ptr; 594 if (++buf_ptr >= buf_end) 595 fill_buffer(); 596 } 597 code = (ps.last_u_d ? 
unary_op : binary_op); 598 unary_delim = true; 599 600 601 } /* end of switch */ 602 if (code != newline) { 603 l_struct = false; 604 last_code = code; 605 } 606 if (buf_ptr >= buf_end) /* check for input buffer empty */ 607 fill_buffer(); 608 ps.last_u_d = unary_delim; 609 *e_token = '\0'; /* null terminate the token */ 610 return (code); 611 } 612 /* 613 * Add the given keyword to the keyword table, using val as the keyword type 614 */ 615 void 616 addkey(char *key, int val) 617 { 618 struct templ *p = specials; 619 while (p->rwd) 620 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 621 return; 622 else 623 p++; 624 if (p >= specials + sizeof specials / sizeof specials[0]) 625 return; /* For now, table overflows are silently 626 * ignored */ 627 p->rwd = key; 628 p->rwcode = val; 629 p[1].rwd = 0; 630 p[1].rwcode = 0; 631 } 632