1 /* $NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 7 * Copyright (c) 1985 Sun Microsystems, Inc. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 #include <sys/cdefs.h> 40 #ifndef lint 41 #if 0 42 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 43 #else 44 __RCSID("$NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $"); 45 #endif 46 #endif /* not lint */ 47 48 /* 49 * Here we have the token scanner for indent. It scans off one token and puts 50 * it in the global variable "token". It returns a code, indicating the type 51 * of token scanned. 52 */ 53 54 #include <stdio.h> 55 #include <ctype.h> 56 #include <stdlib.h> 57 #include <string.h> 58 #include "indent_globs.h" 59 #include "indent_codes.h" 60 61 #define alphanum 1 62 #define opchar 3 63 64 struct templ { 65 char *rwd; 66 int rwcode; 67 }; 68 69 struct templ specials[1000] = 70 { 71 {"switch", 1}, 72 {"case", 2}, 73 {"break", 0}, 74 {"struct", 3}, 75 {"union", 3}, 76 {"enum", 3}, 77 {"default", 2}, 78 {"int", 4}, 79 {"char", 4}, 80 {"float", 4}, 81 {"double", 4}, 82 {"long", 4}, 83 {"short", 4}, 84 {"typdef", 4}, 85 {"unsigned", 4}, 86 {"register", 4}, 87 {"static", 4}, 88 {"global", 4}, 89 {"extern", 4}, 90 {"void", 4}, 91 {"goto", 0}, 92 {"return", 0}, 93 {"if", 5}, 94 {"while", 5}, 95 {"for", 5}, 96 {"else", 6}, 97 {"do", 6}, 98 {"sizeof", 7}, 99 {0, 0} 100 }; 101 102 char chartype[128] = 103 { /* this is used to facilitate the decision of 104 * what type (alphanumeric, operator) each 105 * character is */ 106 0, 0, 0, 0, 0, 0, 0, 0, 107 0, 0, 0, 0, 0, 0, 0, 0, 108 0, 0, 0, 0, 0, 0, 0, 0, 109 0, 0, 0, 0, 0, 0, 0, 0, 110 0, 3, 0, 0, 1, 3, 3, 0, 111 0, 0, 3, 3, 0, 3, 0, 3, 112 1, 1, 1, 1, 1, 1, 1, 1, 113 1, 1, 0, 0, 3, 3, 3, 3, 114 0, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 1, 1, 1, 1, 1, 117 1, 1, 1, 0, 0, 0, 3, 1, 118 0, 1, 1, 1, 1, 1, 1, 1, 119 1, 1, 1, 1, 1, 1, 1, 1, 120 1, 1, 1, 1, 1, 1, 1, 1, 121 1, 1, 1, 0, 3, 0, 3, 0 122 }; 123 124 125 126 127 int 128 lexi() 129 { 130 int unary_delim; /* this is set to 1 if the current token 131 * 132 * forces a following operator to be unary */ 133 static int last_code; /* the last token type returned */ 134 static int l_struct; /* set to 1 if the last token was 'struct' */ 135 int code; /* internal code to be returned */ 136 char qchar; /* the delimiter character for a string */ 137 138 e_token = s_token; /* point to start of place to save token */ 139 unary_delim = false; 140 ps.col_1 = ps.last_nl; /* tell world that this token started in 141 * column 1 iff the last thing scanned was nl */ 142 ps.last_nl = false; 143 144 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 145 ps.col_1 = false; /* leading blanks imply token is not 146 * in column 1 */ 147 if (++buf_ptr >= buf_end) 148 fill_buffer(); 149 } 150 151 /* Scan an alphanumeric token */ 152 if (chartype[(int) *buf_ptr] == alphanum || 153 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 154 /* 155 * we have a character or number 156 */ 157 char *j; /* used for searching thru list of 158 * 159 * reserved words */ 160 struct templ *p; 161 162 if (isdigit((unsigned char)*buf_ptr) || 163 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 164 int seendot = 0, seenexp = 0; 165 if (*buf_ptr == '0' && 166 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 167 *e_token++ = *buf_ptr++; 168 *e_token++ = *buf_ptr++; 169 while (isxdigit((unsigned char)*buf_ptr)) { 170 CHECK_SIZE_TOKEN; 171 *e_token++ = *buf_ptr++; 172 } 173 } else { 174 while (1) { 175 if (*buf_ptr == '.') { 176 if (seendot) 177 break; 178 else 179 seendot++; 180 } 181 CHECK_SIZE_TOKEN; 182 *e_token++ = *buf_ptr++; 183 if (!isdigit((unsigned char)*buf_ptr) 184 && *buf_ptr != '.') { 185 if ((*buf_ptr != 'E' 186 && *buf_ptr != 'e') || seenexp) 187 break; 188 else { 189 seenexp++; 190 seendot++; 191 CHECK_SIZE_TOKEN; 192 *e_token++ = *buf_ptr++; 193 if (*buf_ptr == '+' || *buf_ptr == '-') 194 *e_token++ = *buf_ptr++; 195 } 196 } 197 } 198 } 199 if (*buf_ptr == 'F' || *buf_ptr == 'f') { 200 /* float constant */ 201 *e_token++ = *buf_ptr++; 202 } else { 203 /* integer constant (U, L, UL, LL, ULL) */ 204 if (*buf_ptr == 'U' || *buf_ptr == 'u') 205 *e_token++ = *buf_ptr++; 206 if (*buf_ptr == 'L' || *buf_ptr == 'l') 207 *e_token++ = *buf_ptr++; 208 if (*buf_ptr == 'L' || *buf_ptr == 'l') 209 *e_token++ = *buf_ptr++; 210 } 211 } else 212 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */ 213 CHECK_SIZE_TOKEN; 214 *e_token++ = *buf_ptr++; 215 if (buf_ptr >= buf_end) 216 fill_buffer(); 217 } 218 *e_token++ = '\0'; 219 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 220 if (++buf_ptr >= buf_end) 221 fill_buffer(); 222 } 223 ps.its_a_keyword = false; 224 ps.sizeof_keyword = false; 225 if (l_struct) { /* if last token was 'struct', then this token 226 * should be treated as a declaration */ 227 l_struct = false; 228 last_code = ident; 229 ps.last_u_d = true; 230 return (decl); 231 } 232 ps.last_u_d = false; /* Operator after indentifier is 233 * binary */ 234 last_code = ident; /* Remember that this is the code we 235 * will return */ 236 237 /* 238 * This loop will check if the token is a keyword. 239 */ 240 for (p = specials; (j = p->rwd) != 0; p++) { 241 char *p = s_token; /* point at scanned token */ 242 if (*j++ != *p++ || *j++ != *p++) 243 continue; /* This test depends on the 244 * fact that identifiers are 245 * always at least 1 character 246 * long (ie. the first two 247 * bytes of the identifier are 248 * always meaningful) */ 249 if (p[-1] == 0) 250 break; /* If its a one-character identifier */ 251 while (*p++ == *j) 252 if (*j++ == 0) 253 goto found_keyword; /* I wish that C had a 254 * multi-level break... */ 255 } 256 if (p->rwd) { /* we have a keyword */ 257 found_keyword: 258 ps.its_a_keyword = true; 259 ps.last_u_d = true; 260 switch (p->rwcode) { 261 case 1:/* it is a switch */ 262 return (swstmt); 263 case 2:/* a case or default */ 264 return (casestmt); 265 266 case 3:/* a "struct" */ 267 if (ps.p_l_follow) 268 break; /* inside parens: cast */ 269 l_struct = true; 270 271 /* 272 * Next time around, we will want to know that we have had a 273 * 'struct' 274 */ 275 case 4:/* one of the declaration keywords */ 276 if (ps.p_l_follow) { 277 ps.cast_mask |= 1 << ps.p_l_follow; 278 break; /* inside parens: cast */ 279 } 280 last_code = decl; 281 return (decl); 282 283 case 5:/* if, while, for */ 284 return (sp_paren); 285 286 case 6:/* do, else */ 287 return (sp_nparen); 288 289 case 7: 290 ps.sizeof_keyword = true; 291 default: /* all others are treated like any 292 * other identifier */ 293 return (ident); 294 } /* end of switch */ 295 } /* end of if (found_it) */ 296 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 297 char *tp = buf_ptr; 298 while (tp < buf_end) 299 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 300 goto not_proc; 301 strncpy(ps.procname, token, sizeof ps.procname - 1); 302 ps.in_parameter_declaration = 1; 303 rparen_count = 1; 304 not_proc: ; 305 } 306 /* 307 * The following hack attempts to guess whether or not the current 308 * token is in fact a declaration keyword -- one that has been 309 * typedefd 310 */ 311 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || 312 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_') 313 && !ps.p_l_follow 314 && !ps.block_init 315 && (ps.last_token == rparen || ps.last_token == semicolon || 316 ps.last_token == decl || 317 ps.last_token == lbrace || ps.last_token == rbrace)) { 318 ps.its_a_keyword = true; 319 ps.last_u_d = true; 320 last_code = decl; 321 return decl; 322 } 323 if (last_code == decl) /* if this is a declared variable, 324 * then following sign is unary */ 325 ps.last_u_d = true; /* will make "int a -1" work */ 326 last_code = ident; 327 return (ident); /* the ident is not in the list */ 328 } /* end of procesing for alpanum character */ 329 /* Scan a non-alphanumeric token */ 330 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 331 * moved here */ 332 *e_token = '\0'; 333 if (++buf_ptr >= buf_end) 334 fill_buffer(); 335 336 switch (*token) { 337 case '\n': 338 unary_delim = ps.last_u_d; 339 ps.last_nl = true; /* remember that we just had a newline */ 340 code = (had_eof ? 0 : newline); 341 342 /* 343 * if data has been exausted, the newline is a dummy, and we should 344 * return code to stop 345 */ 346 break; 347 348 case '\'': /* start of quoted character */ 349 case '"': /* start of string */ 350 qchar = *token; 351 if (troff) { 352 e_token[-1] = '`'; 353 if (qchar == '"') 354 *e_token++ = '`'; 355 e_token = chfont(&bodyf, &stringf, e_token); 356 } 357 do { /* copy the string */ 358 while (1) { /* move one character or 359 * [/<char>]<char> */ 360 if (*buf_ptr == '\n') { 361 printf("%d: Unterminated literal\n", line_no); 362 goto stop_lit; 363 } 364 CHECK_SIZE_TOKEN; /* Only have to do this 365 * once in this loop, 366 * since CHECK_SIZE 367 * guarantees that there 368 * are at least 5 369 * entries left */ 370 *e_token = *buf_ptr++; 371 if (buf_ptr >= buf_end) 372 fill_buffer(); 373 if (*e_token == BACKSLASH) { /* if escape, copy extra 374 * char */ 375 if (*buf_ptr == '\n') /* check for escaped 376 * newline */ 377 ++line_no; 378 if (troff) { 379 *++e_token = BACKSLASH; 380 if (*buf_ptr == BACKSLASH) 381 *++e_token = BACKSLASH; 382 } 383 *++e_token = *buf_ptr++; 384 ++e_token; /* we must increment 385 * this again because we 386 * copied two chars */ 387 if (buf_ptr >= buf_end) 388 fill_buffer(); 389 } else 390 break; /* we copied one character */ 391 } /* end of while (1) */ 392 } while (*e_token++ != qchar); 393 if (troff) { 394 e_token = chfont(&stringf, &bodyf, e_token - 1); 395 if (qchar == '"') 396 *e_token++ = '\''; 397 } 398 stop_lit: 399 code = ident; 400 break; 401 402 case ('('): 403 case ('['): 404 unary_delim = true; 405 code = lparen; 406 break; 407 408 case (')'): 409 case (']'): 410 code = rparen; 411 break; 412 413 case '#': 414 unary_delim = ps.last_u_d; 415 code = preesc; 416 break; 417 418 case '?': 419 unary_delim = true; 420 code = question; 421 break; 422 423 case (':'): 424 code = colon; 425 unary_delim = true; 426 break; 427 428 case (';'): 429 unary_delim = true; 430 code = semicolon; 431 break; 432 433 case ('{'): 434 unary_delim = true; 435 436 /* 437 * if (ps.in_or_st) ps.block_init = 1; 438 */ 439 /* ? code = ps.block_init ? lparen : lbrace; */ 440 code = lbrace; 441 break; 442 443 case ('}'): 444 unary_delim = true; 445 /* ? code = ps.block_init ? rparen : rbrace; */ 446 code = rbrace; 447 break; 448 449 case 014: /* a form feed */ 450 unary_delim = ps.last_u_d; 451 ps.last_nl = true; /* remember this so we can set 452 * 'ps.col_1' right */ 453 code = form_feed; 454 break; 455 456 case (','): 457 unary_delim = true; 458 code = comma; 459 break; 460 461 case '.': 462 unary_delim = false; 463 code = period; 464 break; 465 466 case '-': 467 case '+': /* check for -, +, --, ++ */ 468 code = (ps.last_u_d ? unary_op : binary_op); 469 unary_delim = true; 470 471 if (*buf_ptr == token[0]) { 472 /* check for doubled character */ 473 *e_token++ = *buf_ptr++; 474 /* buffer overflow will be checked at end of loop */ 475 if (last_code == ident || last_code == rparen) { 476 code = (ps.last_u_d ? unary_op : postop); 477 /* check for following ++ or -- */ 478 unary_delim = false; 479 } 480 } else 481 if (*buf_ptr == '=') 482 /* check for operator += */ 483 *e_token++ = *buf_ptr++; 484 else 485 if (*buf_ptr == '>') { 486 /* check for operator -> */ 487 *e_token++ = *buf_ptr++; 488 if (!pointer_as_binop) { 489 unary_delim = false; 490 code = unary_op; 491 ps.want_blank = false; 492 } 493 } 494 break; /* buffer overflow will be checked at end of 495 * switch */ 496 497 case '=': 498 if (ps.in_or_st) 499 ps.block_init = 1; 500 #ifdef undef 501 if (chartype[*buf_ptr] == opchar) { /* we have two char 502 * assignment */ 503 e_token[-1] = *buf_ptr++; 504 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 505 *e_token++ = *buf_ptr++; 506 *e_token++ = '='; /* Flip =+ to += */ 507 *e_token = 0; 508 } 509 #else 510 if (*buf_ptr == '=') { /* == */ 511 *e_token++ = '='; /* Flip =+ to += */ 512 buf_ptr++; 513 *e_token = 0; 514 } 515 #endif 516 code = binary_op; 517 unary_delim = true; 518 break; 519 /* can drop thru!!! */ 520 521 case '>': 522 case '<': 523 case '!': /* ops like <, <<, <=, !=, etc */ 524 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 525 *e_token++ = *buf_ptr; 526 if (++buf_ptr >= buf_end) 527 fill_buffer(); 528 } 529 if (*buf_ptr == '=') 530 *e_token++ = *buf_ptr++; 531 code = (ps.last_u_d ? unary_op : binary_op); 532 unary_delim = true; 533 break; 534 535 default: 536 if (token[0] == '/' && *buf_ptr == '*') { 537 /* it is start of comment */ 538 *e_token++ = '*'; 539 540 if (++buf_ptr >= buf_end) 541 fill_buffer(); 542 543 code = comment; 544 unary_delim = ps.last_u_d; 545 break; 546 } 547 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 548 /* 549 * handle ||, &&, etc, and also things as in int *****i 550 */ 551 *e_token++ = *buf_ptr; 552 if (++buf_ptr >= buf_end) 553 fill_buffer(); 554 } 555 code = (ps.last_u_d ? unary_op : binary_op); 556 unary_delim = true; 557 558 559 } /* end of switch */ 560 if (code != newline) { 561 l_struct = false; 562 last_code = code; 563 } 564 if (buf_ptr >= buf_end) /* check for input buffer empty */ 565 fill_buffer(); 566 ps.last_u_d = unary_delim; 567 *e_token = '\0'; /* null terminate the token */ 568 return (code); 569 } 570 /* 571 * Add the given keyword to the keyword table, using val as the keyword type 572 */ 573 void 574 addkey(key, val) 575 char *key; 576 int val; 577 { 578 struct templ *p = specials; 579 while (p->rwd) 580 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 581 return; 582 else 583 p++; 584 if (p >= specials + sizeof specials / sizeof specials[0]) 585 return; /* For now, table overflows are silently 586 * ignored */ 587 p->rwd = key; 588 p->rwcode = val; 589 p[1].rwd = 0; 590 p[1].rwcode = 0; 591 } 592