1 /* 2 * Copyright (c) 1980 Regents of the University of California. 3 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms are permitted 7 * provided that the above copyright notice and this paragraph are 8 * duplicated in all such forms and that any documentation, 9 * advertising materials, and other materials related to such 10 * distribution and use acknowledge that the software was developed 11 * by the University of California, Berkeley and the University 12 * of Illinois, Urbana. The name of either 13 * University may not be used to endorse or promote products derived 14 * from this software without specific prior written permission. 15 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 17 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 18 */ 19 20 #ifndef lint 21 static char sccsid[] = "@(#)lexi.c 5.8 (Berkeley) 06/29/88"; 22 #endif /* not lint */ 23 24 /* 25 * NAME: 26 * lexi 27 * 28 * FUNCTION: 29 * This is the token scanner for indent 30 * 31 * ALGORITHM: 32 * 1) Strip off intervening blanks and/or tabs. 33 * 2) If it is an alphanumeric token, move it to the token buffer "token". 34 * Check if it is a special reserved word that indent will want to 35 * know about. 36 * 3) Non-alphanumeric tokens are handled with a big switch statement. A 37 * flag is kept to remember if the last token was a "unary delimiter", 38 * which forces a following operator to be unary as opposed to binary. 39 * 40 * PARAMETERS: 41 * None 42 * 43 * RETURNS: 44 * An integer code indicating the type of token scanned. 45 * 46 * GLOBALS: 47 * buf_ptr = 48 * had_eof 49 * ps.last_u_d = Set to true iff this token is a "unary delimiter" 50 * 51 * CALLS: 52 * fill_buffer 53 * printf (lib) 54 * 55 * CALLED BY: 56 * main 57 * 58 * NOTES: 59 * Start of comment is passed back so that the comment can be scanned by 60 * pr_comment. 61 * 62 * Strings and character literals are returned just like identifiers. 63 * 64 * HISTORY: 65 * initial coding November 1976 D A Willcox of CAC 66 * 1/7/77 D A Willcox of CAC Fix to provide proper handling 67 * of "int a -1;" 68 * 69 */ 70 71 /* 72 * Here we have the token scanner for indent. It scans off one token and 73 * puts it in the global variable "token". It returns a code, indicating 74 * the type of token scanned. 75 */ 76 77 #include "indent_globs.h" 78 #include "indent_codes.h" 79 #include "ctype.h" 80 81 #define alphanum 1 82 #define opchar 3 83 84 struct templ { 85 char *rwd; 86 int rwcode; 87 }; 88 89 struct templ specials[100] = 90 { 91 "switch", 1, 92 "case", 2, 93 "break", 0, 94 "struct", 3, 95 "union", 3, 96 "enum", 3, 97 "default", 2, 98 "int", 4, 99 "char", 4, 100 "float", 4, 101 "double", 4, 102 "long", 4, 103 "short", 4, 104 "typdef", 4, 105 "unsigned", 4, 106 "register", 4, 107 "static", 4, 108 "global", 4, 109 "extern", 4, 110 "void", 4, 111 "goto", 0, 112 "return", 0, 113 "if", 5, 114 "while", 5, 115 "for", 5, 116 "else", 6, 117 "do", 6, 118 "sizeof", 7, 119 0, 0 120 }; 121 122 char chartype[128] = 123 { /* this is used to facilitate the decision 124 * of what type (alphanumeric, operator) 125 * each character is */ 126 0, 0, 0, 0, 0, 0, 0, 0, 127 0, 0, 0, 0, 0, 0, 0, 0, 128 0, 0, 0, 0, 0, 0, 0, 0, 129 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 3, 0, 0, 1, 3, 3, 0, 131 0, 0, 3, 3, 0, 3, 3, 3, 132 1, 1, 1, 1, 1, 1, 1, 1, 133 1, 1, 0, 0, 3, 3, 3, 3, 134 0, 1, 1, 1, 1, 1, 1, 1, 135 1, 1, 1, 1, 1, 1, 1, 1, 136 1, 1, 1, 1, 1, 1, 1, 1, 137 1, 1, 1, 0, 0, 0, 3, 1, 138 0, 1, 1, 1, 1, 1, 1, 1, 139 1, 1, 1, 1, 1, 1, 1, 1, 140 1, 1, 1, 1, 1, 1, 1, 1, 141 1, 1, 1, 0, 3, 0, 3, 0 142 }; 143 144 145 146 147 int 148 lexi() 149 { 150 register char *tok; /* local pointer to next char in token */ 151 int unary_delim; /* this is set to 1 if the current token 152 * 153 * forces a following operator to be unary */ 154 static int last_code; /* the last token type returned */ 155 static int l_struct; /* set to 1 if the last token was 'struct' */ 156 int code; /* internal code to be returned */ 157 char qchar; /* the delimiter character for a string */ 158 159 tok = token; /* point to start of place to save token */ 160 unary_delim = false; 161 ps.col_1 = ps.last_nl; /* tell world that this token started in 162 * column 1 iff the last thing scanned was 163 * nl */ 164 ps.last_nl = false; 165 166 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 167 ps.col_1 = false; /* leading blanks imply token is not in 168 * column 1 */ 169 if (++buf_ptr >= buf_end) 170 fill_buffer(); 171 } 172 173 /* Scan an alphanumeric token. Note that we must also handle 174 * stuff like "1.0e+03" and "7e-6". */ 175 if (chartype[*buf_ptr & 0177] == alphanum) { /* we have a character 176 * or number */ 177 register char *j; /* used for searching thru list of 178 * reserved words */ 179 register struct templ *p; 180 register int c; 181 182 do { /* copy it over */ 183 *tok++ = *buf_ptr++; 184 if (buf_ptr >= buf_end) 185 fill_buffer(); 186 } while (chartype[c = *buf_ptr & 0177] == alphanum || 187 isdigit(token[0]) && (c == '+' || c == '-') && 188 (tok[-1] == 'e' || tok[-1] == 'E')); 189 *tok++ = '\0'; 190 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 191 if (++buf_ptr >= buf_end) 192 fill_buffer(); 193 } 194 ps.its_a_keyword = false; 195 ps.sizeof_keyword = false; 196 if (l_struct) { /* if last token was 'struct', then this 197 * token should be treated as a 198 * declaration */ 199 l_struct = false; 200 last_code = ident; 201 ps.last_u_d = true; 202 return (decl); 203 } 204 ps.last_u_d = false; /* Operator after indentifier is binary */ 205 last_code = ident; /* Remember that this is the code we will 206 * return */ 207 208 /* 209 * This loop will check if the token is a keyword. 210 */ 211 for (p = specials; (j = p->rwd) != 0; p++) { 212 tok = token; /* point at scanned token */ 213 if (*j++ != *tok++ || *j++ != *tok++) 214 continue; /* This test depends on the fact that 215 * identifiers are always at least 1 216 * character long (ie. the first two bytes 217 * of the identifier are always 218 * meaningful) */ 219 if (tok[-1] == 0) 220 break; /* If its a one-character identifier */ 221 while (*tok++ == *j) 222 if (*j++ == 0) 223 goto found_keyword; /* I wish that C had a multi-level 224 * break... */ 225 } 226 if (p->rwd) { /* we have a keyword */ 227 found_keyword: 228 ps.its_a_keyword = true; 229 ps.last_u_d = true; 230 switch (p->rwcode) { 231 case 1: /* it is a switch */ 232 return (swstmt); 233 case 2: /* a case or default */ 234 return (casestmt); 235 236 case 3: /* a "struct" */ 237 if (ps.p_l_follow) 238 break; /* inside parens: cast */ 239 l_struct = true; 240 241 /* 242 * Next time around, we will want to know that we have 243 * had a 'struct' 244 */ 245 case 4: /* one of the declaration keywords */ 246 if (ps.p_l_follow) { 247 ps.cast_mask |= 1 << ps.p_l_follow; 248 break; /* inside parens: cast */ 249 } 250 last_code = decl; 251 return (decl); 252 253 case 5: /* if, while, for */ 254 return (sp_paren); 255 256 case 6: /* do, else */ 257 return (sp_nparen); 258 259 case 7: 260 ps.sizeof_keyword = true; 261 default: /* all others are treated like any other 262 * identifier */ 263 return (ident); 264 } /* end of switch */ 265 } /* end of if (found_it) */ 266 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 267 && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) { 268 strncpy(ps.procname, token, sizeof ps.procname - 1); 269 ps.in_parameter_declaration = 1; 270 } 271 272 /* 273 * The following hack attempts to guess whether or not the current 274 * token is in fact a declaration keyword -- one that has been 275 * typedefd 276 */ 277 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr)) 278 && !ps.p_l_follow 279 && (ps.last_token == rparen || ps.last_token == semicolon || 280 ps.last_token == decl || 281 ps.last_token == lbrace || ps.last_token == rbrace)) { 282 ps.its_a_keyword = true; 283 ps.last_u_d = true; 284 last_code = decl; 285 return decl; 286 } 287 if (last_code == decl) /* if this is a declared variable, then 288 * following sign is unary */ 289 ps.last_u_d = true; /* will make "int a -1" work */ 290 last_code = ident; 291 return (ident); /* the ident is not in the list */ 292 } /* end of procesing for alpanum character */ 293 /* Scan a non-alphanumeric token */ 294 295 *tok++ = *buf_ptr; /* if it is only a one-character token, it 296 * is moved here */ 297 *tok = '\0'; 298 if (++buf_ptr >= buf_end) 299 fill_buffer(); 300 301 switch (*token) { 302 case '\n': 303 unary_delim = ps.last_u_d; 304 ps.last_nl = true; /* remember that we just had a newline */ 305 code = (had_eof ? 0 : newline); 306 307 /* 308 * if data has been exausted, the newline is a dummy, and we 309 * should return code to stop 310 */ 311 break; 312 313 case '\'': /* start of quoted character */ 314 case '"': /* start of string */ 315 qchar = *token; 316 if (troff) { 317 tok[-1] = '`'; 318 if (qchar == '"') 319 *tok++ = '`'; 320 *tok++ = BACKSLASH; 321 *tok++ = 'f'; 322 *tok++ = 'L'; 323 } 324 do { /* copy the string */ 325 while (1) { /* move one character or [/<char>]<char> */ 326 if (*buf_ptr == '\n') { 327 printf("%d: Unterminated literal\n", line_no); 328 goto stop_lit; 329 } 330 *tok = *buf_ptr++; 331 if (buf_ptr >= buf_end) 332 fill_buffer(); 333 if (had_eof || ((tok - token) > (bufsize - 2))) { 334 printf("Unterminated literal\n"); 335 ++tok; 336 goto stop_lit; 337 /* get outof literal copying loop */ 338 } 339 if (*tok == BACKSLASH) { /* if escape, copy extra 340 * char */ 341 if (*buf_ptr == '\n') /* check for escaped 342 * newline */ 343 ++line_no; 344 if (troff) { 345 *++tok = BACKSLASH; 346 if (*buf_ptr == BACKSLASH) 347 *++tok = BACKSLASH; 348 } 349 *++tok = *buf_ptr++; 350 ++tok; /* we must increment this again because we 351 * copied two chars */ 352 if (buf_ptr >= buf_end) 353 fill_buffer(); 354 } 355 else 356 break; /* we copied one character */ 357 } /* end of while (1) */ 358 } while (*tok++ != qchar); 359 if (troff) { 360 tok[-1] = BACKSLASH; 361 *tok++ = 'f'; 362 *tok++ = 'R'; 363 *tok++ = '\''; 364 if (qchar == '"') 365 *tok++ = '\''; 366 } 367 stop_lit: 368 code = ident; 369 break; 370 371 case ('('): 372 case ('['): 373 unary_delim = true; 374 code = lparen; 375 break; 376 377 case (')'): 378 case (']'): 379 code = rparen; 380 break; 381 382 case '#': 383 unary_delim = ps.last_u_d; 384 code = preesc; 385 break; 386 387 case '?': 388 unary_delim = true; 389 code = question; 390 break; 391 392 case (':'): 393 code = colon; 394 unary_delim = true; 395 break; 396 397 case (';'): 398 unary_delim = true; 399 code = semicolon; 400 break; 401 402 case ('{'): 403 unary_delim = true; 404 405 /* 406 * if (ps.in_or_st) ps.block_init = 1; 407 */ 408 code = ps.block_init ? lparen : lbrace; 409 break; 410 411 case ('}'): 412 unary_delim = true; 413 code = ps.block_init ? rparen : rbrace; 414 break; 415 416 case 014: /* a form feed */ 417 unary_delim = ps.last_u_d; 418 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 419 * right */ 420 code = form_feed; 421 break; 422 423 case (','): 424 unary_delim = true; 425 code = comma; 426 break; 427 428 case '.': 429 unary_delim = false; 430 code = period; 431 break; 432 433 case '-': 434 case '+': /* check for -, +, --, ++ */ 435 code = (ps.last_u_d ? unary_op : binary_op); 436 unary_delim = true; 437 438 if (*buf_ptr == token[0]) { 439 /* check for doubled character */ 440 *tok++ = *buf_ptr++; 441 /* buffer overflow will be checked at end of loop */ 442 if (last_code == ident || last_code == rparen) { 443 code = (ps.last_u_d ? unary_op : postop); 444 /* check for following ++ or -- */ 445 unary_delim = false; 446 } 447 } 448 else if (*buf_ptr == '=') 449 /* check for operator += */ 450 *tok++ = *buf_ptr++; 451 else if (token[0] == '-' && *buf_ptr == '>') { 452 /* check for operator -> */ 453 *tok++ = *buf_ptr++; 454 if (!pointer_as_binop) { 455 code = unary_op; 456 unary_delim = false; 457 ps.want_blank = false; 458 } 459 } 460 /* buffer overflow will be checked at end of switch */ 461 462 break; 463 464 case '=': 465 if (ps.in_or_st) 466 ps.block_init = 1; 467 if (chartype[*buf_ptr] == opchar) { /* we have two char 468 * assignment */ 469 tok[-1] = *buf_ptr++; 470 if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr) 471 *tok++ = *buf_ptr++; 472 *tok++ = '='; /* Flip =+ to += */ 473 *tok = 0; 474 } 475 code = binary_op; 476 unary_delim = true; 477 break; 478 /* can drop thru!!! */ 479 480 case '>': 481 case '<': 482 case '!': /* ops like <, <<, <=, !=, etc */ 483 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 484 *tok++ = *buf_ptr; 485 if (++buf_ptr >= buf_end) 486 fill_buffer(); 487 } 488 if (*buf_ptr == '=') 489 *tok++ = *buf_ptr++; 490 code = (ps.last_u_d ? unary_op : binary_op); 491 unary_delim = true; 492 break; 493 494 default: 495 if (token[0] == '/' && *buf_ptr == '*') { 496 /* it is start of comment */ 497 *tok++ = '*'; 498 499 if (++buf_ptr >= buf_end) 500 fill_buffer(); 501 502 code = comment; 503 unary_delim = ps.last_u_d; 504 break; 505 } 506 while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') { 507 /* handle ||, &&, etc, and also things as in int *****i */ 508 *tok++ = *buf_ptr; 509 if (++buf_ptr >= buf_end) 510 fill_buffer(); 511 } 512 code = (ps.last_u_d ? unary_op : binary_op); 513 unary_delim = true; 514 515 516 } /* end of switch */ 517 if (code != newline) { 518 l_struct = false; 519 last_code = code; 520 } 521 if (buf_ptr >= buf_end) /* check for input buffer empty */ 522 fill_buffer(); 523 ps.last_u_d = unary_delim; 524 *tok = '\0'; /* null terminate the token */ 525 return (code); 526 }; 527 528 /* Add the given keyword to the keyword table, using val as the keyword type 529 */ 530 addkey (key, val) 531 char *key; 532 { 533 register struct templ *p = specials; 534 while (p->rwd) 535 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 536 return; 537 else 538 p++; 539 if (p >= specials + sizeof specials / sizeof specials[0]) 540 return; /* For now, table overflows are silently 541 ignored */ 542 p->rwd = key; 543 p->rwcode = val; 544 p[1].rwd = 0; 545 p[1].rwcode = 0; 546 return; 547 } 548