1 /* 2 * Copyright (c) 1980 Regents of the University of California. 3 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms are permitted 7 * provided that this notice is preserved and that due credit is given 8 * to the University of California at Berkeley and the University of 9 * Illinois at Urbana. The name of either University may not be used 10 * to endorse or promote products derived from this software without 11 * specific prior written permission. This software is provided 12 * ``as is'' without express or implied warranty. 13 */ 14 15 #ifndef lint 16 static char sccsid[] = "@(#)lexi.c 5.6 (Berkeley) 03/22/88"; 17 #endif /* not lint */ 18 19 /* 20 * NAME: 21 * lexi 22 * 23 * FUNCTION: 24 * This is the token scanner for indent 25 * 26 * ALGORITHM: 27 * 1) Strip off intervening blanks and/or tabs. 28 * 2) If it is an alphanumeric token, move it to the token buffer "token". 29 * Check if it is a special reserved word that indent will want to 30 * know about. 31 * 3) Non-alphanumeric tokens are handled with a big switch statement. A 32 * flag is kept to remember if the last token was a "unary delimiter", 33 * which forces a following operator to be unary as opposed to binary. 34 * 35 * PARAMETERS: 36 * None 37 * 38 * RETURNS: 39 * An integer code indicating the type of token scanned. 40 * 41 * GLOBALS: 42 * buf_ptr = 43 * had_eof 44 * ps.last_u_d = Set to true iff this token is a "unary delimiter" 45 * 46 * CALLS: 47 * fill_buffer 48 * printf (lib) 49 * 50 * CALLED BY: 51 * main 52 * 53 * NOTES: 54 * Start of comment is passed back so that the comment can be scanned by 55 * pr_comment. 56 * 57 * Strings and character literals are returned just like identifiers. 58 * 59 * HISTORY: 60 * initial coding November 1976 D A Willcox of CAC 61 * 1/7/77 D A Willcox of CAC Fix to provide proper handling 62 * of "int a -1;" 63 * 64 */ 65 66 /* 67 * Here we have the token scanner for indent. It scans off one token and 68 * puts it in the global variable "token". It returns a code, indicating 69 * the type of token scanned. 70 */ 71 72 #include "indent_globs.h" 73 #include "indent_codes.h" 74 #include "ctype.h" 75 76 #define alphanum 1 77 #define opchar 3 78 79 struct templ { 80 char *rwd; 81 int rwcode; 82 }; 83 84 struct templ specials[100] = 85 { 86 "switch", 1, 87 "case", 2, 88 "break", 0, 89 "struct", 3, 90 "union", 3, 91 "enum", 3, 92 "default", 2, 93 "int", 4, 94 "char", 4, 95 "float", 4, 96 "double", 4, 97 "long", 4, 98 "short", 4, 99 "typdef", 4, 100 "unsigned", 4, 101 "register", 4, 102 "static", 4, 103 "global", 4, 104 "extern", 4, 105 "void", 4, 106 "goto", 0, 107 "return", 0, 108 "if", 5, 109 "while", 5, 110 "for", 5, 111 "else", 6, 112 "do", 6, 113 "sizeof", 7, 114 0, 0 115 }; 116 117 char chartype[128] = 118 { /* this is used to facilitate the decision 119 * of what type (alphanumeric, operator) 120 * each character is */ 121 0, 0, 0, 0, 0, 0, 0, 0, 122 0, 0, 0, 0, 0, 0, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 124 0, 0, 0, 0, 0, 0, 0, 0, 125 0, 3, 0, 0, 0, 3, 3, 0, 126 0, 0, 3, 3, 0, 3, 3, 3, 127 1, 1, 1, 1, 1, 1, 1, 1, 128 1, 1, 0, 0, 3, 3, 3, 3, 129 0, 1, 1, 1, 1, 1, 1, 1, 130 1, 1, 1, 1, 1, 1, 1, 1, 131 1, 1, 1, 1, 1, 1, 1, 1, 132 1, 1, 1, 0, 0, 0, 3, 1, 133 0, 1, 1, 1, 1, 1, 1, 1, 134 1, 1, 1, 1, 1, 1, 1, 1, 135 1, 1, 1, 1, 1, 1, 1, 1, 136 1, 1, 1, 0, 3, 0, 3, 0 137 }; 138 139 140 141 142 int 143 lexi() 144 { 145 register char *tok; /* local pointer to next char in token */ 146 int unary_delim; /* this is set to 1 if the current token 147 * 148 * forces a following operator to be unary */ 149 static int last_code; /* the last token type returned */ 150 static int l_struct; /* set to 1 if the last token was 'struct' */ 151 int code; /* internal code to be returned */ 152 char qchar; /* the delimiter character for a string */ 153 154 tok = token; /* point to start of place to save token */ 155 unary_delim = false; 156 ps.col_1 = ps.last_nl; /* tell world that this token started in 157 * column 1 iff the last thing scanned was 158 * nl */ 159 ps.last_nl = false; 160 161 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 162 ps.col_1 = false; /* leading blanks imply token is not in 163 * column 1 */ 164 if (++buf_ptr >= buf_end) 165 fill_buffer(); 166 } 167 168 /* Scan an alphanumeric token. Note that we must also handle 169 * stuff like "1.0e+03" and "7e-6". */ 170 if (chartype[*buf_ptr & 0177] == alphanum) { /* we have a character 171 * or number */ 172 register char *j; /* used for searching thru list of 173 * reserved words */ 174 register struct templ *p; 175 register int c; 176 177 do { /* copy it over */ 178 *tok++ = *buf_ptr++; 179 if (buf_ptr >= buf_end) 180 fill_buffer(); 181 } while (chartype[c = *buf_ptr & 0177] == alphanum || 182 isdigit(token[0]) && (c == '+' || c == '-') && 183 (tok[-1] == 'e' || tok[-1] == 'E')); 184 *tok++ = '\0'; 185 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 186 if (++buf_ptr >= buf_end) 187 fill_buffer(); 188 } 189 ps.its_a_keyword = false; 190 ps.sizeof_keyword = false; 191 if (l_struct) { /* if last token was 'struct', then this 192 * token should be treated as a 193 * declaration */ 194 l_struct = false; 195 last_code = ident; 196 ps.last_u_d = true; 197 return (decl); 198 } 199 ps.last_u_d = false; /* Operator after indentifier is binary */ 200 last_code = ident; /* Remember that this is the code we will 201 * return */ 202 203 /* 204 * This loop will check if the token is a keyword. 205 */ 206 for (p = specials; (j = p->rwd) != 0; p++) { 207 tok = token; /* point at scanned token */ 208 if (*j++ != *tok++ || *j++ != *tok++) 209 continue; /* This test depends on the fact that 210 * identifiers are always at least 1 211 * character long (ie. the first two bytes 212 * of the identifier are always 213 * meaningful) */ 214 if (tok[-1] == 0) 215 break; /* If its a one-character identifier */ 216 while (*tok++ == *j) 217 if (*j++ == 0) 218 goto found_keyword; /* I wish that C had a multi-level 219 * break... */ 220 } 221 if (p->rwd) { /* we have a keyword */ 222 found_keyword: 223 ps.its_a_keyword = true; 224 ps.last_u_d = true; 225 switch (p->rwcode) { 226 case 1: /* it is a switch */ 227 return (swstmt); 228 case 2: /* a case or default */ 229 return (casestmt); 230 231 case 3: /* a "struct" */ 232 if (ps.p_l_follow) 233 break; /* inside parens: cast */ 234 l_struct = true; 235 236 /* 237 * Next time around, we will want to know that we have 238 * had a 'struct' 239 */ 240 case 4: /* one of the declaration keywords */ 241 if (ps.p_l_follow) { 242 ps.cast_mask |= 1 << ps.p_l_follow; 243 break; /* inside parens: cast */ 244 } 245 last_code = decl; 246 return (decl); 247 248 case 5: /* if, while, for */ 249 return (sp_paren); 250 251 case 6: /* do, else */ 252 return (sp_nparen); 253 254 case 7: 255 ps.sizeof_keyword = true; 256 default: /* all others are treated like any other 257 * identifier */ 258 return (ident); 259 } /* end of switch */ 260 } /* end of if (found_it) */ 261 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 262 && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) { 263 strncpy(ps.procname, token, sizeof ps.procname - 1); 264 ps.in_parameter_declaration = 1; 265 } 266 267 /* 268 * The following hack attempts to guess whether or not the current 269 * token is in fact a declaration keyword -- one that has been 270 * typedefd 271 */ 272 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr)) 273 && !ps.p_l_follow 274 && (ps.last_token == rparen || ps.last_token == semicolon || 275 ps.last_token == decl || 276 ps.last_token == lbrace || ps.last_token == rbrace)) { 277 ps.its_a_keyword = true; 278 ps.last_u_d = true; 279 last_code = decl; 280 return decl; 281 } 282 if (last_code == decl) /* if this is a declared variable, then 283 * following sign is unary */ 284 ps.last_u_d = true; /* will make "int a -1" work */ 285 last_code = ident; 286 return (ident); /* the ident is not in the list */ 287 } /* end of procesing for alpanum character */ 288 /* Scan a non-alphanumeric token */ 289 290 *tok++ = *buf_ptr; /* if it is only a one-character token, it 291 * is moved here */ 292 *tok = '\0'; 293 if (++buf_ptr >= buf_end) 294 fill_buffer(); 295 296 switch (*token) { 297 case '\n': 298 unary_delim = ps.last_u_d; 299 ps.last_nl = true; /* remember that we just had a newline */ 300 code = (had_eof ? 0 : newline); 301 302 /* 303 * if data has been exausted, the newline is a dummy, and we 304 * should return code to stop 305 */ 306 break; 307 308 case '\'': /* start of quoted character */ 309 case '"': /* start of string */ 310 qchar = *token; 311 if (troff) { 312 tok[-1] = '`'; 313 if (qchar == '"') 314 *tok++ = '`'; 315 *tok++ = BACKSLASH; 316 *tok++ = 'f'; 317 *tok++ = 'L'; 318 } 319 do { /* copy the string */ 320 while (1) { /* move one character or [/<char>]<char> */ 321 if (*buf_ptr == '\n') { 322 printf("%d: Unterminated literal\n", line_no); 323 goto stop_lit; 324 } 325 *tok = *buf_ptr++; 326 if (buf_ptr >= buf_end) 327 fill_buffer(); 328 if (had_eof || ((tok - token) > (bufsize - 2))) { 329 printf("Unterminated literal\n"); 330 ++tok; 331 goto stop_lit; 332 /* get outof literal copying loop */ 333 } 334 if (*tok == BACKSLASH) { /* if escape, copy extra 335 * char */ 336 if (*buf_ptr == '\n') /* check for escaped 337 * newline */ 338 ++line_no; 339 if (troff) { 340 *++tok = BACKSLASH; 341 if (*buf_ptr == BACKSLASH) 342 *++tok = BACKSLASH; 343 } 344 *++tok = *buf_ptr++; 345 ++tok; /* we must increment this again because we 346 * copied two chars */ 347 if (buf_ptr >= buf_end) 348 fill_buffer(); 349 } 350 else 351 break; /* we copied one character */ 352 } /* end of while (1) */ 353 } while (*tok++ != qchar); 354 if (troff) { 355 tok[-1] = BACKSLASH; 356 *tok++ = 'f'; 357 *tok++ = 'R'; 358 *tok++ = '\''; 359 if (qchar == '"') 360 *tok++ = '\''; 361 } 362 stop_lit: 363 code = ident; 364 break; 365 366 case ('('): 367 case ('['): 368 unary_delim = true; 369 code = lparen; 370 break; 371 372 case (')'): 373 case (']'): 374 code = rparen; 375 break; 376 377 case '#': 378 unary_delim = ps.last_u_d; 379 code = preesc; 380 break; 381 382 case '?': 383 unary_delim = true; 384 code = question; 385 break; 386 387 case (':'): 388 code = colon; 389 unary_delim = true; 390 break; 391 392 case (';'): 393 unary_delim = true; 394 code = semicolon; 395 break; 396 397 case ('{'): 398 unary_delim = true; 399 400 /* 401 * if (ps.in_or_st) ps.block_init = 1; 402 */ 403 code = ps.block_init ? lparen : lbrace; 404 break; 405 406 case ('}'): 407 unary_delim = true; 408 code = ps.block_init ? rparen : rbrace; 409 break; 410 411 case 014: /* a form feed */ 412 unary_delim = ps.last_u_d; 413 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 414 * right */ 415 code = form_feed; 416 break; 417 418 case (','): 419 unary_delim = true; 420 code = comma; 421 break; 422 423 case '.': 424 unary_delim = false; 425 code = period; 426 break; 427 428 case '-': 429 case '+': /* check for -, +, --, ++ */ 430 code = (ps.last_u_d ? unary_op : binary_op); 431 unary_delim = true; 432 433 if (*buf_ptr == token[0]) { 434 /* check for doubled character */ 435 *tok++ = *buf_ptr++; 436 /* buffer overflow will be checked at end of loop */ 437 if (last_code == ident || last_code == rparen) { 438 code = (ps.last_u_d ? unary_op : postop); 439 /* check for following ++ or -- */ 440 unary_delim = false; 441 } 442 } 443 else if (*buf_ptr == '=') 444 /* check for operator += */ 445 *tok++ = *buf_ptr++; 446 else if (token[0] == '-' && *buf_ptr == '>') { 447 /* check for operator -> */ 448 *tok++ = *buf_ptr++; 449 if (!pointer_as_binop) { 450 code = unary_op; 451 unary_delim = false; 452 ps.want_blank = false; 453 } 454 } 455 /* buffer overflow will be checked at end of switch */ 456 457 break; 458 459 case '=': 460 if (ps.in_or_st) 461 ps.block_init = 1; 462 if (chartype[*buf_ptr] == opchar) { /* we have two char 463 * assignment */ 464 tok[-1] = *buf_ptr++; 465 if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr) 466 *tok++ = *buf_ptr++; 467 *tok++ = '='; /* Flip =+ to += */ 468 *tok = 0; 469 } 470 code = binary_op; 471 unary_delim = true; 472 break; 473 /* can drop thru!!! */ 474 475 case '>': 476 case '<': 477 case '!': /* ops like <, <<, <=, !=, etc */ 478 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 479 *tok++ = *buf_ptr; 480 if (++buf_ptr >= buf_end) 481 fill_buffer(); 482 } 483 if (*buf_ptr == '=') 484 *tok++ = *buf_ptr++; 485 code = (ps.last_u_d ? unary_op : binary_op); 486 unary_delim = true; 487 break; 488 489 default: 490 if (token[0] == '/' && *buf_ptr == '*') { 491 /* it is start of comment */ 492 *tok++ = '*'; 493 494 if (++buf_ptr >= buf_end) 495 fill_buffer(); 496 497 code = comment; 498 unary_delim = ps.last_u_d; 499 break; 500 } 501 while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') { 502 /* handle ||, &&, etc, and also things as in int *****i */ 503 *tok++ = *buf_ptr; 504 if (++buf_ptr >= buf_end) 505 fill_buffer(); 506 } 507 code = (ps.last_u_d ? unary_op : binary_op); 508 unary_delim = true; 509 510 511 } /* end of switch */ 512 if (code != newline) { 513 l_struct = false; 514 last_code = code; 515 } 516 if (buf_ptr >= buf_end) /* check for input buffer empty */ 517 fill_buffer(); 518 ps.last_u_d = unary_delim; 519 *tok = '\0'; /* null terminate the token */ 520 return (code); 521 }; 522 523 /* Add the given keyword to the keyword table, using val as the keyword type 524 */ 525 addkey (key, val) 526 char *key; 527 { 528 register struct templ *p = specials; 529 while (p->rwd) 530 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 531 return; 532 else 533 p++; 534 if (p >= specials + sizeof specials / sizeof specials[0]) 535 return; /* For now, table overflows are silently 536 ignored */ 537 p->rwd = key; 538 p->rwcode = val; 539 p[1].rwd = 0; 540 p[1].rwcode = 0; 541 return; 542 } 543