1*8804Smckusick static char sccsid[] = "@(#)lexi.c 4.1 (Berkeley) 10/21/82"; 2*8804Smckusick 3*8804Smckusick /* 4*8804Smckusick 5*8804Smckusick Copyright (C) 1976 6*8804Smckusick by the 7*8804Smckusick Board of Trustees 8*8804Smckusick of the 9*8804Smckusick University of Illinois 10*8804Smckusick 11*8804Smckusick All rights reserved 12*8804Smckusick 13*8804Smckusick 14*8804Smckusick NAME: 15*8804Smckusick lexi 16*8804Smckusick 17*8804Smckusick FUNCTION: 18*8804Smckusick This is the token scanner for indent 19*8804Smckusick 20*8804Smckusick ALGORITHM: 21*8804Smckusick 1) Strip off intervening blanks and/or tabs. 22*8804Smckusick 2) If it is an alphanumeric token, move it to the token buffer "token". 23*8804Smckusick Check if it is a special reserved word that indent will want to 24*8804Smckusick know about. 25*8804Smckusick 3) Non-alphanumeric tokens are handled with a big switch statement. A 26*8804Smckusick flag is kept to remember if the last token was a "unary delimiter", 27*8804Smckusick which forces a following operator to be unary as opposed to binary. 28*8804Smckusick 29*8804Smckusick PARAMETERS: 30*8804Smckusick None 31*8804Smckusick 32*8804Smckusick RETURNS: 33*8804Smckusick An integer code indicating the type of token scanned. 34*8804Smckusick 35*8804Smckusick GLOBALS: 36*8804Smckusick buf_ptr = 37*8804Smckusick had_eof 38*8804Smckusick last_u_d = Set to true iff this token is a "unary delimiter" 39*8804Smckusick 40*8804Smckusick CALLS: 41*8804Smckusick fill_buffer 42*8804Smckusick printf (lib) 43*8804Smckusick 44*8804Smckusick CALLED BY: 45*8804Smckusick main 46*8804Smckusick 47*8804Smckusick NOTES: 48*8804Smckusick Start of comment is passed back so that the comment can be scanned by 49*8804Smckusick pr_comment. 50*8804Smckusick 51*8804Smckusick Strings and character literals are returned just like identifiers. 52*8804Smckusick 53*8804Smckusick HISTORY: 54*8804Smckusick initial coding November 1976 D A Willcox of CAC 55*8804Smckusick 1/7/77 D A Willcox of CAC Fix to provide proper handling 56*8804Smckusick of "int a -1;" 57*8804Smckusick 58*8804Smckusick */ 59*8804Smckusick 60*8804Smckusick /* Here we have the token scanner for indent. It scans off one token and 61*8804Smckusick puts it in the global variable "token". It returns a code, indicating the 62*8804Smckusick type of token scanned. */ 63*8804Smckusick 64*8804Smckusick #include "indent_globs.h"; 65*8804Smckusick #include "indent_codes.h"; 66*8804Smckusick 67*8804Smckusick 68*8804Smckusick 69*8804Smckusick #define alphanum 1 70*8804Smckusick #define opchar 3 71*8804Smckusick 72*8804Smckusick struct templ { 73*8804Smckusick char *rwd; 74*8804Smckusick int rwcode; 75*8804Smckusick }; 76*8804Smckusick 77*8804Smckusick struct templ specials[] = 78*8804Smckusick { 79*8804Smckusick "switch", 1, 80*8804Smckusick "case", 2, 81*8804Smckusick "struct", 3, 82*8804Smckusick "default", 2, 83*8804Smckusick "int", 4, 84*8804Smckusick "char", 4, 85*8804Smckusick "float", 4, 86*8804Smckusick "double", 4, 87*8804Smckusick "long", 4, 88*8804Smckusick "short", 4, 89*8804Smckusick "typdef", 4, 90*8804Smckusick "unsigned", 4, 91*8804Smckusick "register", 4, 92*8804Smckusick "static", 4, 93*8804Smckusick "global", 4, 94*8804Smckusick "extern", 4, 95*8804Smckusick "if", 5, 96*8804Smckusick "while", 5, 97*8804Smckusick "for", 5, 98*8804Smckusick "else", 6, 99*8804Smckusick "do", 6, 100*8804Smckusick "sizeof", 0, 101*8804Smckusick 0, 0 102*8804Smckusick }; 103*8804Smckusick 104*8804Smckusick char chartype[128] = 105*8804Smckusick { /* this is used to facilitate the decision of what type 106*8804Smckusick (alphanumeric, operator) each character is */ 107*8804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 108*8804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 109*8804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 110*8804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 111*8804Smckusick 0, 3, 0, 0, 0, 3, 3, 0, 112*8804Smckusick 0, 0, 3, 3, 0, 3, 3, 3, 113*8804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 114*8804Smckusick 1, 1, 0, 0, 3, 3, 3, 3, 115*8804Smckusick 0, 1, 1, 1, 1, 1, 1, 1, 116*8804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 117*8804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 118*8804Smckusick 1, 1, 1, 0, 0, 0, 3, 1, 119*8804Smckusick 0, 1, 1, 1, 1, 1, 1, 1, 120*8804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 121*8804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 122*8804Smckusick 1, 1, 1, 0, 3, 0, 3, 0 123*8804Smckusick }; 124*8804Smckusick 125*8804Smckusick int last_nl = true; 126*8804Smckusick /* this is true if the last thing scanned was a newline */ 127*8804Smckusick 128*8804Smckusick 129*8804Smckusick 130*8804Smckusick int lexi () { 131*8804Smckusick register char *tok; 132*8804Smckusick /* local pointer to next char in token */ 133*8804Smckusick register int i; 134*8804Smckusick /* local loop counter */ 135*8804Smckusick register char *j; 136*8804Smckusick /* used for searching thru list of reserved words */ 137*8804Smckusick int unary_delim; 138*8804Smckusick /* this is set to 1 if the current token forces a following operator to be 139*8804Smckusick unary */ 140*8804Smckusick static int last_code; 141*8804Smckusick /* the last token type returned */ 142*8804Smckusick static int l_struct; 143*8804Smckusick /* set to 1 if the last token was 'struct' */ 144*8804Smckusick int found_it; 145*8804Smckusick int code; /* internal code to be returned */ 146*8804Smckusick char qchar; /* the delimiter character for a string */ 147*8804Smckusick 148*8804Smckusick tok = token; /* point to start of place to save token */ 149*8804Smckusick unary_delim = false; 150*8804Smckusick col_1 = last_nl; /* tell world that this token started in column 151*8804Smckusick 1 iff the last thing scanned was nl */ 152*8804Smckusick last_nl = false; 153*8804Smckusick 154*8804Smckusick while (*buf_ptr == ' ' || *buf_ptr == '\t') { 155*8804Smckusick /* get rid of blanks */ 156*8804Smckusick col_1 = false; /* leading blanks imply token is not in column 1 157*8804Smckusick */ 158*8804Smckusick if (++buf_ptr >= buf_end) 159*8804Smckusick fill_buffer (); 160*8804Smckusick } 161*8804Smckusick 162*8804Smckusick /*----------------------------------------------------------*\ 163*8804Smckusick | Scan an alphanumeric token 164*8804Smckusick \*----------------------------------------------------------*/ 165*8804Smckusick 166*8804Smckusick if (chartype[*buf_ptr & 0177] == alphanum) { 167*8804Smckusick /* we have a character or number */ 168*8804Smckusick while (chartype[*buf_ptr & 0177] == alphanum) { 169*8804Smckusick /* copy it over */ 170*8804Smckusick *tok++ = *buf_ptr++; 171*8804Smckusick if (buf_ptr >= buf_end) 172*8804Smckusick fill_buffer (); 173*8804Smckusick } 174*8804Smckusick 175*8804Smckusick *tok++ = '\0'; 176*8804Smckusick 177*8804Smckusick if (l_struct) { /* if last token was 'struct', then this token 178*8804Smckusick should be treated as a declaration */ 179*8804Smckusick l_struct = false; 180*8804Smckusick last_code = ident; 181*8804Smckusick last_u_d = true; 182*8804Smckusick return (decl); 183*8804Smckusick } 184*8804Smckusick 185*8804Smckusick last_u_d = false; /* operator after indentifier is binary */ 186*8804Smckusick 187*8804Smckusick for (i = 0; specials[i].rwd != 0; ++i) { 188*8804Smckusick /* this loop will check if the token is a keyword. if so, a following 189*8804Smckusick operator is unary */ 190*8804Smckusick last_code = ident; /* remember that this is the code we will return 191*8804Smckusick */ 192*8804Smckusick j = specials[i].rwd; 193*8804Smckusick /* point at ith reserved word */ 194*8804Smckusick tok = token; /* point at scanned toekn */ 195*8804Smckusick found_it = true; /* set to false if not found */ 196*8804Smckusick do { 197*8804Smckusick if (*tok++ != *j) { 198*8804Smckusick found_it = false; 199*8804Smckusick break; 200*8804Smckusick } 201*8804Smckusick } while (*j++); 202*8804Smckusick 203*8804Smckusick if (found_it) { /* we have a keyword */ 204*8804Smckusick last_u_d = true; 205*8804Smckusick switch (specials[i].rwcode) { 206*8804Smckusick case 1: /* it is a switch */ 207*8804Smckusick return (swstmt); 208*8804Smckusick case 2: /* a case or default */ 209*8804Smckusick return (casestmt); 210*8804Smckusick 211*8804Smckusick case 3: /* a "struct" */ 212*8804Smckusick l_struct = true; 213*8804Smckusick /* Next time around, we will want to know that we have had 214*8804Smckusick a 'struct' */ 215*8804Smckusick case 4: /* one of the declaration keywords */ 216*8804Smckusick if(p_l_follow) break; /* inside parens: cast */ 217*8804Smckusick last_code = decl; 218*8804Smckusick return (decl); 219*8804Smckusick 220*8804Smckusick case 5: /* if, while, for */ 221*8804Smckusick return (sp_paren); 222*8804Smckusick 223*8804Smckusick case 6: /* do, else */ 224*8804Smckusick return (sp_nparen); 225*8804Smckusick 226*8804Smckusick default: /* all others are treated like any other 227*8804Smckusick identifier */ 228*8804Smckusick return (ident); 229*8804Smckusick } /* end of switch */ 230*8804Smckusick } /* end of if (found_it) */ 231*8804Smckusick 232*8804Smckusick } 233*8804Smckusick 234*8804Smckusick if (last_code == decl) /* if this is a declared variable, then 235*8804Smckusick following sign is unary */ 236*8804Smckusick last_u_d = true; /* will make "int a -1" work */ 237*8804Smckusick last_code = ident; 238*8804Smckusick return (ident); /* the ident is not in the list */ 239*8804Smckusick } /* end of procesing for alpanum character */ 240*8804Smckusick 241*8804Smckusick 242*8804Smckusick 243*8804Smckusick /*----------------------------------------------------------*\ 244*8804Smckusick | Scan a non-alphanumeric token 245*8804Smckusick \*----------------------------------------------------------*/ 246*8804Smckusick 247*8804Smckusick *tok++ = *buf_ptr; /* if it is only a one-character token, it is 248*8804Smckusick moved here */ 249*8804Smckusick *tok = '\0'; 250*8804Smckusick if (++buf_ptr >= buf_end) 251*8804Smckusick fill_buffer (); 252*8804Smckusick 253*8804Smckusick switch (*token) { 254*8804Smckusick case '\n': 255*8804Smckusick unary_delim = last_u_d; 256*8804Smckusick last_nl = true; /* remember that we just had a newline */ 257*8804Smckusick code = (had_eof ? 0 : newline); 258*8804Smckusick /* if data has been exausted, the newline is a dummy, and we should 259*8804Smckusick return code to stop */ 260*8804Smckusick break; 261*8804Smckusick 262*8804Smckusick case '\'': /* start of quoted character */ 263*8804Smckusick qchar = '\''; /* remember final delimiter */ 264*8804Smckusick goto copy_lit; /* and go to common literal code */ 265*8804Smckusick 266*8804Smckusick case '"': /* start of string */ 267*8804Smckusick qchar = '"'; 268*8804Smckusick 269*8804Smckusick copy_lit: 270*8804Smckusick do { /* copy the string */ 271*8804Smckusick while (1) { /* move one character or [/<char>]<char> */ 272*8804Smckusick if (*buf_ptr == '\n') { 273*8804Smckusick /* check for unterminated literal */ 274*8804Smckusick printf ("%d: Unterminated literal\n", line_no); 275*8804Smckusick goto stop_lit; 276*8804Smckusick /* Don't copy any more */ 277*8804Smckusick } 278*8804Smckusick 279*8804Smckusick *tok = *buf_ptr++; 280*8804Smckusick if (buf_ptr >= buf_end) 281*8804Smckusick fill_buffer (); 282*8804Smckusick if (had_eof || ((tok - token) > (bufsize - 2))) { 283*8804Smckusick printf ("Unterminated literal\n"); 284*8804Smckusick ++tok; 285*8804Smckusick goto stop_lit; 286*8804Smckusick /* get outof literal copying loop */ 287*8804Smckusick } 288*8804Smckusick 289*8804Smckusick if (*tok == '\\') { 290*8804Smckusick /* if escape, copy extra char */ 291*8804Smckusick if (*buf_ptr == '\n') 292*8804Smckusick /* check for escaped newline */ 293*8804Smckusick ++line_no; 294*8804Smckusick *(++tok) = *buf_ptr++; 295*8804Smckusick ++tok; /* we must increment this again because we 296*8804Smckusick copied two chars */ 297*8804Smckusick if (buf_ptr >= buf_end) 298*8804Smckusick fill_buffer (); 299*8804Smckusick } 300*8804Smckusick else 301*8804Smckusick break; /* we copied one character */ 302*8804Smckusick } /* end of while (1) */ 303*8804Smckusick } while (*tok++ != qchar); 304*8804Smckusick 305*8804Smckusick stop_lit: 306*8804Smckusick code = ident; 307*8804Smckusick break; 308*8804Smckusick 309*8804Smckusick case ('('): 310*8804Smckusick case ('['): 311*8804Smckusick unary_delim = true; 312*8804Smckusick code = lparen; 313*8804Smckusick break; 314*8804Smckusick 315*8804Smckusick case (')'): 316*8804Smckusick case (']'): 317*8804Smckusick code = rparen; 318*8804Smckusick break; 319*8804Smckusick 320*8804Smckusick case '#': 321*8804Smckusick unary_delim = last_u_d; 322*8804Smckusick code = preesc; 323*8804Smckusick break; 324*8804Smckusick 325*8804Smckusick case '?': 326*8804Smckusick unary_delim = true; 327*8804Smckusick code = question; 328*8804Smckusick break; 329*8804Smckusick 330*8804Smckusick case (':'): 331*8804Smckusick code = colon; 332*8804Smckusick unary_delim = true; 333*8804Smckusick break; 334*8804Smckusick 335*8804Smckusick case (';'): 336*8804Smckusick unary_delim = true; 337*8804Smckusick code = semicolon; 338*8804Smckusick break; 339*8804Smckusick 340*8804Smckusick case ('{'): 341*8804Smckusick unary_delim = true; 342*8804Smckusick code = lbrace; 343*8804Smckusick break; 344*8804Smckusick 345*8804Smckusick case ('}'): 346*8804Smckusick unary_delim = true; 347*8804Smckusick code = rbrace; 348*8804Smckusick break; 349*8804Smckusick 350*8804Smckusick case 014: /* a form feed */ 351*8804Smckusick unary_delim = last_u_d; 352*8804Smckusick last_nl = true; /* remember this so we can set 'col_1' right */ 353*8804Smckusick code = form_feed; 354*8804Smckusick break; 355*8804Smckusick 356*8804Smckusick case (','): 357*8804Smckusick unary_delim = true; 358*8804Smckusick code = comma; 359*8804Smckusick break; 360*8804Smckusick 361*8804Smckusick case '.': 362*8804Smckusick unary_delim = false; 363*8804Smckusick code = period; 364*8804Smckusick break; 365*8804Smckusick 366*8804Smckusick case '-': 367*8804Smckusick case '+': /* check for -, +, --, ++ */ 368*8804Smckusick code = (last_u_d ? unary_op : binary_op); 369*8804Smckusick unary_delim = true; 370*8804Smckusick 371*8804Smckusick if (*buf_ptr == token[0]) { 372*8804Smckusick /* check for doubled character */ 373*8804Smckusick *tok++ = *buf_ptr++; 374*8804Smckusick /* buffer overflow will be checked at end of loop */ 375*8804Smckusick if (last_code == ident || last_code == rparen) { 376*8804Smckusick code = (last_u_d ? unary_op : postop); 377*8804Smckusick /* check for following ++ or -- */ 378*8804Smckusick unary_delim = false; 379*8804Smckusick } 380*8804Smckusick } 381*8804Smckusick else 382*8804Smckusick if (*buf_ptr == '>' || *buf_ptr == '=') 383*8804Smckusick /* check for operator -> or += */ 384*8804Smckusick *tok++ = *buf_ptr++; 385*8804Smckusick /* buffer overflow will be checked at end of switch */ 386*8804Smckusick 387*8804Smckusick break; 388*8804Smckusick 389*8804Smckusick case '=': 390*8804Smckusick if (chartype[*buf_ptr] == opchar) { 391*8804Smckusick /* we have two char assignment */ 392*8804Smckusick *tok++ = *buf_ptr; 393*8804Smckusick /* move second character */ 394*8804Smckusick if (++buf_ptr >= buf_end) 395*8804Smckusick fill_buffer (); 396*8804Smckusick } 397*8804Smckusick 398*8804Smckusick code = binary_op; 399*8804Smckusick unary_delim = true; 400*8804Smckusick if (token[1] != '<' && token[1] != '>') 401*8804Smckusick /* check for possible 3 char operator */ 402*8804Smckusick break; 403*8804Smckusick /* can drop thru!!! */ 404*8804Smckusick 405*8804Smckusick case '>': 406*8804Smckusick case '<': 407*8804Smckusick case '!': /* ops like <, <<, <=, !=, etc */ 408*8804Smckusick if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 409*8804Smckusick *tok++ = *buf_ptr; 410*8804Smckusick if (++buf_ptr >= buf_end) 411*8804Smckusick fill_buffer (); 412*8804Smckusick } 413*8804Smckusick 414*8804Smckusick if (*buf_ptr == '=') 415*8804Smckusick *tok++ = *buf_ptr++; 416*8804Smckusick code = (last_u_d ? unary_op : binary_op); 417*8804Smckusick unary_delim = true; 418*8804Smckusick break; 419*8804Smckusick 420*8804Smckusick default: 421*8804Smckusick if (token[0] == '/' && *buf_ptr == '*') { 422*8804Smckusick /* it is start of comment */ 423*8804Smckusick *tok++ = '*'; 424*8804Smckusick 425*8804Smckusick if (++buf_ptr >= buf_end) 426*8804Smckusick fill_buffer (); 427*8804Smckusick 428*8804Smckusick code = comment; 429*8804Smckusick unary_delim = last_u_d; 430*8804Smckusick break; 431*8804Smckusick } 432*8804Smckusick 433*8804Smckusick while (*(tok - 1) == *buf_ptr || *buf_ptr=='=') { 434*8804Smckusick /* handle ||, &&, etc, and also things as in int *****i */ 435*8804Smckusick *tok++ = *buf_ptr; 436*8804Smckusick if (++buf_ptr >= buf_end) 437*8804Smckusick fill_buffer (); 438*8804Smckusick } 439*8804Smckusick 440*8804Smckusick 441*8804Smckusick code = (last_u_d ? unary_op : binary_op); 442*8804Smckusick unary_delim = true; 443*8804Smckusick 444*8804Smckusick 445*8804Smckusick } /* end of switch */ 446*8804Smckusick 447*8804Smckusick if (code != newline) { 448*8804Smckusick l_struct = false; 449*8804Smckusick last_code = code; 450*8804Smckusick } 451*8804Smckusick 452*8804Smckusick if (buf_ptr >= buf_end) /* check for input buffer empty */ 453*8804Smckusick fill_buffer (); 454*8804Smckusick last_u_d = unary_delim; 455*8804Smckusick *tok = '\0'; /* null terminate the token */ 456*8804Smckusick return (code); 457*8804Smckusick }; 458