121970Sdist /* 221970Sdist * Copyright (c) 1980 Regents of the University of California. 321970Sdist * All rights reserved. The Berkeley software License Agreement 421970Sdist * specifies the terms and conditions for redistribution. 521970Sdist */ 68804Smckusick 721970Sdist #ifndef lint 8*24649Smckusick static char sccsid[] = "@(#)lexi.c 5.3 (Berkeley) 09/08/85"; 921970Sdist #endif not lint 1021970Sdist 1124455Smckusick /*- 1224455Smckusick * 1324455Smckusick * Copyright (C) 1976 1424455Smckusick * by the 1524455Smckusick * Board of Trustees 1624455Smckusick * of the 1724455Smckusick * University of Illinois 1824455Smckusick * 1924455Smckusick * All rights reserved 2024455Smckusick * 2124455Smckusick * 2224455Smckusick * NAME: 2324455Smckusick * lexi 2424455Smckusick * 2524455Smckusick * FUNCTION: 2624455Smckusick * This is the token scanner for indent 2724455Smckusick * 2824455Smckusick * ALGORITHM: 2924455Smckusick * 1) Strip off intervening blanks and/or tabs. 3024455Smckusick * 2) If it is an alphanumeric token, move it to the token buffer "token". 3124455Smckusick * Check if it is a special reserved word that indent will want to 3224455Smckusick * know about. 3324455Smckusick * 3) Non-alphanumeric tokens are handled with a big switch statement. A 3424455Smckusick * flag is kept to remember if the last token was a "unary delimiter", 3524455Smckusick * which forces a following operator to be unary as opposed to binary. 3624455Smckusick * 3724455Smckusick * PARAMETERS: 3824455Smckusick * None 3924455Smckusick * 4024455Smckusick * RETURNS: 4124455Smckusick * An integer code indicating the type of token scanned. 4224455Smckusick * 4324455Smckusick * GLOBALS: 4424455Smckusick * buf_ptr = 4524455Smckusick * had_eof 4624455Smckusick * ps.last_u_d = Set to true iff this token is a "unary delimiter" 4724455Smckusick * 4824455Smckusick * CALLS: 4924455Smckusick * fill_buffer 5024455Smckusick * printf (lib) 5124455Smckusick * 5224455Smckusick * CALLED BY: 5324455Smckusick * main 5424455Smckusick * 5524455Smckusick * NOTES: 5624455Smckusick * Start of comment is passed back so that the comment can be scanned by 5724455Smckusick * pr_comment. 5824455Smckusick * 5924455Smckusick * Strings and character literals are returned just like identifiers. 6024455Smckusick * 6124455Smckusick * HISTORY: 6224455Smckusick * initial coding November 1976 D A Willcox of CAC 6324455Smckusick * 1/7/77 D A Willcox of CAC Fix to provide proper handling 6424455Smckusick * of "int a -1;" 6524455Smckusick * 6624455Smckusick */ 6724455Smckusick 688804Smckusick /* 6924455Smckusick * Here we have the token scanner for indent. It scans off one token and 7024455Smckusick * puts it in the global variable "token". It returns a code, indicating 7124455Smckusick * the type of token scanned. 7224455Smckusick */ 738804Smckusick 748804Smckusick #include "indent_globs.h"; 758804Smckusick #include "indent_codes.h"; 7624455Smckusick #include "ctype.h" 778804Smckusick 788804Smckusick #define alphanum 1 798804Smckusick #define opchar 3 808804Smckusick 818804Smckusick struct templ { 8224455Smckusick char *rwd; 8324455Smckusick int rwcode; 848804Smckusick }; 858804Smckusick 8624455Smckusick struct templ specials[100] = 878804Smckusick { 888804Smckusick "switch", 1, 898804Smckusick "case", 2, 9024455Smckusick "break", 0, 918804Smckusick "struct", 3, 9224455Smckusick "union", 3, 9324455Smckusick "enum", 3, 948804Smckusick "default", 2, 958804Smckusick "int", 4, 968804Smckusick "char", 4, 978804Smckusick "float", 4, 988804Smckusick "double", 4, 998804Smckusick "long", 4, 1008804Smckusick "short", 4, 1018804Smckusick "typdef", 4, 1028804Smckusick "unsigned", 4, 1038804Smckusick "register", 4, 1048804Smckusick "static", 4, 1058804Smckusick "global", 4, 1068804Smckusick "extern", 4, 10724455Smckusick "void", 4, 10824455Smckusick "goto", 0, 10924455Smckusick "return", 0, 1108804Smckusick "if", 5, 1118804Smckusick "while", 5, 1128804Smckusick "for", 5, 1138804Smckusick "else", 6, 1148804Smckusick "do", 6, 11524455Smckusick "sizeof", 7, 1168804Smckusick 0, 0 1178804Smckusick }; 1188804Smckusick 11924455Smckusick char chartype[128] = 12024455Smckusick { /* this is used to facilitate the decision 12124455Smckusick * of what type (alphanumeric, operator) 12224455Smckusick * each character is */ 1238804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1248804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1258804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1268804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1278804Smckusick 0, 3, 0, 0, 0, 3, 3, 0, 1288804Smckusick 0, 0, 3, 3, 0, 3, 3, 3, 1298804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1308804Smckusick 1, 1, 0, 0, 3, 3, 3, 3, 1318804Smckusick 0, 1, 1, 1, 1, 1, 1, 1, 1328804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1338804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1348804Smckusick 1, 1, 1, 0, 0, 0, 3, 1, 1358804Smckusick 0, 1, 1, 1, 1, 1, 1, 1, 1368804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1378804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1388804Smckusick 1, 1, 1, 0, 3, 0, 3, 0 1398804Smckusick }; 1408804Smckusick 1418804Smckusick 1428804Smckusick 1438804Smckusick 14424455Smckusick int 14524455Smckusick lexi() 14624455Smckusick { 14724455Smckusick register char *tok; /* local pointer to next char in token */ 14824455Smckusick int unary_delim; /* this is set to 1 if the current token 14924455Smckusick * 15024455Smckusick * forces a following operator to be unary */ 15124455Smckusick static int last_code; /* the last token type returned */ 15224455Smckusick static int l_struct; /* set to 1 if the last token was 'struct' */ 15324455Smckusick int code; /* internal code to be returned */ 15424455Smckusick char qchar; /* the delimiter character for a string */ 1558804Smckusick 15624455Smckusick tok = token; /* point to start of place to save token */ 1578804Smckusick unary_delim = false; 15824455Smckusick ps.col_1 = ps.last_nl; /* tell world that this token started in 15924455Smckusick * column 1 iff the last thing scanned was 16024455Smckusick * nl */ 16124455Smckusick ps.last_nl = false; 1628804Smckusick 16324455Smckusick while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 16424455Smckusick ps.col_1 = false; /* leading blanks imply token is not in 16524455Smckusick * column 1 */ 1668804Smckusick if (++buf_ptr >= buf_end) 16724455Smckusick fill_buffer(); 1688804Smckusick } 1698804Smckusick 170*24649Smckusick /* Scan an alphanumeric token. Note that we must also handle 171*24649Smckusick * stuff like "1.0e+03" and "7e-6". */ 17224455Smckusick if (chartype[*buf_ptr & 0177] == alphanum) { /* we have a character 17324455Smckusick * or number */ 17424455Smckusick register char *j; /* used for searching thru list of 17524455Smckusick * reserved words */ 17624455Smckusick register struct templ *p; 177*24649Smckusick register int c; 1788804Smckusick 179*24649Smckusick do { /* copy it over */ 1808804Smckusick *tok++ = *buf_ptr++; 1818804Smckusick if (buf_ptr >= buf_end) 18224455Smckusick fill_buffer(); 183*24649Smckusick } while (chartype[c = *buf_ptr & 0177] == alphanum || 184*24649Smckusick isdigit(token[0]) && (c == '+' || c == '-') && 185*24649Smckusick (tok[-1] == 'e' || tok[-1] == 'E')); 1868804Smckusick *tok++ = '\0'; 18724455Smckusick while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 18824455Smckusick if (++buf_ptr >= buf_end) 18924455Smckusick fill_buffer(); 19024455Smckusick } 19124455Smckusick ps.its_a_keyword = false; 19224455Smckusick ps.sizeof_keyword = false; 19324455Smckusick if (l_struct) { /* if last token was 'struct', then this 19424455Smckusick * token should be treated as a 19524455Smckusick * declaration */ 1968804Smckusick l_struct = false; 1978804Smckusick last_code = ident; 19824455Smckusick ps.last_u_d = true; 1998804Smckusick return (decl); 2008804Smckusick } 20124455Smckusick ps.last_u_d = false; /* Operator after indentifier is binary */ 20224455Smckusick last_code = ident; /* Remember that this is the code we will 20324455Smckusick * return */ 2048804Smckusick 20524455Smckusick /* 20624455Smckusick * This loop will check if the token is a keyword. 20724455Smckusick */ 20824455Smckusick for (p = specials; (j = p->rwd) != 0; p++) { 20924455Smckusick tok = token; /* point at scanned token */ 21024455Smckusick if (*j++ != *tok++ || *j++ != *tok++) 21124455Smckusick continue; /* This test depends on the fact that 21224455Smckusick * identifiers are always at least 1 21324455Smckusick * character long (ie. the first two bytes 21424455Smckusick * of the identifier are always 21524455Smckusick * meaningful) */ 21624455Smckusick if (tok[-1] == 0) 21724455Smckusick break; /* If its a one-character identifier */ 21824455Smckusick while (*tok++ == *j) 21924455Smckusick if (*j++ == 0) 22024455Smckusick goto found_keyword; /* I wish that C had a multi-level 22124455Smckusick * break... */ 22224455Smckusick } 22324455Smckusick if (p->rwd) { /* we have a keyword */ 22424455Smckusick found_keyword: 22524455Smckusick ps.its_a_keyword = true; 22624455Smckusick ps.last_u_d = true; 22724455Smckusick switch (p->rwcode) { 22824455Smckusick case 1: /* it is a switch */ 22924455Smckusick return (swstmt); 23024455Smckusick case 2: /* a case or default */ 23124455Smckusick return (casestmt); 2328804Smckusick 23324455Smckusick case 3: /* a "struct" */ 23424455Smckusick if (ps.p_l_follow) 23524455Smckusick break; /* inside parens: cast */ 23624455Smckusick l_struct = true; 2378804Smckusick 23824455Smckusick /* 23924455Smckusick * Next time around, we will want to know that we have 24024455Smckusick * had a 'struct' 24124455Smckusick */ 24224455Smckusick case 4: /* one of the declaration keywords */ 24324455Smckusick if (ps.p_l_follow) { 24424455Smckusick ps.cast_mask |= 1 << ps.p_l_follow; 24524455Smckusick break; /* inside parens: cast */ 24624455Smckusick } 24724455Smckusick last_code = decl; 24824455Smckusick return (decl); 2498804Smckusick 25024455Smckusick case 5: /* if, while, for */ 25124455Smckusick return (sp_paren); 2528804Smckusick 25324455Smckusick case 6: /* do, else */ 25424455Smckusick return (sp_nparen); 2558804Smckusick 25624455Smckusick case 7: 25724455Smckusick ps.sizeof_keyword = true; 25824455Smckusick default: /* all others are treated like any other 25924455Smckusick * identifier */ 26024455Smckusick return (ident); 26124455Smckusick } /* end of switch */ 26224455Smckusick } /* end of if (found_it) */ 26324455Smckusick if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 26424455Smckusick && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) { 26524455Smckusick strncpy(ps.procname, token, sizeof ps.procname - 1); 26624455Smckusick ps.in_parameter_declaration = 1; 26724455Smckusick } 2688804Smckusick 26924455Smckusick /* 27024455Smckusick * The following hack attempts to guess whether or not the current 27124455Smckusick * token is in fact a declaration keyword -- one that has been 27224455Smckusick * typedefd 27324455Smckusick */ 27424455Smckusick if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr)) 27524455Smckusick && !ps.p_l_follow 27624455Smckusick && (ps.last_token == rparen || ps.last_token == semicolon || 27724455Smckusick ps.last_token == decl || 27824455Smckusick ps.last_token == lbrace || ps.last_token == rbrace)) { 27924455Smckusick ps.its_a_keyword = true; 28024455Smckusick ps.last_u_d = true; 28124455Smckusick last_code = decl; 28224455Smckusick return decl; 2838804Smckusick } 28424455Smckusick if (last_code == decl) /* if this is a declared variable, then 28524455Smckusick * following sign is unary */ 28624455Smckusick ps.last_u_d = true; /* will make "int a -1" work */ 2878804Smckusick last_code = ident; 28824455Smckusick return (ident); /* the ident is not in the list */ 28924455Smckusick } /* end of procesing for alpanum character */ 290*24649Smckusick /* Scan a non-alphanumeric token */ 2918804Smckusick 29224455Smckusick *tok++ = *buf_ptr; /* if it is only a one-character token, it 29324455Smckusick * is moved here */ 2948804Smckusick *tok = '\0'; 2958804Smckusick if (++buf_ptr >= buf_end) 29624455Smckusick fill_buffer(); 2978804Smckusick 2988804Smckusick switch (*token) { 29924455Smckusick case '\n': 30024455Smckusick unary_delim = ps.last_u_d; 30124455Smckusick ps.last_nl = true; /* remember that we just had a newline */ 3028804Smckusick code = (had_eof ? 0 : newline); 30324455Smckusick 30424455Smckusick /* 30524455Smckusick * if data has been exausted, the newline is a dummy, and we 30624455Smckusick * should return code to stop 30724455Smckusick */ 3088804Smckusick break; 3098804Smckusick 31024455Smckusick case '\'': /* start of quoted character */ 31124455Smckusick case '"': /* start of string */ 31224455Smckusick qchar = *token; 31324455Smckusick if (troff) { 31424455Smckusick tok[-1] = '`'; 31524455Smckusick if (qchar == '"') 31624455Smckusick *tok++ = '`'; 31724455Smckusick *tok++ = BACKSLASH; 31824455Smckusick *tok++ = 'f'; 31924455Smckusick *tok++ = 'L'; 32024455Smckusick } 32124455Smckusick do { /* copy the string */ 32224455Smckusick while (1) { /* move one character or [/<char>]<char> */ 3238804Smckusick if (*buf_ptr == '\n') { 32424455Smckusick printf("%d: Unterminated literal\n", line_no); 3258804Smckusick goto stop_lit; 3268804Smckusick } 3278804Smckusick *tok = *buf_ptr++; 3288804Smckusick if (buf_ptr >= buf_end) 32924455Smckusick fill_buffer(); 3308804Smckusick if (had_eof || ((tok - token) > (bufsize - 2))) { 33124455Smckusick printf("Unterminated literal\n"); 3328804Smckusick ++tok; 3338804Smckusick goto stop_lit; 33424455Smckusick /* get outof literal copying loop */ 3358804Smckusick } 33624455Smckusick if (*tok == BACKSLASH) { /* if escape, copy extra 33724455Smckusick * char */ 33824455Smckusick if (*buf_ptr == '\n') /* check for escaped 33924455Smckusick * newline */ 3408804Smckusick ++line_no; 34124455Smckusick if (troff) { 34224455Smckusick *++tok = BACKSLASH; 34324455Smckusick if (*buf_ptr == BACKSLASH) 34424455Smckusick *++tok = BACKSLASH; 34524455Smckusick } 34624455Smckusick *++tok = *buf_ptr++; 34724455Smckusick ++tok; /* we must increment this again because we 34824455Smckusick * copied two chars */ 3498804Smckusick if (buf_ptr >= buf_end) 35024455Smckusick fill_buffer(); 3518804Smckusick } 3528804Smckusick else 35324455Smckusick break; /* we copied one character */ 35424455Smckusick } /* end of while (1) */ 3558804Smckusick } while (*tok++ != qchar); 35624455Smckusick if (troff) { 35724455Smckusick tok[-1] = BACKSLASH; 35824455Smckusick *tok++ = 'f'; 35924455Smckusick *tok++ = 'R'; 36024455Smckusick *tok++ = '\''; 36124455Smckusick if (qchar == '"') 36224455Smckusick *tok++ = '\''; 36324455Smckusick } 36424455Smckusick stop_lit: 3658804Smckusick code = ident; 3668804Smckusick break; 3678804Smckusick 36824455Smckusick case ('('): 36924455Smckusick case ('['): 3708804Smckusick unary_delim = true; 3718804Smckusick code = lparen; 3728804Smckusick break; 3738804Smckusick 37424455Smckusick case (')'): 37524455Smckusick case (']'): 3768804Smckusick code = rparen; 3778804Smckusick break; 3788804Smckusick 37924455Smckusick case '#': 38024455Smckusick unary_delim = ps.last_u_d; 3818804Smckusick code = preesc; 3828804Smckusick break; 3838804Smckusick 38424455Smckusick case '?': 3858804Smckusick unary_delim = true; 3868804Smckusick code = question; 3878804Smckusick break; 3888804Smckusick 38924455Smckusick case (':'): 3908804Smckusick code = colon; 3918804Smckusick unary_delim = true; 3928804Smckusick break; 3938804Smckusick 39424455Smckusick case (';'): 3958804Smckusick unary_delim = true; 3968804Smckusick code = semicolon; 3978804Smckusick break; 3988804Smckusick 39924455Smckusick case ('{'): 4008804Smckusick unary_delim = true; 40124455Smckusick 40224455Smckusick /* 40324455Smckusick * if (ps.in_or_st) ps.block_init = 1; 40424455Smckusick */ 40524455Smckusick code = ps.block_init ? lparen : lbrace; 4068804Smckusick break; 4078804Smckusick 40824455Smckusick case ('}'): 4098804Smckusick unary_delim = true; 41024455Smckusick code = ps.block_init ? rparen : rbrace; 4118804Smckusick break; 4128804Smckusick 41324455Smckusick case 014: /* a form feed */ 41424455Smckusick unary_delim = ps.last_u_d; 41524455Smckusick ps.last_nl = true; /* remember this so we can set 'ps.col_1' 41624455Smckusick * right */ 4178804Smckusick code = form_feed; 4188804Smckusick break; 4198804Smckusick 42024455Smckusick case (','): 4218804Smckusick unary_delim = true; 4228804Smckusick code = comma; 4238804Smckusick break; 4248804Smckusick 42524455Smckusick case '.': 4268804Smckusick unary_delim = false; 4278804Smckusick code = period; 4288804Smckusick break; 4298804Smckusick 43024455Smckusick case '-': 43124455Smckusick case '+': /* check for -, +, --, ++ */ 43224455Smckusick code = (ps.last_u_d ? unary_op : binary_op); 4338804Smckusick unary_delim = true; 4348804Smckusick 4358804Smckusick if (*buf_ptr == token[0]) { 43624455Smckusick /* check for doubled character */ 4378804Smckusick *tok++ = *buf_ptr++; 43824455Smckusick /* buffer overflow will be checked at end of loop */ 4398804Smckusick if (last_code == ident || last_code == rparen) { 44024455Smckusick code = (ps.last_u_d ? unary_op : postop); 44124455Smckusick /* check for following ++ or -- */ 4428804Smckusick unary_delim = false; 4438804Smckusick } 4448804Smckusick } 44524455Smckusick else if (*buf_ptr == '=') 44624455Smckusick /* check for operator += */ 44724455Smckusick *tok++ = *buf_ptr++; 44824455Smckusick else if (*buf_ptr == '>') { 44924455Smckusick /* check for operator -> */ 45024455Smckusick *tok++ = *buf_ptr++; 45124455Smckusick code = unary_op; 45224455Smckusick unary_delim = false; 45324455Smckusick ps.want_blank = false; 45424455Smckusick } 45524455Smckusick /* buffer overflow will be checked at end of switch */ 4568804Smckusick 4578804Smckusick break; 4588804Smckusick 45924455Smckusick case '=': 46024455Smckusick if (ps.in_or_st) 46124455Smckusick ps.block_init = 1; 46224455Smckusick if (chartype[*buf_ptr] == opchar) { /* we have two char 46324455Smckusick * assignment */ 46424455Smckusick tok[-1] = *buf_ptr++; 46524455Smckusick if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr) 46624455Smckusick *tok++ = *buf_ptr++; 46724455Smckusick *tok++ = '='; /* Flip =+ to += */ 46824455Smckusick *tok = 0; 4698804Smckusick } 4708804Smckusick code = binary_op; 4718804Smckusick unary_delim = true; 47224455Smckusick break; 47324455Smckusick /* can drop thru!!! */ 4748804Smckusick 47524455Smckusick case '>': 47624455Smckusick case '<': 47724455Smckusick case '!': /* ops like <, <<, <=, !=, etc */ 4788804Smckusick if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 4798804Smckusick *tok++ = *buf_ptr; 4808804Smckusick if (++buf_ptr >= buf_end) 48124455Smckusick fill_buffer(); 4828804Smckusick } 4838804Smckusick if (*buf_ptr == '=') 48424455Smckusick *tok++ = *buf_ptr++; 48524455Smckusick code = (ps.last_u_d ? unary_op : binary_op); 4868804Smckusick unary_delim = true; 4878804Smckusick break; 4888804Smckusick 48924455Smckusick default: 4908804Smckusick if (token[0] == '/' && *buf_ptr == '*') { 49124455Smckusick /* it is start of comment */ 4928804Smckusick *tok++ = '*'; 4938804Smckusick 4948804Smckusick if (++buf_ptr >= buf_end) 49524455Smckusick fill_buffer(); 4968804Smckusick 4978804Smckusick code = comment; 49824455Smckusick unary_delim = ps.last_u_d; 4998804Smckusick break; 5008804Smckusick } 50124455Smckusick while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') { 50224455Smckusick /* handle ||, &&, etc, and also things as in int *****i */ 5038804Smckusick *tok++ = *buf_ptr; 5048804Smckusick if (++buf_ptr >= buf_end) 50524455Smckusick fill_buffer(); 5068804Smckusick } 50724455Smckusick code = (ps.last_u_d ? unary_op : binary_op); 5088804Smckusick unary_delim = true; 5098804Smckusick 5108804Smckusick 51124455Smckusick } /* end of switch */ 5128804Smckusick if (code != newline) { 5138804Smckusick l_struct = false; 5148804Smckusick last_code = code; 5158804Smckusick } 51624455Smckusick if (buf_ptr >= buf_end) /* check for input buffer empty */ 51724455Smckusick fill_buffer(); 51824455Smckusick ps.last_u_d = unary_delim; 51924455Smckusick *tok = '\0'; /* null terminate the token */ 5208804Smckusick return (code); 5218804Smckusick }; 52224455Smckusick 52324455Smckusick /* Add the given keyword to the keyword table, using val as the keyword type 52424455Smckusick */ 52524455Smckusick addkey (key, val) 52624455Smckusick char *key; 52724455Smckusick { 52824455Smckusick register struct templ *p = specials; 52924455Smckusick while (p->rwd) 53024455Smckusick if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 53124455Smckusick return; 53224455Smckusick else 53324455Smckusick p++; 53424455Smckusick if (p >= specials + sizeof specials / sizeof specials[0]) 53524455Smckusick return; /* For now, table overflows are silently 53624455Smckusick ignored */ 53724455Smckusick p->rwd = key; 53824455Smckusick p->rwcode = val; 53924455Smckusick p[1].rwd = 0; 54024455Smckusick p[1].rwcode = 0; 54124455Smckusick return; 54224455Smckusick } 543