1*21970Sdist /* 2*21970Sdist * Copyright (c) 1980 Regents of the University of California. 3*21970Sdist * All rights reserved. The Berkeley software License Agreement 4*21970Sdist * specifies the terms and conditions for redistribution. 5*21970Sdist */ 68804Smckusick 7*21970Sdist #ifndef lint 8*21970Sdist static char sccsid[] = "@(#)lexi.c 5.1 (Berkeley) 06/04/85"; 9*21970Sdist #endif not lint 10*21970Sdist 118804Smckusick /* 128804Smckusick 138804Smckusick Copyright (C) 1976 148804Smckusick by the 158804Smckusick Board of Trustees 168804Smckusick of the 178804Smckusick University of Illinois 188804Smckusick 198804Smckusick All rights reserved 208804Smckusick 218804Smckusick 228804Smckusick NAME: 238804Smckusick lexi 248804Smckusick 258804Smckusick FUNCTION: 268804Smckusick This is the token scanner for indent 278804Smckusick 288804Smckusick ALGORITHM: 298804Smckusick 1) Strip off intervening blanks and/or tabs. 308804Smckusick 2) If it is an alphanumeric token, move it to the token buffer "token". 318804Smckusick Check if it is a special reserved word that indent will want to 328804Smckusick know about. 338804Smckusick 3) Non-alphanumeric tokens are handled with a big switch statement. A 348804Smckusick flag is kept to remember if the last token was a "unary delimiter", 358804Smckusick which forces a following operator to be unary as opposed to binary. 368804Smckusick 378804Smckusick PARAMETERS: 388804Smckusick None 398804Smckusick 408804Smckusick RETURNS: 418804Smckusick An integer code indicating the type of token scanned. 428804Smckusick 438804Smckusick GLOBALS: 448804Smckusick buf_ptr = 458804Smckusick had_eof 468804Smckusick last_u_d = Set to true iff this token is a "unary delimiter" 478804Smckusick 488804Smckusick CALLS: 498804Smckusick fill_buffer 508804Smckusick printf (lib) 518804Smckusick 528804Smckusick CALLED BY: 538804Smckusick main 548804Smckusick 558804Smckusick NOTES: 568804Smckusick Start of comment is passed back so that the comment can be scanned by 578804Smckusick pr_comment. 588804Smckusick 598804Smckusick Strings and character literals are returned just like identifiers. 608804Smckusick 618804Smckusick HISTORY: 628804Smckusick initial coding November 1976 D A Willcox of CAC 638804Smckusick 1/7/77 D A Willcox of CAC Fix to provide proper handling 648804Smckusick of "int a -1;" 658804Smckusick 668804Smckusick */ 678804Smckusick 688804Smckusick /* Here we have the token scanner for indent. It scans off one token and 698804Smckusick puts it in the global variable "token". It returns a code, indicating the 708804Smckusick type of token scanned. */ 718804Smckusick 728804Smckusick #include "indent_globs.h"; 738804Smckusick #include "indent_codes.h"; 748804Smckusick 758804Smckusick 768804Smckusick 778804Smckusick #define alphanum 1 788804Smckusick #define opchar 3 798804Smckusick 808804Smckusick struct templ { 818804Smckusick char *rwd; 828804Smckusick int rwcode; 838804Smckusick }; 848804Smckusick 858804Smckusick struct templ specials[] = 868804Smckusick { 878804Smckusick "switch", 1, 888804Smckusick "case", 2, 898804Smckusick "struct", 3, 908804Smckusick "default", 2, 918804Smckusick "int", 4, 928804Smckusick "char", 4, 938804Smckusick "float", 4, 948804Smckusick "double", 4, 958804Smckusick "long", 4, 968804Smckusick "short", 4, 978804Smckusick "typdef", 4, 988804Smckusick "unsigned", 4, 998804Smckusick "register", 4, 1008804Smckusick "static", 4, 1018804Smckusick "global", 4, 1028804Smckusick "extern", 4, 1038804Smckusick "if", 5, 1048804Smckusick "while", 5, 1058804Smckusick "for", 5, 1068804Smckusick "else", 6, 1078804Smckusick "do", 6, 1088804Smckusick "sizeof", 0, 1098804Smckusick 0, 0 1108804Smckusick }; 1118804Smckusick 1128804Smckusick char chartype[128] = 1138804Smckusick { /* this is used to facilitate the decision of what type 1148804Smckusick (alphanumeric, operator) each character is */ 1158804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1168804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1178804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1188804Smckusick 0, 0, 0, 0, 0, 0, 0, 0, 1198804Smckusick 0, 3, 0, 0, 0, 3, 3, 0, 1208804Smckusick 0, 0, 3, 3, 0, 3, 3, 3, 1218804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1228804Smckusick 1, 1, 0, 0, 3, 3, 3, 3, 1238804Smckusick 0, 1, 1, 1, 1, 1, 1, 1, 1248804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1258804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1268804Smckusick 1, 1, 1, 0, 0, 0, 3, 1, 1278804Smckusick 0, 1, 1, 1, 1, 1, 1, 1, 1288804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1298804Smckusick 1, 1, 1, 1, 1, 1, 1, 1, 1308804Smckusick 1, 1, 1, 0, 3, 0, 3, 0 1318804Smckusick }; 1328804Smckusick 1338804Smckusick int last_nl = true; 1348804Smckusick /* this is true if the last thing scanned was a newline */ 1358804Smckusick 1368804Smckusick 1378804Smckusick 1388804Smckusick int lexi () { 1398804Smckusick register char *tok; 1408804Smckusick /* local pointer to next char in token */ 1418804Smckusick register int i; 1428804Smckusick /* local loop counter */ 1438804Smckusick register char *j; 1448804Smckusick /* used for searching thru list of reserved words */ 1458804Smckusick int unary_delim; 1468804Smckusick /* this is set to 1 if the current token forces a following operator to be 1478804Smckusick unary */ 1488804Smckusick static int last_code; 1498804Smckusick /* the last token type returned */ 1508804Smckusick static int l_struct; 1518804Smckusick /* set to 1 if the last token was 'struct' */ 1528804Smckusick int found_it; 1538804Smckusick int code; /* internal code to be returned */ 1548804Smckusick char qchar; /* the delimiter character for a string */ 1558804Smckusick 1568804Smckusick tok = token; /* point to start of place to save token */ 1578804Smckusick unary_delim = false; 1588804Smckusick col_1 = last_nl; /* tell world that this token started in column 1598804Smckusick 1 iff the last thing scanned was nl */ 1608804Smckusick last_nl = false; 1618804Smckusick 1628804Smckusick while (*buf_ptr == ' ' || *buf_ptr == '\t') { 1638804Smckusick /* get rid of blanks */ 1648804Smckusick col_1 = false; /* leading blanks imply token is not in column 1 1658804Smckusick */ 1668804Smckusick if (++buf_ptr >= buf_end) 1678804Smckusick fill_buffer (); 1688804Smckusick } 1698804Smckusick 1708804Smckusick /*----------------------------------------------------------*\ 1718804Smckusick | Scan an alphanumeric token 1728804Smckusick \*----------------------------------------------------------*/ 1738804Smckusick 1748804Smckusick if (chartype[*buf_ptr & 0177] == alphanum) { 1758804Smckusick /* we have a character or number */ 1768804Smckusick while (chartype[*buf_ptr & 0177] == alphanum) { 1778804Smckusick /* copy it over */ 1788804Smckusick *tok++ = *buf_ptr++; 1798804Smckusick if (buf_ptr >= buf_end) 1808804Smckusick fill_buffer (); 1818804Smckusick } 1828804Smckusick 1838804Smckusick *tok++ = '\0'; 1848804Smckusick 1858804Smckusick if (l_struct) { /* if last token was 'struct', then this token 1868804Smckusick should be treated as a declaration */ 1878804Smckusick l_struct = false; 1888804Smckusick last_code = ident; 1898804Smckusick last_u_d = true; 1908804Smckusick return (decl); 1918804Smckusick } 1928804Smckusick 1938804Smckusick last_u_d = false; /* operator after indentifier is binary */ 1948804Smckusick 1958804Smckusick for (i = 0; specials[i].rwd != 0; ++i) { 1968804Smckusick /* this loop will check if the token is a keyword. if so, a following 1978804Smckusick operator is unary */ 1988804Smckusick last_code = ident; /* remember that this is the code we will return 1998804Smckusick */ 2008804Smckusick j = specials[i].rwd; 2018804Smckusick /* point at ith reserved word */ 2028804Smckusick tok = token; /* point at scanned toekn */ 2038804Smckusick found_it = true; /* set to false if not found */ 2048804Smckusick do { 2058804Smckusick if (*tok++ != *j) { 2068804Smckusick found_it = false; 2078804Smckusick break; 2088804Smckusick } 2098804Smckusick } while (*j++); 2108804Smckusick 2118804Smckusick if (found_it) { /* we have a keyword */ 2128804Smckusick last_u_d = true; 2138804Smckusick switch (specials[i].rwcode) { 2148804Smckusick case 1: /* it is a switch */ 2158804Smckusick return (swstmt); 2168804Smckusick case 2: /* a case or default */ 2178804Smckusick return (casestmt); 2188804Smckusick 2198804Smckusick case 3: /* a "struct" */ 2208804Smckusick l_struct = true; 2218804Smckusick /* Next time around, we will want to know that we have had 2228804Smckusick a 'struct' */ 2238804Smckusick case 4: /* one of the declaration keywords */ 2248804Smckusick if(p_l_follow) break; /* inside parens: cast */ 2258804Smckusick last_code = decl; 2268804Smckusick return (decl); 2278804Smckusick 2288804Smckusick case 5: /* if, while, for */ 2298804Smckusick return (sp_paren); 2308804Smckusick 2318804Smckusick case 6: /* do, else */ 2328804Smckusick return (sp_nparen); 2338804Smckusick 2348804Smckusick default: /* all others are treated like any other 2358804Smckusick identifier */ 2368804Smckusick return (ident); 2378804Smckusick } /* end of switch */ 2388804Smckusick } /* end of if (found_it) */ 2398804Smckusick 2408804Smckusick } 2418804Smckusick 2428804Smckusick if (last_code == decl) /* if this is a declared variable, then 2438804Smckusick following sign is unary */ 2448804Smckusick last_u_d = true; /* will make "int a -1" work */ 2458804Smckusick last_code = ident; 2468804Smckusick return (ident); /* the ident is not in the list */ 2478804Smckusick } /* end of procesing for alpanum character */ 2488804Smckusick 2498804Smckusick 2508804Smckusick 2518804Smckusick /*----------------------------------------------------------*\ 2528804Smckusick | Scan a non-alphanumeric token 2538804Smckusick \*----------------------------------------------------------*/ 2548804Smckusick 2558804Smckusick *tok++ = *buf_ptr; /* if it is only a one-character token, it is 2568804Smckusick moved here */ 2578804Smckusick *tok = '\0'; 2588804Smckusick if (++buf_ptr >= buf_end) 2598804Smckusick fill_buffer (); 2608804Smckusick 2618804Smckusick switch (*token) { 2628804Smckusick case '\n': 2638804Smckusick unary_delim = last_u_d; 2648804Smckusick last_nl = true; /* remember that we just had a newline */ 2658804Smckusick code = (had_eof ? 0 : newline); 2668804Smckusick /* if data has been exausted, the newline is a dummy, and we should 2678804Smckusick return code to stop */ 2688804Smckusick break; 2698804Smckusick 2708804Smckusick case '\'': /* start of quoted character */ 2718804Smckusick qchar = '\''; /* remember final delimiter */ 2728804Smckusick goto copy_lit; /* and go to common literal code */ 2738804Smckusick 2748804Smckusick case '"': /* start of string */ 2758804Smckusick qchar = '"'; 2768804Smckusick 2778804Smckusick copy_lit: 2788804Smckusick do { /* copy the string */ 2798804Smckusick while (1) { /* move one character or [/<char>]<char> */ 2808804Smckusick if (*buf_ptr == '\n') { 2818804Smckusick /* check for unterminated literal */ 2828804Smckusick printf ("%d: Unterminated literal\n", line_no); 2838804Smckusick goto stop_lit; 2848804Smckusick /* Don't copy any more */ 2858804Smckusick } 2868804Smckusick 2878804Smckusick *tok = *buf_ptr++; 2888804Smckusick if (buf_ptr >= buf_end) 2898804Smckusick fill_buffer (); 2908804Smckusick if (had_eof || ((tok - token) > (bufsize - 2))) { 2918804Smckusick printf ("Unterminated literal\n"); 2928804Smckusick ++tok; 2938804Smckusick goto stop_lit; 2948804Smckusick /* get outof literal copying loop */ 2958804Smckusick } 2968804Smckusick 2978804Smckusick if (*tok == '\\') { 2988804Smckusick /* if escape, copy extra char */ 2998804Smckusick if (*buf_ptr == '\n') 3008804Smckusick /* check for escaped newline */ 3018804Smckusick ++line_no; 3028804Smckusick *(++tok) = *buf_ptr++; 3038804Smckusick ++tok; /* we must increment this again because we 3048804Smckusick copied two chars */ 3058804Smckusick if (buf_ptr >= buf_end) 3068804Smckusick fill_buffer (); 3078804Smckusick } 3088804Smckusick else 3098804Smckusick break; /* we copied one character */ 3108804Smckusick } /* end of while (1) */ 3118804Smckusick } while (*tok++ != qchar); 3128804Smckusick 3138804Smckusick stop_lit: 3148804Smckusick code = ident; 3158804Smckusick break; 3168804Smckusick 3178804Smckusick case ('('): 3188804Smckusick case ('['): 3198804Smckusick unary_delim = true; 3208804Smckusick code = lparen; 3218804Smckusick break; 3228804Smckusick 3238804Smckusick case (')'): 3248804Smckusick case (']'): 3258804Smckusick code = rparen; 3268804Smckusick break; 3278804Smckusick 3288804Smckusick case '#': 3298804Smckusick unary_delim = last_u_d; 3308804Smckusick code = preesc; 3318804Smckusick break; 3328804Smckusick 3338804Smckusick case '?': 3348804Smckusick unary_delim = true; 3358804Smckusick code = question; 3368804Smckusick break; 3378804Smckusick 3388804Smckusick case (':'): 3398804Smckusick code = colon; 3408804Smckusick unary_delim = true; 3418804Smckusick break; 3428804Smckusick 3438804Smckusick case (';'): 3448804Smckusick unary_delim = true; 3458804Smckusick code = semicolon; 3468804Smckusick break; 3478804Smckusick 3488804Smckusick case ('{'): 3498804Smckusick unary_delim = true; 3508804Smckusick code = lbrace; 3518804Smckusick break; 3528804Smckusick 3538804Smckusick case ('}'): 3548804Smckusick unary_delim = true; 3558804Smckusick code = rbrace; 3568804Smckusick break; 3578804Smckusick 3588804Smckusick case 014: /* a form feed */ 3598804Smckusick unary_delim = last_u_d; 3608804Smckusick last_nl = true; /* remember this so we can set 'col_1' right */ 3618804Smckusick code = form_feed; 3628804Smckusick break; 3638804Smckusick 3648804Smckusick case (','): 3658804Smckusick unary_delim = true; 3668804Smckusick code = comma; 3678804Smckusick break; 3688804Smckusick 3698804Smckusick case '.': 3708804Smckusick unary_delim = false; 3718804Smckusick code = period; 3728804Smckusick break; 3738804Smckusick 3748804Smckusick case '-': 3758804Smckusick case '+': /* check for -, +, --, ++ */ 3768804Smckusick code = (last_u_d ? unary_op : binary_op); 3778804Smckusick unary_delim = true; 3788804Smckusick 3798804Smckusick if (*buf_ptr == token[0]) { 3808804Smckusick /* check for doubled character */ 3818804Smckusick *tok++ = *buf_ptr++; 3828804Smckusick /* buffer overflow will be checked at end of loop */ 3838804Smckusick if (last_code == ident || last_code == rparen) { 3848804Smckusick code = (last_u_d ? unary_op : postop); 3858804Smckusick /* check for following ++ or -- */ 3868804Smckusick unary_delim = false; 3878804Smckusick } 3888804Smckusick } 3898804Smckusick else 3908804Smckusick if (*buf_ptr == '>' || *buf_ptr == '=') 3918804Smckusick /* check for operator -> or += */ 3928804Smckusick *tok++ = *buf_ptr++; 3938804Smckusick /* buffer overflow will be checked at end of switch */ 3948804Smckusick 3958804Smckusick break; 3968804Smckusick 3978804Smckusick case '=': 3988804Smckusick if (chartype[*buf_ptr] == opchar) { 3998804Smckusick /* we have two char assignment */ 4008804Smckusick *tok++ = *buf_ptr; 4018804Smckusick /* move second character */ 4028804Smckusick if (++buf_ptr >= buf_end) 4038804Smckusick fill_buffer (); 4048804Smckusick } 4058804Smckusick 4068804Smckusick code = binary_op; 4078804Smckusick unary_delim = true; 4088804Smckusick if (token[1] != '<' && token[1] != '>') 4098804Smckusick /* check for possible 3 char operator */ 4108804Smckusick break; 4118804Smckusick /* can drop thru!!! */ 4128804Smckusick 4138804Smckusick case '>': 4148804Smckusick case '<': 4158804Smckusick case '!': /* ops like <, <<, <=, !=, etc */ 4168804Smckusick if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 4178804Smckusick *tok++ = *buf_ptr; 4188804Smckusick if (++buf_ptr >= buf_end) 4198804Smckusick fill_buffer (); 4208804Smckusick } 4218804Smckusick 4228804Smckusick if (*buf_ptr == '=') 4238804Smckusick *tok++ = *buf_ptr++; 4248804Smckusick code = (last_u_d ? unary_op : binary_op); 4258804Smckusick unary_delim = true; 4268804Smckusick break; 4278804Smckusick 4288804Smckusick default: 4298804Smckusick if (token[0] == '/' && *buf_ptr == '*') { 4308804Smckusick /* it is start of comment */ 4318804Smckusick *tok++ = '*'; 4328804Smckusick 4338804Smckusick if (++buf_ptr >= buf_end) 4348804Smckusick fill_buffer (); 4358804Smckusick 4368804Smckusick code = comment; 4378804Smckusick unary_delim = last_u_d; 4388804Smckusick break; 4398804Smckusick } 4408804Smckusick 4418804Smckusick while (*(tok - 1) == *buf_ptr || *buf_ptr=='=') { 4428804Smckusick /* handle ||, &&, etc, and also things as in int *****i */ 4438804Smckusick *tok++ = *buf_ptr; 4448804Smckusick if (++buf_ptr >= buf_end) 4458804Smckusick fill_buffer (); 4468804Smckusick } 4478804Smckusick 4488804Smckusick 4498804Smckusick code = (last_u_d ? unary_op : binary_op); 4508804Smckusick unary_delim = true; 4518804Smckusick 4528804Smckusick 4538804Smckusick } /* end of switch */ 4548804Smckusick 4558804Smckusick if (code != newline) { 4568804Smckusick l_struct = false; 4578804Smckusick last_code = code; 4588804Smckusick } 4598804Smckusick 4608804Smckusick if (buf_ptr >= buf_end) /* check for input buffer empty */ 4618804Smckusick fill_buffer (); 4628804Smckusick last_u_d = unary_delim; 4638804Smckusick *tok = '\0'; /* null terminate the token */ 4648804Smckusick return (code); 4658804Smckusick }; 466