xref: /csrg-svn/usr.bin/indent/lexi.c (revision 21970)
1*21970Sdist /*
2*21970Sdist  * Copyright (c) 1980 Regents of the University of California.
3*21970Sdist  * All rights reserved.  The Berkeley software License Agreement
4*21970Sdist  * specifies the terms and conditions for redistribution.
5*21970Sdist  */
68804Smckusick 
7*21970Sdist #ifndef lint
8*21970Sdist static char sccsid[] = "@(#)lexi.c	5.1 (Berkeley) 06/04/85";
9*21970Sdist #endif not lint
10*21970Sdist 
118804Smckusick /*
128804Smckusick 
138804Smckusick 			  Copyright (C) 1976
148804Smckusick 				by the
158804Smckusick 			  Board of Trustees
168804Smckusick 				of the
178804Smckusick 			University of Illinois
188804Smckusick 
198804Smckusick 			 All rights reserved
208804Smckusick 
218804Smckusick 
228804Smckusick NAME:
238804Smckusick 	lexi
248804Smckusick 
258804Smckusick FUNCTION:
268804Smckusick 	This is the token scanner for indent
278804Smckusick 
288804Smckusick ALGORITHM:
298804Smckusick 	1) Strip off intervening blanks and/or tabs.
308804Smckusick 	2) If it is an alphanumeric token, move it to the token buffer "token".
318804Smckusick 	   Check if it is a special reserved word that indent will want to
328804Smckusick 	   know about.
338804Smckusick 	3) Non-alphanumeric tokens are handled with a big switch statement.  A
348804Smckusick 	   flag is kept to remember if the last token was a "unary delimiter",
358804Smckusick 	   which forces a following operator to be unary as opposed to binary.
368804Smckusick 
378804Smckusick PARAMETERS:
388804Smckusick 	None
398804Smckusick 
408804Smckusick RETURNS:
418804Smckusick 	An integer code indicating the type of token scanned.
428804Smckusick 
438804Smckusick GLOBALS:
	buf_ptr =	Advanced past the token that was scanned
	had_eof		Read to tell whether a newline is the dummy one that
			marks the end of the input
	last_u_d =	Set to true iff this token is a "unary delimiter"
478804Smckusick 
488804Smckusick CALLS:
498804Smckusick 	fill_buffer
508804Smckusick 	printf (lib)
518804Smckusick 
528804Smckusick CALLED BY:
538804Smckusick 	main
548804Smckusick 
558804Smckusick NOTES:
568804Smckusick 	Start of comment is passed back so that the comment can be scanned by
578804Smckusick 	pr_comment.
588804Smckusick 
598804Smckusick 	Strings and character literals are returned just like identifiers.
608804Smckusick 
618804Smckusick HISTORY:
628804Smckusick 	initial coding 	November 1976	D A Willcox of CAC
638804Smckusick 	1/7/77		D A Willcox of CAC	Fix to provide proper handling
648804Smckusick 						of "int a -1;"
658804Smckusick 
668804Smckusick */
678804Smckusick 
688804Smckusick /* Here we have the token scanner for indent.  It scans off one token and
698804Smckusick    puts it in the global variable "token".  It returns a code, indicating the
708804Smckusick    type of token scanned. */
718804Smckusick 
728804Smckusick #include "indent_globs.h";
738804Smckusick #include "indent_codes.h";
748804Smckusick 
758804Smckusick 
768804Smckusick 
/* Character classes stored in chartype[]; an entry of 0 belongs to neither
   class. */
#define alphanum 1		/* letters, digits and underscore */
#define opchar 3		/* characters that can appear in operators */
798804Smckusick 
/*
 * One entry of the reserved word table searched by lexi().
 */
struct templ {
    char   *rwd;	       /* spelling of the reserved word */
    int     rwcode;	       /* class of the word, see specials[] below */
};

/*
 * Reserved words that the scanner treats specially.  The table is searched
 * linearly and ends at the entry whose rwd is 0.
 *
 * Meaning of rwcode, as dispatched on by lexi():
 *	1	switch
 *	2	case, default
 *	3	struct
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	0	recognised, but returned like an ordinary identifier
 *
 * "global" is not a standard C keyword; it is kept from the original table.
 */
struct templ    specials[] =
{
    {"switch", 1},
    {"case", 2},
    {"struct", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},	       /* was misspelled "typdef", so it never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 0},
    {0, 0}
};
1118804Smckusick 
/*
 * Classification of each 7-bit character code, indexed by the code itself:
 * 1 for characters that may appear in an alphanumeric token (letters, digits
 * and underscore), 3 for characters that may appear in an operator, and 0
 * for everything else.  Each row below covers sixteen codes; the octal range
 * is given on the right.
 */
char    chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000-017 control */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020-037 control */
    0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,	/* 040-057 punctuation */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,	/* 060-077 digits, : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 100-117 @, A-O */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,	/* 120-137 P-Z, brackets, ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 140-157 grave, a-o */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0	/* 160-177 p-z, braces, | ~ DEL */
};
1328804Smckusick 
/* True if the last thing scanned was a newline or a form feed.  lexi() copies
   this into col_1 at the start of each call and then clears it.  It starts
   out true. */
int     last_nl = true;
1358804Smckusick 
1368804Smckusick 
1378804Smckusick 
1388804Smckusick int     lexi () {
1398804Smckusick     register char  *tok;
1408804Smckusick  /* local pointer to next char in token */
1418804Smckusick     register int    i;
1428804Smckusick  /* local loop counter */
1438804Smckusick     register char  *j;
1448804Smckusick  /* used for searching thru list of reserved words */
1458804Smckusick     int     unary_delim;
1468804Smckusick  /* this is set to 1 if the current token forces a following operator to be
1478804Smckusick     unary */
1488804Smckusick     static int  last_code;
1498804Smckusick  /* the last token type returned */
1508804Smckusick     static int  l_struct;
1518804Smckusick  /* set to 1 if the last token was 'struct' */
1528804Smckusick     int     found_it;
1538804Smckusick     int     code;  /* internal code to be returned */
1548804Smckusick     char    qchar; /* the delimiter character for a string */
1558804Smckusick 
1568804Smckusick     tok = token;	       /* point to start of place to save token */
1578804Smckusick     unary_delim = false;
1588804Smckusick     col_1 = last_nl;	       /* tell world that this token started in column
1598804Smckusick 			          1 iff the last thing scanned was nl */
1608804Smckusick     last_nl = false;
1618804Smckusick 
1628804Smckusick     while (*buf_ptr == ' ' || *buf_ptr == '\t') {
1638804Smckusick     /* get rid of blanks */
1648804Smckusick 	col_1 = false;	       /* leading blanks imply token is not in column 1
1658804Smckusick 			          */
1668804Smckusick 	if (++buf_ptr >= buf_end)
1678804Smckusick 	    fill_buffer ();
1688804Smckusick     }
1698804Smckusick 
1708804Smckusick /*----------------------------------------------------------*\
1718804Smckusick |    Scan an alphanumeric token
1728804Smckusick \*----------------------------------------------------------*/
1738804Smckusick 
1748804Smckusick     if (chartype[*buf_ptr & 0177] == alphanum) {
1758804Smckusick     /* we have a character or number */
1768804Smckusick 	while (chartype[*buf_ptr & 0177] == alphanum) {
1778804Smckusick 	/* copy it over */
1788804Smckusick 	    *tok++ = *buf_ptr++;
1798804Smckusick 	    if (buf_ptr >= buf_end)
1808804Smckusick 		fill_buffer ();
1818804Smckusick 	}
1828804Smckusick 
1838804Smckusick 	*tok++ = '\0';
1848804Smckusick 
1858804Smckusick 	if (l_struct) {	       /* if last token was 'struct', then this token
1868804Smckusick 			          should be treated as a declaration */
1878804Smckusick 	    l_struct = false;
1888804Smckusick 	    last_code = ident;
1898804Smckusick 	    last_u_d = true;
1908804Smckusick 	    return (decl);
1918804Smckusick 	}
1928804Smckusick 
1938804Smckusick 	last_u_d = false;      /* operator after indentifier is binary */
1948804Smckusick 
1958804Smckusick 	for (i = 0; specials[i].rwd != 0; ++i) {
1968804Smckusick 	/* this loop will check if the token is a keyword.  if so, a following
1978804Smckusick 	   operator is unary */
1988804Smckusick 	    last_code = ident; /* remember that this is the code we will return
1998804Smckusick 			          */
2008804Smckusick 	    j = specials[i].rwd;
2018804Smckusick 	/* point at ith reserved word */
2028804Smckusick 	    tok = token;       /* point at scanned toekn */
2038804Smckusick 	    found_it = true;   /* set to false if not found */
2048804Smckusick 	    do {
2058804Smckusick 		if (*tok++ != *j) {
2068804Smckusick 		    found_it = false;
2078804Smckusick 		    break;
2088804Smckusick 		}
2098804Smckusick 	    } while (*j++);
2108804Smckusick 
2118804Smckusick 	    if (found_it) {    /* we have a keyword */
2128804Smckusick 		last_u_d = true;
2138804Smckusick 		switch (specials[i].rwcode) {
2148804Smckusick 		    case 1:    /* it is a switch */
2158804Smckusick 			return (swstmt);
2168804Smckusick 		    case 2:    /* a case or default */
2178804Smckusick 			return (casestmt);
2188804Smckusick 
2198804Smckusick 		    case 3:    /* a "struct" */
2208804Smckusick 			l_struct = true;
2218804Smckusick 		    /* Next time around, we will want to know that we have had
2228804Smckusick 		       a 'struct' */
2238804Smckusick 		    case 4:    /* one of the declaration keywords */
2248804Smckusick 			if(p_l_follow) break;	/* inside parens: cast */
2258804Smckusick 			last_code = decl;
2268804Smckusick 			return (decl);
2278804Smckusick 
2288804Smckusick 		    case 5:    /* if, while, for */
2298804Smckusick 			return (sp_paren);
2308804Smckusick 
2318804Smckusick 		    case 6:    /* do, else */
2328804Smckusick 			return (sp_nparen);
2338804Smckusick 
2348804Smckusick 		    default:   /* all others are treated like any other
2358804Smckusick 			          identifier */
2368804Smckusick 			return (ident);
2378804Smckusick 		}	       /* end of switch */
2388804Smckusick 	    }		       /* end of if (found_it) */
2398804Smckusick 
2408804Smckusick 	}
2418804Smckusick 
2428804Smckusick 	if (last_code == decl) /* if this is a declared variable, then
2438804Smckusick 			          following sign is unary */
2448804Smckusick 	    last_u_d = true;   /* will make "int a -1" work */
2458804Smckusick 	last_code = ident;
2468804Smckusick 	return (ident);	       /* the ident is not in the list */
2478804Smckusick     }			       /* end of procesing for alpanum character */
2488804Smckusick 
2498804Smckusick 
2508804Smckusick 
2518804Smckusick /*----------------------------------------------------------*\
2528804Smckusick |   Scan a non-alphanumeric token
2538804Smckusick \*----------------------------------------------------------*/
2548804Smckusick 
2558804Smckusick     *tok++ = *buf_ptr;	       /* if it is only a one-character token, it is
2568804Smckusick 			          moved here */
2578804Smckusick     *tok = '\0';
2588804Smckusick     if (++buf_ptr >= buf_end)
2598804Smckusick 	fill_buffer ();
2608804Smckusick 
2618804Smckusick     switch (*token) {
2628804Smckusick 	case '\n':
2638804Smckusick 	    unary_delim = last_u_d;
2648804Smckusick 	    last_nl = true;    /* remember that we just had a newline */
2658804Smckusick 	    code = (had_eof ? 0 : newline);
2668804Smckusick 	/* if data has been exausted, the newline is a dummy, and we should
2678804Smckusick 	   return code to stop */
2688804Smckusick 	    break;
2698804Smckusick 
2708804Smckusick 	case '\'': 	       /* start of quoted character */
2718804Smckusick 	    qchar = '\'';      /* remember final delimiter */
2728804Smckusick 	    goto copy_lit;     /* and go to common literal code */
2738804Smckusick 
2748804Smckusick 	case '"': 	       /* start of string */
2758804Smckusick 	    qchar = '"';
2768804Smckusick 
2778804Smckusick     copy_lit:
2788804Smckusick 	    do {	       /* copy the string */
2798804Smckusick 		while (1) {    /* move one character or [/<char>]<char> */
2808804Smckusick 		    if (*buf_ptr == '\n') {
2818804Smckusick 		    /* check for unterminated literal */
2828804Smckusick 			printf ("%d: Unterminated literal\n", line_no);
2838804Smckusick 			goto stop_lit;
2848804Smckusick 		    /* Don't copy any more */
2858804Smckusick 		    }
2868804Smckusick 
2878804Smckusick 		    *tok = *buf_ptr++;
2888804Smckusick 		    if (buf_ptr >= buf_end)
2898804Smckusick 			fill_buffer ();
2908804Smckusick 		    if (had_eof || ((tok - token) > (bufsize - 2))) {
2918804Smckusick 			printf ("Unterminated literal\n");
2928804Smckusick 			++tok;
2938804Smckusick 			goto stop_lit;
2948804Smckusick 		    /* get outof literal copying loop */
2958804Smckusick 		    }
2968804Smckusick 
2978804Smckusick 		    if (*tok == '\\') {
2988804Smckusick 		    /* if escape, copy extra char */
2998804Smckusick 			if (*buf_ptr == '\n')
3008804Smckusick 			       /* check for escaped newline */
3018804Smckusick 			    ++line_no;
3028804Smckusick 			*(++tok) = *buf_ptr++;
3038804Smckusick 			++tok; /* we must increment this again because we
3048804Smckusick 			          copied two chars */
3058804Smckusick 			if (buf_ptr >= buf_end)
3068804Smckusick 			    fill_buffer ();
3078804Smckusick 		    }
3088804Smckusick 		    else
3098804Smckusick 			break; /* we copied one character */
3108804Smckusick 		}	       /* end of while (1) */
3118804Smckusick 	    } while (*tok++ != qchar);
3128804Smckusick 
3138804Smckusick     stop_lit:
3148804Smckusick 	    code = ident;
3158804Smckusick 	    break;
3168804Smckusick 
3178804Smckusick 	case ('('):
3188804Smckusick 	case ('['):
3198804Smckusick 	    unary_delim = true;
3208804Smckusick 	    code = lparen;
3218804Smckusick 	    break;
3228804Smckusick 
3238804Smckusick 	case (')'):
3248804Smckusick 	case (']'):
3258804Smckusick 	    code = rparen;
3268804Smckusick 	    break;
3278804Smckusick 
3288804Smckusick 	case '#':
3298804Smckusick 	    unary_delim = last_u_d;
3308804Smckusick 	    code = preesc;
3318804Smckusick 	    break;
3328804Smckusick 
3338804Smckusick 	case '?':
3348804Smckusick 	    unary_delim = true;
3358804Smckusick 	    code = question;
3368804Smckusick 	    break;
3378804Smckusick 
3388804Smckusick 	case (':'):
3398804Smckusick 	    code = colon;
3408804Smckusick 	    unary_delim = true;
3418804Smckusick 	    break;
3428804Smckusick 
3438804Smckusick 	case (';'):
3448804Smckusick 	    unary_delim = true;
3458804Smckusick 	    code = semicolon;
3468804Smckusick 	    break;
3478804Smckusick 
3488804Smckusick 	case ('{'):
3498804Smckusick 	    unary_delim = true;
3508804Smckusick 	    code = lbrace;
3518804Smckusick 	    break;
3528804Smckusick 
3538804Smckusick 	case ('}'):
3548804Smckusick 	    unary_delim = true;
3558804Smckusick 	    code = rbrace;
3568804Smckusick 	    break;
3578804Smckusick 
3588804Smckusick 	case 014: 	       /* a form feed */
3598804Smckusick 	    unary_delim = last_u_d;
3608804Smckusick 	    last_nl = true;    /* remember this so we can set 'col_1' right */
3618804Smckusick 	    code = form_feed;
3628804Smckusick 	    break;
3638804Smckusick 
3648804Smckusick 	case (','):
3658804Smckusick 	    unary_delim = true;
3668804Smckusick 	    code = comma;
3678804Smckusick 	    break;
3688804Smckusick 
3698804Smckusick 	case '.':
3708804Smckusick 	    unary_delim = false;
3718804Smckusick 	    code = period;
3728804Smckusick 	    break;
3738804Smckusick 
3748804Smckusick 	case '-':
3758804Smckusick 	case '+': 	       /* check for -, +, --, ++ */
3768804Smckusick 	    code = (last_u_d ? unary_op : binary_op);
3778804Smckusick 	    unary_delim = true;
3788804Smckusick 
3798804Smckusick 	    if (*buf_ptr == token[0]) {
3808804Smckusick 	    /* check for doubled character */
3818804Smckusick 		*tok++ = *buf_ptr++;
3828804Smckusick 	    /* buffer overflow will be checked at end of loop */
3838804Smckusick 		if (last_code == ident || last_code == rparen) {
3848804Smckusick 		    code = (last_u_d ? unary_op : postop);
3858804Smckusick 		/* check for following ++ or -- */
3868804Smckusick 		    unary_delim = false;
3878804Smckusick 		}
3888804Smckusick 	    }
3898804Smckusick 	    else
3908804Smckusick 		if (*buf_ptr == '>' || *buf_ptr == '=')
3918804Smckusick 			       /* check for operator -> or += */
3928804Smckusick 		    *tok++ = *buf_ptr++;
3938804Smckusick 	/* buffer overflow will be checked at end of switch */
3948804Smckusick 
3958804Smckusick 	    break;
3968804Smckusick 
3978804Smckusick 	case '=':
3988804Smckusick 	    if (chartype[*buf_ptr] == opchar) {
3998804Smckusick 	    /* we have two char assignment */
4008804Smckusick 		*tok++ = *buf_ptr;
4018804Smckusick 	    /* move second character */
4028804Smckusick 		if (++buf_ptr >= buf_end)
4038804Smckusick 		    fill_buffer ();
4048804Smckusick 	    }
4058804Smckusick 
4068804Smckusick 	    code = binary_op;
4078804Smckusick 	    unary_delim = true;
4088804Smckusick 	    if (token[1] != '<' && token[1] != '>')
4098804Smckusick 			       /* check for possible 3 char operator */
4108804Smckusick 		break;
4118804Smckusick 	/* can drop thru!!! */
4128804Smckusick 
4138804Smckusick 	case '>':
4148804Smckusick 	case '<':
4158804Smckusick 	case '!': 	       /* ops like <, <<, <=, !=, etc */
4168804Smckusick 	    if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
4178804Smckusick 		*tok++ = *buf_ptr;
4188804Smckusick 		if (++buf_ptr >= buf_end)
4198804Smckusick 		    fill_buffer ();
4208804Smckusick 	    }
4218804Smckusick 
4228804Smckusick 	    if (*buf_ptr == '=')
4238804Smckusick 		 *tok++ = *buf_ptr++;
4248804Smckusick 	    code = (last_u_d ? unary_op : binary_op);
4258804Smckusick 	    unary_delim = true;
4268804Smckusick 	    break;
4278804Smckusick 
4288804Smckusick 	default:
4298804Smckusick 	    if (token[0] == '/' && *buf_ptr == '*') {
4308804Smckusick 	    /* it is start of comment */
4318804Smckusick 		*tok++ = '*';
4328804Smckusick 
4338804Smckusick 		if (++buf_ptr >= buf_end)
4348804Smckusick 		    fill_buffer ();
4358804Smckusick 
4368804Smckusick 		code = comment;
4378804Smckusick 		unary_delim = last_u_d;
4388804Smckusick 		break;
4398804Smckusick 	    }
4408804Smckusick 
4418804Smckusick 	    while (*(tok - 1) == *buf_ptr || *buf_ptr=='=') {
4428804Smckusick 	    /* handle ||, &&, etc, and also things as in int *****i */
4438804Smckusick 		*tok++ = *buf_ptr;
4448804Smckusick 		if (++buf_ptr >= buf_end)
4458804Smckusick 		    fill_buffer ();
4468804Smckusick 	    }
4478804Smckusick 
4488804Smckusick 
4498804Smckusick 	    code = (last_u_d ? unary_op : binary_op);
4508804Smckusick 	    unary_delim = true;
4518804Smckusick 
4528804Smckusick 
4538804Smckusick     }			       /* end of switch */
4548804Smckusick 
4558804Smckusick     if (code != newline) {
4568804Smckusick 	l_struct = false;
4578804Smckusick 	last_code = code;
4588804Smckusick     }
4598804Smckusick 
4608804Smckusick     if (buf_ptr >= buf_end)    /* check for input buffer empty */
4618804Smckusick 	fill_buffer ();
4628804Smckusick     last_u_d = unary_delim;
4638804Smckusick     *tok = '\0';	       /* null terminate the token */
4648804Smckusick     return (code);
4658804Smckusick };
466