xref: /csrg-svn/usr.bin/indent/lexi.c (revision 8804)
1*8804Smckusick static char sccsid[] = "@(#)lexi.c	4.1	(Berkeley)	10/21/82";
2*8804Smckusick 
3*8804Smckusick /*
4*8804Smckusick 
5*8804Smckusick 			  Copyright (C) 1976
6*8804Smckusick 				by the
7*8804Smckusick 			  Board of Trustees
8*8804Smckusick 				of the
9*8804Smckusick 			University of Illinois
10*8804Smckusick 
11*8804Smckusick 			 All rights reserved
12*8804Smckusick 
13*8804Smckusick 
14*8804Smckusick NAME:
15*8804Smckusick 	lexi
16*8804Smckusick 
17*8804Smckusick FUNCTION:
18*8804Smckusick 	This is the token scanner for indent
19*8804Smckusick 
20*8804Smckusick ALGORITHM:
21*8804Smckusick 	1) Strip off intervening blanks and/or tabs.
22*8804Smckusick 	2) If it is an alphanumeric token, move it to the token buffer "token".
23*8804Smckusick 	   Check if it is a special reserved word that indent will want to
24*8804Smckusick 	   know about.
25*8804Smckusick 	3) Non-alphanumeric tokens are handled with a big switch statement.  A
26*8804Smckusick 	   flag is kept to remember if the last token was a "unary delimiter",
27*8804Smckusick 	   which forces a following operator to be unary as opposed to binary.
28*8804Smckusick 
29*8804Smckusick PARAMETERS:
30*8804Smckusick 	None
31*8804Smckusick 
32*8804Smckusick RETURNS:
33*8804Smckusick 	An integer code indicating the type of token scanned.
34*8804Smckusick 
35*8804Smckusick GLOBALS:
36*8804Smckusick 	buf_ptr =
37*8804Smckusick 	had_eof
38*8804Smckusick 	last_u_d =	Set to true iff this token is a "unary delimiter"
39*8804Smckusick 
40*8804Smckusick CALLS:
41*8804Smckusick 	fill_buffer
42*8804Smckusick 	printf (lib)
43*8804Smckusick 
44*8804Smckusick CALLED BY:
45*8804Smckusick 	main
46*8804Smckusick 
47*8804Smckusick NOTES:
48*8804Smckusick 	Start of comment is passed back so that the comment can be scanned by
49*8804Smckusick 	pr_comment.
50*8804Smckusick 
51*8804Smckusick 	Strings and character literals are returned just like identifiers.
52*8804Smckusick 
53*8804Smckusick HISTORY:
54*8804Smckusick 	initial coding 	November 1976	D A Willcox of CAC
55*8804Smckusick 	1/7/77		D A Willcox of CAC	Fix to provide proper handling
56*8804Smckusick 						of "int a -1;"
57*8804Smckusick 
58*8804Smckusick */
59*8804Smckusick 
60*8804Smckusick /* Here we have the token scanner for indent.  It scans off one token and
61*8804Smckusick    puts it in the global variable "token".  It returns a code, indicating the
62*8804Smckusick    type of token scanned. */
63*8804Smckusick 
64*8804Smckusick #include "indent_globs.h";
65*8804Smckusick #include "indent_codes.h";
66*8804Smckusick 
67*8804Smckusick 
68*8804Smckusick 
69*8804Smckusick #define alphanum 1
70*8804Smckusick #define opchar 3
71*8804Smckusick 
/*
 * One entry of the reserved-word table searched by lexi(): the spelling of
 * the word and a small class code that lexi() switches on to choose the
 * token code it returns.
 */
struct templ {
    char   *rwd;		/* text of the reserved word; 0 ends the table */
    int     rwcode;		/* class of the word, see the list below */
};

/*
 * Reserved words that lexi() recognizes.  The table is searched linearly and
 * is terminated by an entry whose rwd is 0.  Meaning of rwcode, as used by
 * the switch in lexi():
 *
 *	0	no special treatment, returned like any other identifier
 *	1	"switch"
 *	2	"case" or "default"
 *	3	"struct"
 *	4	a declaration keyword
 *	5	"if", "while", "for"
 *	6	"else", "do"
 */
struct templ    specials[] =
{
    {"switch", 1},
    {"case", 2},
    {"struct", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so the real keyword
				   was never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 0},
    {0, 0}
};
103*8804Smckusick 
/*
 * Character classification table, indexed by a 7-bit character code.
 * Each entry is 1 (the value of "alphanum": letters, digits, underscore),
 * 3 (the value of "opchar": characters that can appear in an operator),
 * or 0 (everything else).  Sixteen codes per row.
 */
char    chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040-057: space and punctuation up to the slash */
    0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,
    /* 060-077: digits, then colon through question mark */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100-117: at-sign, upper case A through O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120-137: upper case P through Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140-157: grave accent, lower case a through o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160-177: lower case p through z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
124*8804Smckusick 
125*8804Smckusick int     last_nl = true;
126*8804Smckusick  /* this is true if the last thing scanned was a newline */
127*8804Smckusick 
128*8804Smckusick 
129*8804Smckusick 
130*8804Smckusick int     lexi () {
131*8804Smckusick     register char  *tok;
132*8804Smckusick  /* local pointer to next char in token */
133*8804Smckusick     register int    i;
134*8804Smckusick  /* local loop counter */
135*8804Smckusick     register char  *j;
136*8804Smckusick  /* used for searching thru list of reserved words */
137*8804Smckusick     int     unary_delim;
138*8804Smckusick  /* this is set to 1 if the current token forces a following operator to be
139*8804Smckusick     unary */
140*8804Smckusick     static int  last_code;
141*8804Smckusick  /* the last token type returned */
142*8804Smckusick     static int  l_struct;
143*8804Smckusick  /* set to 1 if the last token was 'struct' */
144*8804Smckusick     int     found_it;
145*8804Smckusick     int     code;  /* internal code to be returned */
146*8804Smckusick     char    qchar; /* the delimiter character for a string */
147*8804Smckusick 
148*8804Smckusick     tok = token;	       /* point to start of place to save token */
149*8804Smckusick     unary_delim = false;
150*8804Smckusick     col_1 = last_nl;	       /* tell world that this token started in column
151*8804Smckusick 			          1 iff the last thing scanned was nl */
152*8804Smckusick     last_nl = false;
153*8804Smckusick 
154*8804Smckusick     while (*buf_ptr == ' ' || *buf_ptr == '\t') {
155*8804Smckusick     /* get rid of blanks */
156*8804Smckusick 	col_1 = false;	       /* leading blanks imply token is not in column 1
157*8804Smckusick 			          */
158*8804Smckusick 	if (++buf_ptr >= buf_end)
159*8804Smckusick 	    fill_buffer ();
160*8804Smckusick     }
161*8804Smckusick 
162*8804Smckusick /*----------------------------------------------------------*\
163*8804Smckusick |    Scan an alphanumeric token
164*8804Smckusick \*----------------------------------------------------------*/
165*8804Smckusick 
166*8804Smckusick     if (chartype[*buf_ptr & 0177] == alphanum) {
167*8804Smckusick     /* we have a character or number */
168*8804Smckusick 	while (chartype[*buf_ptr & 0177] == alphanum) {
169*8804Smckusick 	/* copy it over */
170*8804Smckusick 	    *tok++ = *buf_ptr++;
171*8804Smckusick 	    if (buf_ptr >= buf_end)
172*8804Smckusick 		fill_buffer ();
173*8804Smckusick 	}
174*8804Smckusick 
175*8804Smckusick 	*tok++ = '\0';
176*8804Smckusick 
177*8804Smckusick 	if (l_struct) {	       /* if last token was 'struct', then this token
178*8804Smckusick 			          should be treated as a declaration */
179*8804Smckusick 	    l_struct = false;
180*8804Smckusick 	    last_code = ident;
181*8804Smckusick 	    last_u_d = true;
182*8804Smckusick 	    return (decl);
183*8804Smckusick 	}
184*8804Smckusick 
185*8804Smckusick 	last_u_d = false;      /* operator after indentifier is binary */
186*8804Smckusick 
187*8804Smckusick 	for (i = 0; specials[i].rwd != 0; ++i) {
188*8804Smckusick 	/* this loop will check if the token is a keyword.  if so, a following
189*8804Smckusick 	   operator is unary */
190*8804Smckusick 	    last_code = ident; /* remember that this is the code we will return
191*8804Smckusick 			          */
192*8804Smckusick 	    j = specials[i].rwd;
193*8804Smckusick 	/* point at ith reserved word */
194*8804Smckusick 	    tok = token;       /* point at scanned toekn */
195*8804Smckusick 	    found_it = true;   /* set to false if not found */
196*8804Smckusick 	    do {
197*8804Smckusick 		if (*tok++ != *j) {
198*8804Smckusick 		    found_it = false;
199*8804Smckusick 		    break;
200*8804Smckusick 		}
201*8804Smckusick 	    } while (*j++);
202*8804Smckusick 
203*8804Smckusick 	    if (found_it) {    /* we have a keyword */
204*8804Smckusick 		last_u_d = true;
205*8804Smckusick 		switch (specials[i].rwcode) {
206*8804Smckusick 		    case 1:    /* it is a switch */
207*8804Smckusick 			return (swstmt);
208*8804Smckusick 		    case 2:    /* a case or default */
209*8804Smckusick 			return (casestmt);
210*8804Smckusick 
211*8804Smckusick 		    case 3:    /* a "struct" */
212*8804Smckusick 			l_struct = true;
213*8804Smckusick 		    /* Next time around, we will want to know that we have had
214*8804Smckusick 		       a 'struct' */
215*8804Smckusick 		    case 4:    /* one of the declaration keywords */
216*8804Smckusick 			if(p_l_follow) break;	/* inside parens: cast */
217*8804Smckusick 			last_code = decl;
218*8804Smckusick 			return (decl);
219*8804Smckusick 
220*8804Smckusick 		    case 5:    /* if, while, for */
221*8804Smckusick 			return (sp_paren);
222*8804Smckusick 
223*8804Smckusick 		    case 6:    /* do, else */
224*8804Smckusick 			return (sp_nparen);
225*8804Smckusick 
226*8804Smckusick 		    default:   /* all others are treated like any other
227*8804Smckusick 			          identifier */
228*8804Smckusick 			return (ident);
229*8804Smckusick 		}	       /* end of switch */
230*8804Smckusick 	    }		       /* end of if (found_it) */
231*8804Smckusick 
232*8804Smckusick 	}
233*8804Smckusick 
234*8804Smckusick 	if (last_code == decl) /* if this is a declared variable, then
235*8804Smckusick 			          following sign is unary */
236*8804Smckusick 	    last_u_d = true;   /* will make "int a -1" work */
237*8804Smckusick 	last_code = ident;
238*8804Smckusick 	return (ident);	       /* the ident is not in the list */
239*8804Smckusick     }			       /* end of procesing for alpanum character */
240*8804Smckusick 
241*8804Smckusick 
242*8804Smckusick 
243*8804Smckusick /*----------------------------------------------------------*\
244*8804Smckusick |   Scan a non-alphanumeric token
245*8804Smckusick \*----------------------------------------------------------*/
246*8804Smckusick 
247*8804Smckusick     *tok++ = *buf_ptr;	       /* if it is only a one-character token, it is
248*8804Smckusick 			          moved here */
249*8804Smckusick     *tok = '\0';
250*8804Smckusick     if (++buf_ptr >= buf_end)
251*8804Smckusick 	fill_buffer ();
252*8804Smckusick 
253*8804Smckusick     switch (*token) {
254*8804Smckusick 	case '\n':
255*8804Smckusick 	    unary_delim = last_u_d;
256*8804Smckusick 	    last_nl = true;    /* remember that we just had a newline */
257*8804Smckusick 	    code = (had_eof ? 0 : newline);
258*8804Smckusick 	/* if data has been exausted, the newline is a dummy, and we should
259*8804Smckusick 	   return code to stop */
260*8804Smckusick 	    break;
261*8804Smckusick 
262*8804Smckusick 	case '\'': 	       /* start of quoted character */
263*8804Smckusick 	    qchar = '\'';      /* remember final delimiter */
264*8804Smckusick 	    goto copy_lit;     /* and go to common literal code */
265*8804Smckusick 
266*8804Smckusick 	case '"': 	       /* start of string */
267*8804Smckusick 	    qchar = '"';
268*8804Smckusick 
269*8804Smckusick     copy_lit:
270*8804Smckusick 	    do {	       /* copy the string */
271*8804Smckusick 		while (1) {    /* move one character or [/<char>]<char> */
272*8804Smckusick 		    if (*buf_ptr == '\n') {
273*8804Smckusick 		    /* check for unterminated literal */
274*8804Smckusick 			printf ("%d: Unterminated literal\n", line_no);
275*8804Smckusick 			goto stop_lit;
276*8804Smckusick 		    /* Don't copy any more */
277*8804Smckusick 		    }
278*8804Smckusick 
279*8804Smckusick 		    *tok = *buf_ptr++;
280*8804Smckusick 		    if (buf_ptr >= buf_end)
281*8804Smckusick 			fill_buffer ();
282*8804Smckusick 		    if (had_eof || ((tok - token) > (bufsize - 2))) {
283*8804Smckusick 			printf ("Unterminated literal\n");
284*8804Smckusick 			++tok;
285*8804Smckusick 			goto stop_lit;
286*8804Smckusick 		    /* get outof literal copying loop */
287*8804Smckusick 		    }
288*8804Smckusick 
289*8804Smckusick 		    if (*tok == '\\') {
290*8804Smckusick 		    /* if escape, copy extra char */
291*8804Smckusick 			if (*buf_ptr == '\n')
292*8804Smckusick 			       /* check for escaped newline */
293*8804Smckusick 			    ++line_no;
294*8804Smckusick 			*(++tok) = *buf_ptr++;
295*8804Smckusick 			++tok; /* we must increment this again because we
296*8804Smckusick 			          copied two chars */
297*8804Smckusick 			if (buf_ptr >= buf_end)
298*8804Smckusick 			    fill_buffer ();
299*8804Smckusick 		    }
300*8804Smckusick 		    else
301*8804Smckusick 			break; /* we copied one character */
302*8804Smckusick 		}	       /* end of while (1) */
303*8804Smckusick 	    } while (*tok++ != qchar);
304*8804Smckusick 
305*8804Smckusick     stop_lit:
306*8804Smckusick 	    code = ident;
307*8804Smckusick 	    break;
308*8804Smckusick 
309*8804Smckusick 	case ('('):
310*8804Smckusick 	case ('['):
311*8804Smckusick 	    unary_delim = true;
312*8804Smckusick 	    code = lparen;
313*8804Smckusick 	    break;
314*8804Smckusick 
315*8804Smckusick 	case (')'):
316*8804Smckusick 	case (']'):
317*8804Smckusick 	    code = rparen;
318*8804Smckusick 	    break;
319*8804Smckusick 
320*8804Smckusick 	case '#':
321*8804Smckusick 	    unary_delim = last_u_d;
322*8804Smckusick 	    code = preesc;
323*8804Smckusick 	    break;
324*8804Smckusick 
325*8804Smckusick 	case '?':
326*8804Smckusick 	    unary_delim = true;
327*8804Smckusick 	    code = question;
328*8804Smckusick 	    break;
329*8804Smckusick 
330*8804Smckusick 	case (':'):
331*8804Smckusick 	    code = colon;
332*8804Smckusick 	    unary_delim = true;
333*8804Smckusick 	    break;
334*8804Smckusick 
335*8804Smckusick 	case (';'):
336*8804Smckusick 	    unary_delim = true;
337*8804Smckusick 	    code = semicolon;
338*8804Smckusick 	    break;
339*8804Smckusick 
340*8804Smckusick 	case ('{'):
341*8804Smckusick 	    unary_delim = true;
342*8804Smckusick 	    code = lbrace;
343*8804Smckusick 	    break;
344*8804Smckusick 
345*8804Smckusick 	case ('}'):
346*8804Smckusick 	    unary_delim = true;
347*8804Smckusick 	    code = rbrace;
348*8804Smckusick 	    break;
349*8804Smckusick 
350*8804Smckusick 	case 014: 	       /* a form feed */
351*8804Smckusick 	    unary_delim = last_u_d;
352*8804Smckusick 	    last_nl = true;    /* remember this so we can set 'col_1' right */
353*8804Smckusick 	    code = form_feed;
354*8804Smckusick 	    break;
355*8804Smckusick 
356*8804Smckusick 	case (','):
357*8804Smckusick 	    unary_delim = true;
358*8804Smckusick 	    code = comma;
359*8804Smckusick 	    break;
360*8804Smckusick 
361*8804Smckusick 	case '.':
362*8804Smckusick 	    unary_delim = false;
363*8804Smckusick 	    code = period;
364*8804Smckusick 	    break;
365*8804Smckusick 
366*8804Smckusick 	case '-':
367*8804Smckusick 	case '+': 	       /* check for -, +, --, ++ */
368*8804Smckusick 	    code = (last_u_d ? unary_op : binary_op);
369*8804Smckusick 	    unary_delim = true;
370*8804Smckusick 
371*8804Smckusick 	    if (*buf_ptr == token[0]) {
372*8804Smckusick 	    /* check for doubled character */
373*8804Smckusick 		*tok++ = *buf_ptr++;
374*8804Smckusick 	    /* buffer overflow will be checked at end of loop */
375*8804Smckusick 		if (last_code == ident || last_code == rparen) {
376*8804Smckusick 		    code = (last_u_d ? unary_op : postop);
377*8804Smckusick 		/* check for following ++ or -- */
378*8804Smckusick 		    unary_delim = false;
379*8804Smckusick 		}
380*8804Smckusick 	    }
381*8804Smckusick 	    else
382*8804Smckusick 		if (*buf_ptr == '>' || *buf_ptr == '=')
383*8804Smckusick 			       /* check for operator -> or += */
384*8804Smckusick 		    *tok++ = *buf_ptr++;
385*8804Smckusick 	/* buffer overflow will be checked at end of switch */
386*8804Smckusick 
387*8804Smckusick 	    break;
388*8804Smckusick 
389*8804Smckusick 	case '=':
390*8804Smckusick 	    if (chartype[*buf_ptr] == opchar) {
391*8804Smckusick 	    /* we have two char assignment */
392*8804Smckusick 		*tok++ = *buf_ptr;
393*8804Smckusick 	    /* move second character */
394*8804Smckusick 		if (++buf_ptr >= buf_end)
395*8804Smckusick 		    fill_buffer ();
396*8804Smckusick 	    }
397*8804Smckusick 
398*8804Smckusick 	    code = binary_op;
399*8804Smckusick 	    unary_delim = true;
400*8804Smckusick 	    if (token[1] != '<' && token[1] != '>')
401*8804Smckusick 			       /* check for possible 3 char operator */
402*8804Smckusick 		break;
403*8804Smckusick 	/* can drop thru!!! */
404*8804Smckusick 
405*8804Smckusick 	case '>':
406*8804Smckusick 	case '<':
407*8804Smckusick 	case '!': 	       /* ops like <, <<, <=, !=, etc */
408*8804Smckusick 	    if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
409*8804Smckusick 		*tok++ = *buf_ptr;
410*8804Smckusick 		if (++buf_ptr >= buf_end)
411*8804Smckusick 		    fill_buffer ();
412*8804Smckusick 	    }
413*8804Smckusick 
414*8804Smckusick 	    if (*buf_ptr == '=')
415*8804Smckusick 		 *tok++ = *buf_ptr++;
416*8804Smckusick 	    code = (last_u_d ? unary_op : binary_op);
417*8804Smckusick 	    unary_delim = true;
418*8804Smckusick 	    break;
419*8804Smckusick 
420*8804Smckusick 	default:
421*8804Smckusick 	    if (token[0] == '/' && *buf_ptr == '*') {
422*8804Smckusick 	    /* it is start of comment */
423*8804Smckusick 		*tok++ = '*';
424*8804Smckusick 
425*8804Smckusick 		if (++buf_ptr >= buf_end)
426*8804Smckusick 		    fill_buffer ();
427*8804Smckusick 
428*8804Smckusick 		code = comment;
429*8804Smckusick 		unary_delim = last_u_d;
430*8804Smckusick 		break;
431*8804Smckusick 	    }
432*8804Smckusick 
433*8804Smckusick 	    while (*(tok - 1) == *buf_ptr || *buf_ptr=='=') {
434*8804Smckusick 	    /* handle ||, &&, etc, and also things as in int *****i */
435*8804Smckusick 		*tok++ = *buf_ptr;
436*8804Smckusick 		if (++buf_ptr >= buf_end)
437*8804Smckusick 		    fill_buffer ();
438*8804Smckusick 	    }
439*8804Smckusick 
440*8804Smckusick 
441*8804Smckusick 	    code = (last_u_d ? unary_op : binary_op);
442*8804Smckusick 	    unary_delim = true;
443*8804Smckusick 
444*8804Smckusick 
445*8804Smckusick     }			       /* end of switch */
446*8804Smckusick 
447*8804Smckusick     if (code != newline) {
448*8804Smckusick 	l_struct = false;
449*8804Smckusick 	last_code = code;
450*8804Smckusick     }
451*8804Smckusick 
452*8804Smckusick     if (buf_ptr >= buf_end)    /* check for input buffer empty */
453*8804Smckusick 	fill_buffer ();
454*8804Smckusick     last_u_d = unary_delim;
455*8804Smckusick     *tok = '\0';	       /* null terminate the token */
456*8804Smckusick     return (code);
457*8804Smckusick };
458