xref: /csrg-svn/usr.bin/indent/lexi.c (revision 33767)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms are permitted
7  * provided that this notice is preserved and that due credit is given
8  * to the University of California at Berkeley and the University of
9  * Illinois at Urbana.  The name of either University may not be used
10  * to endorse or promote products derived from this software without
11  * specific prior written permission. This software is provided
12  * ``as is'' without express or implied warranty.
13  */
14 
15 #ifndef lint
16 static char sccsid[] = "@(#)lexi.c	5.6 (Berkeley) 03/22/88";
17 #endif /* not lint */
18 
19 /*
20  * NAME:
21  *	lexi
22  *
23  * FUNCTION:
24  *	This is the token scanner for indent
25  *
26  * ALGORITHM:
27  *	1) Strip off intervening blanks and/or tabs.
28  *	2) If it is an alphanumeric token, move it to the token buffer "token".
29  *	   Check if it is a special reserved word that indent will want to
30  *	   know about.
31  *	3) Non-alphanumeric tokens are handled with a big switch statement.  A
32  *	   flag is kept to remember if the last token was a "unary delimiter",
33  *	   which forces a following operator to be unary as opposed to binary.
34  *
35  * PARAMETERS:
36  *	None
37  *
38  * RETURNS:
39  *	An integer code indicating the type of token scanned.
40  *
41  * GLOBALS:
42  *	buf_ptr =
43  *	had_eof
44  *	ps.last_u_d =	Set to true iff this token is a "unary delimiter"
45  *
46  * CALLS:
47  *	fill_buffer
48  *	printf (lib)
49  *
50  * CALLED BY:
51  *	main
52  *
53  * NOTES:
54  *	Start of comment is passed back so that the comment can be scanned by
55  *	pr_comment.
56  *
57  *	Strings and character literals are returned just like identifiers.
58  *
59  * HISTORY:
60  *	initial coding 	November 1976	D A Willcox of CAC
61  *	1/7/77		D A Willcox of CAC	Fix to provide proper handling
62  *						of "int a -1;"
63  *
64  */
65 
66 /*
67  * Here we have the token scanner for indent.  It scans off one token and
68  * puts it in the global variable "token".  It returns a code, indicating
69  * the type of token scanned.
70  */
71 
72 #include "indent_globs.h"
73 #include "indent_codes.h"
74 #include "ctype.h"
75 
/*
 * Character classes stored in chartype[] below.  Any character whose entry
 * is neither of these (the table uses 0) is handled individually by the
 * switch in lexi().
 */
#define alphanum 1		/* letters, digits and '_' in chartype[] */
#define opchar 3		/* operator characters in chartype[] */
78 
/*
 * One entry of the keyword table: the spelling of a reserved word and the
 * class code that lexi() switches on when it recognizes that word.
 */
struct templ {
    char       *rwd;		/* keyword text; 0 marks the end of the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Table of reserved words known to the scanner.  The list is terminated by
 * an entry whose rwd is 0; the array is deliberately larger than the
 * built-in list so that addkey() can append further keywords at run time.
 *
 * Class codes, as interpreted by the switch in lexi():
 *	0	no special treatment (returned as an ordinary identifier)
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
116 
/*
 * Classification of the 128 seven-bit character codes, used to decide what
 * kind of token a character starts or continues: 1 marks characters that may
 * appear in an identifier or number (alphanum), 3 marks operator characters
 * (opchar), 0 is everything else.  Each row below covers sixteen codes; the
 * octal range and the characters it holds are given above the row.
 */
char        chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040-057: space ! " # $ % & ' ( ) * + , - . and slash */
    0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,
    /* 060-077: digits 0-9, then : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100-117: @, then A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120-137: P-Z, then [ backslash ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140-157: backquote, then a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160-177: p-z, then { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
138 
139 
140 
141 
142 int
143 lexi()
144 {
145     register char *tok;		/* local pointer to next char in token */
146     int         unary_delim;	/* this is set to 1 if the current token
147 				 *
148 				 * forces a following operator to be unary */
149     static int  last_code;	/* the last token type returned */
150     static int  l_struct;	/* set to 1 if the last token was 'struct' */
151     int         code;		/* internal code to be returned */
152     char        qchar;		/* the delimiter character for a string */
153 
154     tok = token;		/* point to start of place to save token */
155     unary_delim = false;
156     ps.col_1 = ps.last_nl;	/* tell world that this token started in
157 				 * column 1 iff the last thing scanned was
158 				 * nl */
159     ps.last_nl = false;
160 
161     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
162 	ps.col_1 = false;	/* leading blanks imply token is not in
163 				 * column 1 */
164 	if (++buf_ptr >= buf_end)
165 	    fill_buffer();
166     }
167 
168     /* Scan an alphanumeric token.  Note that we must also handle
169      * stuff like "1.0e+03" and "7e-6". */
170     if (chartype[*buf_ptr & 0177] == alphanum) {	/* we have a character
171 							 * or number */
172 	register char *j;	/* used for searching thru list of
173 				 * reserved words */
174 	register struct templ *p;
175 	register int c;
176 
177 	do {			/* copy it over */
178 	    *tok++ = *buf_ptr++;
179 	    if (buf_ptr >= buf_end)
180 		fill_buffer();
181 	} while (chartype[c = *buf_ptr & 0177] == alphanum ||
182 		isdigit(token[0]) && (c == '+' || c == '-') &&
183 		(tok[-1] == 'e' || tok[-1] == 'E'));
184 	*tok++ = '\0';
185 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
186 	    if (++buf_ptr >= buf_end)
187 		fill_buffer();
188 	}
189 	ps.its_a_keyword = false;
190 	ps.sizeof_keyword = false;
191 	if (l_struct) {		/* if last token was 'struct', then this
192 				 * token should be treated as a
193 				 * declaration */
194 	    l_struct = false;
195 	    last_code = ident;
196 	    ps.last_u_d = true;
197 	    return (decl);
198 	}
199 	ps.last_u_d = false;	/* Operator after indentifier is binary */
200 	last_code = ident;	/* Remember that this is the code we will
201 				 * return */
202 
203 	/*
204 	 * This loop will check if the token is a keyword.
205 	 */
206 	for (p = specials; (j = p->rwd) != 0; p++) {
207 	    tok = token;	/* point at scanned token */
208 	    if (*j++ != *tok++ || *j++ != *tok++)
209 		continue;	/* This test depends on the fact that
210 				 * identifiers are always at least 1
211 				 * character long (ie. the first two bytes
212 				 * of the identifier are always
213 				 * meaningful) */
214 	    if (tok[-1] == 0)
215 		break;		/* If its a one-character identifier */
216 	    while (*tok++ == *j)
217 		if (*j++ == 0)
218 		    goto found_keyword;	/* I wish that C had a multi-level
219 					 * break... */
220 	}
221 	if (p->rwd) {		/* we have a keyword */
222     found_keyword:
223 	    ps.its_a_keyword = true;
224 	    ps.last_u_d = true;
225 	    switch (p->rwcode) {
226 		case 1:	/* it is a switch */
227 		    return (swstmt);
228 		case 2:	/* a case or default */
229 		    return (casestmt);
230 
231 		case 3:	/* a "struct" */
232 		    if (ps.p_l_follow)
233 			break;	/* inside parens: cast */
234 		    l_struct = true;
235 
236 		    /*
237 		     * Next time around, we will want to know that we have
238 		     * had a 'struct'
239 		     */
240 		case 4:	/* one of the declaration keywords */
241 		    if (ps.p_l_follow) {
242 			ps.cast_mask |= 1 << ps.p_l_follow;
243 			break;	/* inside parens: cast */
244 		    }
245 		    last_code = decl;
246 		    return (decl);
247 
248 		case 5:	/* if, while, for */
249 		    return (sp_paren);
250 
251 		case 6:	/* do, else */
252 		    return (sp_nparen);
253 
254 		case 7:
255 		    ps.sizeof_keyword = true;
256 		default:	/* all others are treated like any other
257 				 * identifier */
258 		    return (ident);
259 	    }			/* end of switch */
260 	}			/* end of if (found_it) */
261 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0
262 	    && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) {
263 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
264 	    ps.in_parameter_declaration = 1;
265 	}
266 
267 	/*
268 	 * The following hack attempts to guess whether or not the current
269 	 * token is in fact a declaration keyword -- one that has been
270 	 * typedefd
271 	 */
272 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr))
273 	    && !ps.p_l_follow
274 	    && (ps.last_token == rparen || ps.last_token == semicolon ||
275 		ps.last_token == decl ||
276 		ps.last_token == lbrace || ps.last_token == rbrace)) {
277 	    ps.its_a_keyword = true;
278 	    ps.last_u_d = true;
279 	    last_code = decl;
280 	    return decl;
281 	}
282 	if (last_code == decl)	/* if this is a declared variable, then
283 				 * following sign is unary */
284 	    ps.last_u_d = true;	/* will make "int a -1" work */
285 	last_code = ident;
286 	return (ident);		/* the ident is not in the list */
287     }				/* end of procesing for alpanum character */
288     /* Scan a non-alphanumeric token */
289 
290     *tok++ = *buf_ptr;		/* if it is only a one-character token, it
291 				 * is moved here */
292     *tok = '\0';
293     if (++buf_ptr >= buf_end)
294 	fill_buffer();
295 
296     switch (*token) {
297 	case '\n':
298 	    unary_delim = ps.last_u_d;
299 	    ps.last_nl = true;	/* remember that we just had a newline */
300 	    code = (had_eof ? 0 : newline);
301 
302 	    /*
303 	     * if data has been exausted, the newline is a dummy, and we
304 	     * should return code to stop
305 	     */
306 	    break;
307 
308 	case '\'':		/* start of quoted character */
309 	case '"':		/* start of string */
310 	    qchar = *token;
311 	    if (troff) {
312 		tok[-1] = '`';
313 		if (qchar == '"')
314 		    *tok++ = '`';
315 		*tok++ = BACKSLASH;
316 		*tok++ = 'f';
317 		*tok++ = 'L';
318 	    }
319 	    do {		/* copy the string */
320 		while (1) {	/* move one character or [/<char>]<char> */
321 		    if (*buf_ptr == '\n') {
322 			printf("%d: Unterminated literal\n", line_no);
323 			goto stop_lit;
324 		    }
325 		    *tok = *buf_ptr++;
326 		    if (buf_ptr >= buf_end)
327 			fill_buffer();
328 		    if (had_eof || ((tok - token) > (bufsize - 2))) {
329 			printf("Unterminated literal\n");
330 			++tok;
331 			goto stop_lit;
332 			/* get outof literal copying loop */
333 		    }
334 		    if (*tok == BACKSLASH) {	/* if escape, copy extra
335 						 * char */
336 			if (*buf_ptr == '\n')	/* check for escaped
337 						 * newline */
338 			    ++line_no;
339 			if (troff) {
340 			    *++tok = BACKSLASH;
341 			    if (*buf_ptr == BACKSLASH)
342 				*++tok = BACKSLASH;
343 			}
344 			*++tok = *buf_ptr++;
345 			++tok;	/* we must increment this again because we
346 				 * copied two chars */
347 			if (buf_ptr >= buf_end)
348 			    fill_buffer();
349 		    }
350 		    else
351 			break;	/* we copied one character */
352 		}		/* end of while (1) */
353 	    } while (*tok++ != qchar);
354 	    if (troff) {
355 		tok[-1] = BACKSLASH;
356 		*tok++ = 'f';
357 		*tok++ = 'R';
358 		*tok++ = '\'';
359 		if (qchar == '"')
360 		    *tok++ = '\'';
361 	    }
362     stop_lit:
363 	    code = ident;
364 	    break;
365 
366 	case ('('):
367 	case ('['):
368 	    unary_delim = true;
369 	    code = lparen;
370 	    break;
371 
372 	case (')'):
373 	case (']'):
374 	    code = rparen;
375 	    break;
376 
377 	case '#':
378 	    unary_delim = ps.last_u_d;
379 	    code = preesc;
380 	    break;
381 
382 	case '?':
383 	    unary_delim = true;
384 	    code = question;
385 	    break;
386 
387 	case (':'):
388 	    code = colon;
389 	    unary_delim = true;
390 	    break;
391 
392 	case (';'):
393 	    unary_delim = true;
394 	    code = semicolon;
395 	    break;
396 
397 	case ('{'):
398 	    unary_delim = true;
399 
400 	    /*
401 	     * if (ps.in_or_st) ps.block_init = 1;
402 	     */
403 	    code = ps.block_init ? lparen : lbrace;
404 	    break;
405 
406 	case ('}'):
407 	    unary_delim = true;
408 	    code = ps.block_init ? rparen : rbrace;
409 	    break;
410 
411 	case 014:		/* a form feed */
412 	    unary_delim = ps.last_u_d;
413 	    ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
414 				 * right */
415 	    code = form_feed;
416 	    break;
417 
418 	case (','):
419 	    unary_delim = true;
420 	    code = comma;
421 	    break;
422 
423 	case '.':
424 	    unary_delim = false;
425 	    code = period;
426 	    break;
427 
428 	case '-':
429 	case '+':		/* check for -, +, --, ++ */
430 	    code = (ps.last_u_d ? unary_op : binary_op);
431 	    unary_delim = true;
432 
433 	    if (*buf_ptr == token[0]) {
434 		/* check for doubled character */
435 		*tok++ = *buf_ptr++;
436 		/* buffer overflow will be checked at end of loop */
437 		if (last_code == ident || last_code == rparen) {
438 		    code = (ps.last_u_d ? unary_op : postop);
439 		    /* check for following ++ or -- */
440 		    unary_delim = false;
441 		}
442 	    }
443 	    else if (*buf_ptr == '=')
444 		/* check for operator += */
445 		*tok++ = *buf_ptr++;
446 	    else if (token[0] == '-' && *buf_ptr == '>') {
447 		/* check for operator -> */
448 		*tok++ = *buf_ptr++;
449 		if (!pointer_as_binop) {
450 		    code = unary_op;
451 		    unary_delim = false;
452 		    ps.want_blank = false;
453 		}
454 	    }
455 	    /* buffer overflow will be checked at end of switch */
456 
457 	    break;
458 
459 	case '=':
460 	    if (ps.in_or_st)
461 		ps.block_init = 1;
462 	    if (chartype[*buf_ptr] == opchar) {	/* we have two char
463 						 * assignment */
464 		tok[-1] = *buf_ptr++;
465 		if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr)
466 		    *tok++ = *buf_ptr++;
467 		*tok++ = '=';	/* Flip =+ to += */
468 		*tok = 0;
469 	    }
470 	    code = binary_op;
471 	    unary_delim = true;
472 	    break;
473 	    /* can drop thru!!! */
474 
475 	case '>':
476 	case '<':
477 	case '!':		/* ops like <, <<, <=, !=, etc */
478 	    if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
479 		*tok++ = *buf_ptr;
480 		if (++buf_ptr >= buf_end)
481 		    fill_buffer();
482 	    }
483 	    if (*buf_ptr == '=')
484 		*tok++ = *buf_ptr++;
485 	    code = (ps.last_u_d ? unary_op : binary_op);
486 	    unary_delim = true;
487 	    break;
488 
489 	default:
490 	    if (token[0] == '/' && *buf_ptr == '*') {
491 		/* it is start of comment */
492 		*tok++ = '*';
493 
494 		if (++buf_ptr >= buf_end)
495 		    fill_buffer();
496 
497 		code = comment;
498 		unary_delim = ps.last_u_d;
499 		break;
500 	    }
501 	    while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') {
502 		/* handle ||, &&, etc, and also things as in int *****i */
503 		*tok++ = *buf_ptr;
504 		if (++buf_ptr >= buf_end)
505 		    fill_buffer();
506 	    }
507 	    code = (ps.last_u_d ? unary_op : binary_op);
508 	    unary_delim = true;
509 
510 
511     }				/* end of switch */
512     if (code != newline) {
513 	l_struct = false;
514 	last_code = code;
515     }
516     if (buf_ptr >= buf_end)	/* check for input buffer empty */
517 	fill_buffer();
518     ps.last_u_d = unary_delim;
519     *tok = '\0';		/* null terminate the token */
520     return (code);
521 };
522 
523 /* Add the given keyword to the keyword table, using val as the keyword type
524    */
525 addkey (key, val)
526 char       *key;
527 {
528     register struct templ *p = specials;
529     while (p->rwd)
530 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
531 	    return;
532 	else
533 	    p++;
534     if (p >= specials + sizeof specials / sizeof specials[0])
535 	return;			/* For now, table overflows are silently
536 				   ignored */
537     p->rwd = key;
538     p->rwcode = val;
539     p[1].rwd = 0;
540     p[1].rwcode = 0;
541     return;
542 }
543