xref: /csrg-svn/usr.bin/indent/lexi.c (revision 34885)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms are permitted
7  * provided that the above copyright notice and this paragraph are
8  * duplicated in all such forms and that any documentation,
9  * advertising materials, and other materials related to such
10  * distribution and use acknowledge that the software was developed
11  * by the University of California, Berkeley and the University
12  * of Illinois, Urbana.  The name of either
13  * University may not be used to endorse or promote products derived
14  * from this software without specific prior written permission.
15  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
17  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
18  */
19 
20 #ifndef lint
21 static char sccsid[] = "@(#)lexi.c	5.8 (Berkeley) 06/29/88";
22 #endif /* not lint */
23 
24 /*
25  * NAME:
26  *	lexi
27  *
28  * FUNCTION:
29  *	This is the token scanner for indent
30  *
31  * ALGORITHM:
32  *	1) Strip off intervening blanks and/or tabs.
33  *	2) If it is an alphanumeric token, move it to the token buffer "token".
34  *	   Check if it is a special reserved word that indent will want to
35  *	   know about.
36  *	3) Non-alphanumeric tokens are handled with a big switch statement.  A
37  *	   flag is kept to remember if the last token was a "unary delimiter",
38  *	   which forces a following operator to be unary as opposed to binary.
39  *
40  * PARAMETERS:
41  *	None
42  *
43  * RETURNS:
44  *	An integer code indicating the type of token scanned.
45  *
46  * GLOBALS:
 *	buf_ptr =	Advanced past the token that was scanned
48  *	had_eof
49  *	ps.last_u_d =	Set to true iff this token is a "unary delimiter"
50  *
51  * CALLS:
52  *	fill_buffer
53  *	printf (lib)
54  *
55  * CALLED BY:
56  *	main
57  *
58  * NOTES:
59  *	Start of comment is passed back so that the comment can be scanned by
60  *	pr_comment.
61  *
62  *	Strings and character literals are returned just like identifiers.
63  *
64  * HISTORY:
65  *	initial coding 	November 1976	D A Willcox of CAC
66  *	1/7/77		D A Willcox of CAC	Fix to provide proper handling
67  *						of "int a -1;"
68  *
69  */
70 
71 /*
72  * Here we have the token scanner for indent.  It scans off one token and
73  * puts it in the global variable "token".  It returns a code, indicating
74  * the type of token scanned.
75  */
76 
77 #include "indent_globs.h"
78 #include "indent_codes.h"
79 #include "ctype.h"
80 
81 #define alphanum 1
82 #define opchar 3
83 
/*
 * One entry of the reserved-word table: the keyword text and the class
 * code that lexi() switches on when the keyword is recognised.
 */
struct templ {
    char       *rwd;		/* keyword text; 0 terminates the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved words known to the scanner.  The table is terminated by an
 * entry whose rwd is 0; the array is dimensioned at 100 so that addkey()
 * has room to append further keywords at run time.
 *
 * rwcode classes, as interpreted by the switch in lexi():
 *	0	no special handling (token is returned as ident)
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
121 
/*
 * Character classification table, indexed by a 7-bit character code.
 * Each entry is 1 for characters that may appear in an alphanumeric
 * token, 3 for operator characters, and 0 for everything else.
 */
char        chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* space ! " # $ % & '   ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,
    /* 0 1 2 3 4 5 6 7   8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* @ A B C D E F G   H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* P Q R S T U V W   X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* ` a b c d e f g   h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* p q r s t u v w   x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
143 
144 
145 
146 
147 int
148 lexi()
149 {
150     register char *tok;		/* local pointer to next char in token */
151     int         unary_delim;	/* this is set to 1 if the current token
152 				 *
153 				 * forces a following operator to be unary */
154     static int  last_code;	/* the last token type returned */
155     static int  l_struct;	/* set to 1 if the last token was 'struct' */
156     int         code;		/* internal code to be returned */
157     char        qchar;		/* the delimiter character for a string */
158 
159     tok = token;		/* point to start of place to save token */
160     unary_delim = false;
161     ps.col_1 = ps.last_nl;	/* tell world that this token started in
162 				 * column 1 iff the last thing scanned was
163 				 * nl */
164     ps.last_nl = false;
165 
166     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
167 	ps.col_1 = false;	/* leading blanks imply token is not in
168 				 * column 1 */
169 	if (++buf_ptr >= buf_end)
170 	    fill_buffer();
171     }
172 
173     /* Scan an alphanumeric token.  Note that we must also handle
174      * stuff like "1.0e+03" and "7e-6". */
175     if (chartype[*buf_ptr & 0177] == alphanum) {	/* we have a character
176 							 * or number */
177 	register char *j;	/* used for searching thru list of
178 				 * reserved words */
179 	register struct templ *p;
180 	register int c;
181 
182 	do {			/* copy it over */
183 	    *tok++ = *buf_ptr++;
184 	    if (buf_ptr >= buf_end)
185 		fill_buffer();
186 	} while (chartype[c = *buf_ptr & 0177] == alphanum ||
187 		isdigit(token[0]) && (c == '+' || c == '-') &&
188 		(tok[-1] == 'e' || tok[-1] == 'E'));
189 	*tok++ = '\0';
190 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
191 	    if (++buf_ptr >= buf_end)
192 		fill_buffer();
193 	}
194 	ps.its_a_keyword = false;
195 	ps.sizeof_keyword = false;
196 	if (l_struct) {		/* if last token was 'struct', then this
197 				 * token should be treated as a
198 				 * declaration */
199 	    l_struct = false;
200 	    last_code = ident;
201 	    ps.last_u_d = true;
202 	    return (decl);
203 	}
204 	ps.last_u_d = false;	/* Operator after indentifier is binary */
205 	last_code = ident;	/* Remember that this is the code we will
206 				 * return */
207 
208 	/*
209 	 * This loop will check if the token is a keyword.
210 	 */
211 	for (p = specials; (j = p->rwd) != 0; p++) {
212 	    tok = token;	/* point at scanned token */
213 	    if (*j++ != *tok++ || *j++ != *tok++)
214 		continue;	/* This test depends on the fact that
215 				 * identifiers are always at least 1
216 				 * character long (ie. the first two bytes
217 				 * of the identifier are always
218 				 * meaningful) */
219 	    if (tok[-1] == 0)
220 		break;		/* If its a one-character identifier */
221 	    while (*tok++ == *j)
222 		if (*j++ == 0)
223 		    goto found_keyword;	/* I wish that C had a multi-level
224 					 * break... */
225 	}
226 	if (p->rwd) {		/* we have a keyword */
227     found_keyword:
228 	    ps.its_a_keyword = true;
229 	    ps.last_u_d = true;
230 	    switch (p->rwcode) {
231 		case 1:	/* it is a switch */
232 		    return (swstmt);
233 		case 2:	/* a case or default */
234 		    return (casestmt);
235 
236 		case 3:	/* a "struct" */
237 		    if (ps.p_l_follow)
238 			break;	/* inside parens: cast */
239 		    l_struct = true;
240 
241 		    /*
242 		     * Next time around, we will want to know that we have
243 		     * had a 'struct'
244 		     */
245 		case 4:	/* one of the declaration keywords */
246 		    if (ps.p_l_follow) {
247 			ps.cast_mask |= 1 << ps.p_l_follow;
248 			break;	/* inside parens: cast */
249 		    }
250 		    last_code = decl;
251 		    return (decl);
252 
253 		case 5:	/* if, while, for */
254 		    return (sp_paren);
255 
256 		case 6:	/* do, else */
257 		    return (sp_nparen);
258 
259 		case 7:
260 		    ps.sizeof_keyword = true;
261 		default:	/* all others are treated like any other
262 				 * identifier */
263 		    return (ident);
264 	    }			/* end of switch */
265 	}			/* end of if (found_it) */
266 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0
267 	    && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) {
268 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
269 	    ps.in_parameter_declaration = 1;
270 	}
271 
272 	/*
273 	 * The following hack attempts to guess whether or not the current
274 	 * token is in fact a declaration keyword -- one that has been
275 	 * typedefd
276 	 */
277 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr))
278 	    && !ps.p_l_follow
279 	    && (ps.last_token == rparen || ps.last_token == semicolon ||
280 		ps.last_token == decl ||
281 		ps.last_token == lbrace || ps.last_token == rbrace)) {
282 	    ps.its_a_keyword = true;
283 	    ps.last_u_d = true;
284 	    last_code = decl;
285 	    return decl;
286 	}
287 	if (last_code == decl)	/* if this is a declared variable, then
288 				 * following sign is unary */
289 	    ps.last_u_d = true;	/* will make "int a -1" work */
290 	last_code = ident;
291 	return (ident);		/* the ident is not in the list */
292     }				/* end of procesing for alpanum character */
293     /* Scan a non-alphanumeric token */
294 
295     *tok++ = *buf_ptr;		/* if it is only a one-character token, it
296 				 * is moved here */
297     *tok = '\0';
298     if (++buf_ptr >= buf_end)
299 	fill_buffer();
300 
301     switch (*token) {
302 	case '\n':
303 	    unary_delim = ps.last_u_d;
304 	    ps.last_nl = true;	/* remember that we just had a newline */
305 	    code = (had_eof ? 0 : newline);
306 
307 	    /*
308 	     * if data has been exausted, the newline is a dummy, and we
309 	     * should return code to stop
310 	     */
311 	    break;
312 
313 	case '\'':		/* start of quoted character */
314 	case '"':		/* start of string */
315 	    qchar = *token;
316 	    if (troff) {
317 		tok[-1] = '`';
318 		if (qchar == '"')
319 		    *tok++ = '`';
320 		*tok++ = BACKSLASH;
321 		*tok++ = 'f';
322 		*tok++ = 'L';
323 	    }
324 	    do {		/* copy the string */
325 		while (1) {	/* move one character or [/<char>]<char> */
326 		    if (*buf_ptr == '\n') {
327 			printf("%d: Unterminated literal\n", line_no);
328 			goto stop_lit;
329 		    }
330 		    *tok = *buf_ptr++;
331 		    if (buf_ptr >= buf_end)
332 			fill_buffer();
333 		    if (had_eof || ((tok - token) > (bufsize - 2))) {
334 			printf("Unterminated literal\n");
335 			++tok;
336 			goto stop_lit;
337 			/* get outof literal copying loop */
338 		    }
339 		    if (*tok == BACKSLASH) {	/* if escape, copy extra
340 						 * char */
341 			if (*buf_ptr == '\n')	/* check for escaped
342 						 * newline */
343 			    ++line_no;
344 			if (troff) {
345 			    *++tok = BACKSLASH;
346 			    if (*buf_ptr == BACKSLASH)
347 				*++tok = BACKSLASH;
348 			}
349 			*++tok = *buf_ptr++;
350 			++tok;	/* we must increment this again because we
351 				 * copied two chars */
352 			if (buf_ptr >= buf_end)
353 			    fill_buffer();
354 		    }
355 		    else
356 			break;	/* we copied one character */
357 		}		/* end of while (1) */
358 	    } while (*tok++ != qchar);
359 	    if (troff) {
360 		tok[-1] = BACKSLASH;
361 		*tok++ = 'f';
362 		*tok++ = 'R';
363 		*tok++ = '\'';
364 		if (qchar == '"')
365 		    *tok++ = '\'';
366 	    }
367     stop_lit:
368 	    code = ident;
369 	    break;
370 
371 	case ('('):
372 	case ('['):
373 	    unary_delim = true;
374 	    code = lparen;
375 	    break;
376 
377 	case (')'):
378 	case (']'):
379 	    code = rparen;
380 	    break;
381 
382 	case '#':
383 	    unary_delim = ps.last_u_d;
384 	    code = preesc;
385 	    break;
386 
387 	case '?':
388 	    unary_delim = true;
389 	    code = question;
390 	    break;
391 
392 	case (':'):
393 	    code = colon;
394 	    unary_delim = true;
395 	    break;
396 
397 	case (';'):
398 	    unary_delim = true;
399 	    code = semicolon;
400 	    break;
401 
402 	case ('{'):
403 	    unary_delim = true;
404 
405 	    /*
406 	     * if (ps.in_or_st) ps.block_init = 1;
407 	     */
408 	    code = ps.block_init ? lparen : lbrace;
409 	    break;
410 
411 	case ('}'):
412 	    unary_delim = true;
413 	    code = ps.block_init ? rparen : rbrace;
414 	    break;
415 
416 	case 014:		/* a form feed */
417 	    unary_delim = ps.last_u_d;
418 	    ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
419 				 * right */
420 	    code = form_feed;
421 	    break;
422 
423 	case (','):
424 	    unary_delim = true;
425 	    code = comma;
426 	    break;
427 
428 	case '.':
429 	    unary_delim = false;
430 	    code = period;
431 	    break;
432 
433 	case '-':
434 	case '+':		/* check for -, +, --, ++ */
435 	    code = (ps.last_u_d ? unary_op : binary_op);
436 	    unary_delim = true;
437 
438 	    if (*buf_ptr == token[0]) {
439 		/* check for doubled character */
440 		*tok++ = *buf_ptr++;
441 		/* buffer overflow will be checked at end of loop */
442 		if (last_code == ident || last_code == rparen) {
443 		    code = (ps.last_u_d ? unary_op : postop);
444 		    /* check for following ++ or -- */
445 		    unary_delim = false;
446 		}
447 	    }
448 	    else if (*buf_ptr == '=')
449 		/* check for operator += */
450 		*tok++ = *buf_ptr++;
451 	    else if (token[0] == '-' && *buf_ptr == '>') {
452 		/* check for operator -> */
453 		*tok++ = *buf_ptr++;
454 		if (!pointer_as_binop) {
455 		    code = unary_op;
456 		    unary_delim = false;
457 		    ps.want_blank = false;
458 		}
459 	    }
460 	    /* buffer overflow will be checked at end of switch */
461 
462 	    break;
463 
464 	case '=':
465 	    if (ps.in_or_st)
466 		ps.block_init = 1;
467 	    if (chartype[*buf_ptr] == opchar) {	/* we have two char
468 						 * assignment */
469 		tok[-1] = *buf_ptr++;
470 		if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr)
471 		    *tok++ = *buf_ptr++;
472 		*tok++ = '=';	/* Flip =+ to += */
473 		*tok = 0;
474 	    }
475 	    code = binary_op;
476 	    unary_delim = true;
477 	    break;
478 	    /* can drop thru!!! */
479 
480 	case '>':
481 	case '<':
482 	case '!':		/* ops like <, <<, <=, !=, etc */
483 	    if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
484 		*tok++ = *buf_ptr;
485 		if (++buf_ptr >= buf_end)
486 		    fill_buffer();
487 	    }
488 	    if (*buf_ptr == '=')
489 		*tok++ = *buf_ptr++;
490 	    code = (ps.last_u_d ? unary_op : binary_op);
491 	    unary_delim = true;
492 	    break;
493 
494 	default:
495 	    if (token[0] == '/' && *buf_ptr == '*') {
496 		/* it is start of comment */
497 		*tok++ = '*';
498 
499 		if (++buf_ptr >= buf_end)
500 		    fill_buffer();
501 
502 		code = comment;
503 		unary_delim = ps.last_u_d;
504 		break;
505 	    }
506 	    while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') {
507 		/* handle ||, &&, etc, and also things as in int *****i */
508 		*tok++ = *buf_ptr;
509 		if (++buf_ptr >= buf_end)
510 		    fill_buffer();
511 	    }
512 	    code = (ps.last_u_d ? unary_op : binary_op);
513 	    unary_delim = true;
514 
515 
516     }				/* end of switch */
517     if (code != newline) {
518 	l_struct = false;
519 	last_code = code;
520     }
521     if (buf_ptr >= buf_end)	/* check for input buffer empty */
522 	fill_buffer();
523     ps.last_u_d = unary_delim;
524     *tok = '\0';		/* null terminate the token */
525     return (code);
526 };
527 
528 /* Add the given keyword to the keyword table, using val as the keyword type
529    */
530 addkey (key, val)
531 char       *key;
532 {
533     register struct templ *p = specials;
534     while (p->rwd)
535 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
536 	    return;
537 	else
538 	    p++;
539     if (p >= specials + sizeof specials / sizeof specials[0])
540 	return;			/* For now, table overflows are silently
541 				   ignored */
542     p->rwd = key;
543     p->rwcode = val;
544     p[1].rwd = 0;
545     p[1].rwcode = 0;
546     return;
547 }
548