xref: /csrg-svn/usr.bin/indent/lexi.c (revision 40275)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980 The Regents of the University of California.
4  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms are permitted
8  * provided that the above copyright notice and this paragraph are
9  * duplicated in all such forms and that any documentation,
10  * advertising materials, and other materials related to such
11  * distribution and use acknowledge that the software was developed
12  * by the University of California, Berkeley, the University of Illinois,
13  * Urbana, and Sun Microsystems, Inc.  The name of either University
14  * or Sun Microsystems may not be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19  */
20 
21 #ifndef lint
22 static char sccsid[] = "@(#)lexi.c	5.14 (Berkeley) 03/05/90";
23 #endif /* not lint */
24 
25 /*
26  * Here we have the token scanner for indent.  It scans off one token and puts
27  * it in the global variable "token".  It returns a code, indicating the type
28  * of token scanned.
29  */
30 
31 #include "indent_globs.h"
32 #include "indent_codes.h"
33 #include <ctype.h>
34 
35 #define alphanum 1
36 #define opchar 3
37 
/*
 * One entry of the reserved-word table.
 *
 * rwd    - the keyword text (NUL-terminated); a null pointer marks the end
 *          of the table.
 * rwcode - the keyword class, as dispatched on by lexi():
 *            0  no special handling (returned as an ordinary identifier)
 *            1  "switch"
 *            2  "case" / "default"
 *            3  "struct" / "union" / "enum"
 *            4  declaration keywords
 *            5  "if" / "while" / "for"
 *            6  "else" / "do"
 *            7  "sizeof"
 */
struct templ {
    char       *rwd;
    int         rwcode;
};

/*
 * The reserved-word table.  It is terminated by an entry whose rwd is null;
 * the unused tail of the array is room for keywords added at run time by
 * addkey().
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword could never match this entry */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
75 
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * says what kind of token the character can belong to:
 *   0  neither alphanumeric nor operator
 *   1  alphanumeric (the value of the "alphanum" macro)
 *   3  operator character (the value of the "opchar" macro)
 * Rows below hold sixteen codes each.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
97 
98 
99 
100 
101 int
102 lexi()
103 {
104     int         unary_delim;	/* this is set to 1 if the current token
105 				 *
106 				 * forces a following operator to be unary */
107     static int  last_code;	/* the last token type returned */
108     static int  l_struct;	/* set to 1 if the last token was 'struct' */
109     int         code;		/* internal code to be returned */
110     char        qchar;		/* the delimiter character for a string */
111 
112     e_token = s_token;		/* point to start of place to save token */
113     unary_delim = false;
114     ps.col_1 = ps.last_nl;	/* tell world that this token started in
115 				 * column 1 iff the last thing scanned was nl */
116     ps.last_nl = false;
117 
118     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
119 	ps.col_1 = false;	/* leading blanks imply token is not in column
120 				 * 1 */
121 	if (++buf_ptr >= buf_end)
122 	    fill_buffer();
123     }
124 
125     /* Scan an alphanumeric token */
126     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
127 	/*
128 	 * we have a character or number
129 	 */
130 	register char *j;	/* used for searching thru list of
131 				 *
132 				 * reserved words */
133 	register struct templ *p;
134 
135 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
136 	    int         seendot = 0,
137 	                seenexp = 0;
138 	    if (*buf_ptr == '0' &&
139 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
140 		*e_token++ = *buf_ptr++;
141 		*e_token++ = *buf_ptr++;
142 		while (isxdigit(*buf_ptr)) {
143 		    CHECK_SIZE_TOKEN;
144 		    *e_token++ = *buf_ptr++;
145 		}
146 	    }
147 	    else
148 		while (1) {
149 		    if (*buf_ptr == '.')
150 			if (seendot)
151 			    break;
152 			else
153 			    seendot++;
154 		    CHECK_SIZE_TOKEN;
155 		    *e_token++ = *buf_ptr++;
156 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
157 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
158 			    break;
159 			else {
160 			    seenexp++;
161 			    seendot++;
162 			    CHECK_SIZE_TOKEN;
163 			    *e_token++ = *buf_ptr++;
164 			    if (*buf_ptr == '+' || *buf_ptr == '-')
165 				*e_token++ = *buf_ptr++;
166 			}
167 		}
168 	    if (*buf_ptr == 'L' || *buf_ptr == 'l')
169 		*e_token++ = *buf_ptr++;
170 	}
171 	else
172 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
173 		CHECK_SIZE_TOKEN;
174 		*e_token++ = *buf_ptr++;
175 		if (buf_ptr >= buf_end)
176 		    fill_buffer();
177 	    }
178 	*e_token++ = '\0';
179 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
180 	    if (++buf_ptr >= buf_end)
181 		fill_buffer();
182 	}
183 	ps.its_a_keyword = false;
184 	ps.sizeof_keyword = false;
185 	if (l_struct) {		/* if last token was 'struct', then this token
186 				 * should be treated as a declaration */
187 	    l_struct = false;
188 	    last_code = ident;
189 	    ps.last_u_d = true;
190 	    return (decl);
191 	}
192 	ps.last_u_d = false;	/* Operator after indentifier is binary */
193 	last_code = ident;	/* Remember that this is the code we will
194 				 * return */
195 
196 	/*
197 	 * This loop will check if the token is a keyword.
198 	 */
199 	for (p = specials; (j = p->rwd) != 0; p++) {
200 	    register char *p = s_token;	/* point at scanned token */
201 	    if (*j++ != *p++ || *j++ != *p++)
202 		continue;	/* This test depends on the fact that
203 				 * identifiers are always at least 1 character
204 				 * long (ie. the first two bytes of the
205 				 * identifier are always meaningful) */
206 	    if (p[-1] == 0)
207 		break;		/* If its a one-character identifier */
208 	    while (*p++ == *j)
209 		if (*j++ == 0)
210 		    goto found_keyword;	/* I wish that C had a multi-level
211 					 * break... */
212 	}
213 	if (p->rwd) {		/* we have a keyword */
214     found_keyword:
215 	    ps.its_a_keyword = true;
216 	    ps.last_u_d = true;
217 	    switch (p->rwcode) {
218 	    case 1:		/* it is a switch */
219 		return (swstmt);
220 	    case 2:		/* a case or default */
221 		return (casestmt);
222 
223 	    case 3:		/* a "struct" */
224 		if (ps.p_l_follow)
225 		    break;	/* inside parens: cast */
226 		l_struct = true;
227 
228 		/*
229 		 * Next time around, we will want to know that we have had a
230 		 * 'struct'
231 		 */
232 	    case 4:		/* one of the declaration keywords */
233 		if (ps.p_l_follow) {
234 		    ps.cast_mask |= 1 << ps.p_l_follow;
235 		    break;	/* inside parens: cast */
236 		}
237 		last_code = decl;
238 		return (decl);
239 
240 	    case 5:		/* if, while, for */
241 		return (sp_paren);
242 
243 	    case 6:		/* do, else */
244 		return (sp_nparen);
245 
246 	    case 7:
247 		ps.sizeof_keyword = true;
248 	    default:		/* all others are treated like any other
249 				 * identifier */
250 		return (ident);
251 	    }			/* end of switch */
252 	}			/* end of if (found_it) */
253 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
254 	    register char *tp = buf_ptr;
255 	    while (tp < buf_end)
256 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
257 		    goto not_proc;
258 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
259 	    ps.in_parameter_declaration = 1;
260 	    rparen_count = 1;
261     not_proc:;
262 	}
263 	/*
264 	 * The following hack attempts to guess whether or not the current
265 	 * token is in fact a declaration keyword -- one that has been
266 	 * typedefd
267 	 */
268 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
269 		&& !ps.p_l_follow
270 	        && !ps.block_init
271 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
272 		    ps.last_token == decl ||
273 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
274 	    ps.its_a_keyword = true;
275 	    ps.last_u_d = true;
276 	    last_code = decl;
277 	    return decl;
278 	}
279 	if (last_code == decl)	/* if this is a declared variable, then
280 				 * following sign is unary */
281 	    ps.last_u_d = true;	/* will make "int a -1" work */
282 	last_code = ident;
283 	return (ident);		/* the ident is not in the list */
284     }				/* end of procesing for alpanum character */
285 
286     /* Scan a non-alphanumeric token */
287 
288     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
289 				 * moved here */
290     *e_token = '\0';
291     if (++buf_ptr >= buf_end)
292 	fill_buffer();
293 
294     switch (*token) {
295     case '\n':
296 	unary_delim = ps.last_u_d;
297 	ps.last_nl = true;	/* remember that we just had a newline */
298 	code = (had_eof ? 0 : newline);
299 
300 	/*
301 	 * if data has been exausted, the newline is a dummy, and we should
302 	 * return code to stop
303 	 */
304 	break;
305 
306     case '\'':			/* start of quoted character */
307     case '"':			/* start of string */
308 	qchar = *token;
309 	if (troff) {
310 	    e_token[-1] = '`';
311 	    if (qchar == '"')
312 		*e_token++ = '`';
313 	    e_token = chfont(&bodyf, &stringf, e_token);
314 	}
315 	do {			/* copy the string */
316 	    while (1) {		/* move one character or [/<char>]<char> */
317 		if (*buf_ptr == '\n') {
318 		    printf("%d: Unterminated literal\n", line_no);
319 		    goto stop_lit;
320 		}
321 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
322 					 * since CHECK_SIZE guarantees that there
323 					 * are at least 5 entries left */
324 		*e_token = *buf_ptr++;
325 		if (buf_ptr >= buf_end)
326 		    fill_buffer();
327 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
328 		    if (*buf_ptr == '\n')	/* check for escaped newline */
329 			++line_no;
330 		    if (troff) {
331 			*++e_token = BACKSLASH;
332 			if (*buf_ptr == BACKSLASH)
333 			    *++e_token = BACKSLASH;
334 		    }
335 		    *++e_token = *buf_ptr++;
336 		    ++e_token;	/* we must increment this again because we
337 				 * copied two chars */
338 		    if (buf_ptr >= buf_end)
339 			fill_buffer();
340 		}
341 		else
342 		    break;	/* we copied one character */
343 	    }			/* end of while (1) */
344 	} while (*e_token++ != qchar);
345 	if (troff) {
346 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
347 	    if (qchar == '"')
348 		*e_token++ = '\'';
349 	}
350 stop_lit:
351 	code = ident;
352 	break;
353 
354     case ('('):
355     case ('['):
356 	unary_delim = true;
357 	code = lparen;
358 	break;
359 
360     case (')'):
361     case (']'):
362 	code = rparen;
363 	break;
364 
365     case '#':
366 	unary_delim = ps.last_u_d;
367 	code = preesc;
368 	break;
369 
370     case '?':
371 	unary_delim = true;
372 	code = question;
373 	break;
374 
375     case (':'):
376 	code = colon;
377 	unary_delim = true;
378 	break;
379 
380     case (';'):
381 	unary_delim = true;
382 	code = semicolon;
383 	break;
384 
385     case ('{'):
386 	unary_delim = true;
387 
388 	/*
389 	 * if (ps.in_or_st) ps.block_init = 1;
390 	 */
391 	/* ?	code = ps.block_init ? lparen : lbrace; */
392 	code = lbrace;
393 	break;
394 
395     case ('}'):
396 	unary_delim = true;
397 	/* ?	code = ps.block_init ? rparen : rbrace; */
398 	code = rbrace;
399 	break;
400 
401     case 014:			/* a form feed */
402 	unary_delim = ps.last_u_d;
403 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
404 				 * right */
405 	code = form_feed;
406 	break;
407 
408     case (','):
409 	unary_delim = true;
410 	code = comma;
411 	break;
412 
413     case '.':
414 	unary_delim = false;
415 	code = period;
416 	break;
417 
418     case '-':
419     case '+':			/* check for -, +, --, ++ */
420 	code = (ps.last_u_d ? unary_op : binary_op);
421 	unary_delim = true;
422 
423 	if (*buf_ptr == token[0]) {
424 	    /* check for doubled character */
425 	    *e_token++ = *buf_ptr++;
426 	    /* buffer overflow will be checked at end of loop */
427 	    if (last_code == ident || last_code == rparen) {
428 		code = (ps.last_u_d ? unary_op : postop);
429 		/* check for following ++ or -- */
430 		unary_delim = false;
431 	    }
432 	}
433 	else if (*buf_ptr == '=')
434 	    /* check for operator += */
435 	    *e_token++ = *buf_ptr++;
436 	else if (*buf_ptr == '>') {
437 	    /* check for operator -> */
438 	    *e_token++ = *buf_ptr++;
439 	    if (!pointer_as_binop) {
440 		unary_delim = false;
441 		code = unary_op;
442 		ps.want_blank = false;
443 	    }
444 	}
445 	break;			/* buffer overflow will be checked at end of
446 				 * switch */
447 
448     case '=':
449 	if (ps.in_or_st)
450 	    ps.block_init = 1;
451 #ifdef undef
452 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
453 	    e_token[-1] = *buf_ptr++;
454 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
455 		*e_token++ = *buf_ptr++;
456 	    *e_token++ = '=';	/* Flip =+ to += */
457 	    *e_token = 0;
458 	}
459 #else
460 	if (*buf_ptr == '=') {/* == */
461 	    *e_token++ = '=';	/* Flip =+ to += */
462 	    buf_ptr++;
463 	    *e_token = 0;
464 	}
465 #endif
466 	code = binary_op;
467 	unary_delim = true;
468 	break;
469 	/* can drop thru!!! */
470 
471     case '>':
472     case '<':
473     case '!':			/* ops like <, <<, <=, !=, etc */
474 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
475 	    *e_token++ = *buf_ptr;
476 	    if (++buf_ptr >= buf_end)
477 		fill_buffer();
478 	}
479 	if (*buf_ptr == '=')
480 	    *e_token++ = *buf_ptr++;
481 	code = (ps.last_u_d ? unary_op : binary_op);
482 	unary_delim = true;
483 	break;
484 
485     default:
486 	if (token[0] == '/' && *buf_ptr == '*') {
487 	    /* it is start of comment */
488 	    *e_token++ = '*';
489 
490 	    if (++buf_ptr >= buf_end)
491 		fill_buffer();
492 
493 	    code = comment;
494 	    unary_delim = ps.last_u_d;
495 	    break;
496 	}
497 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
498 	    /*
499 	     * handle ||, &&, etc, and also things as in int *****i
500 	     */
501 	    *e_token++ = *buf_ptr;
502 	    if (++buf_ptr >= buf_end)
503 		fill_buffer();
504 	}
505 	code = (ps.last_u_d ? unary_op : binary_op);
506 	unary_delim = true;
507 
508 
509     }				/* end of switch */
510     if (code != newline) {
511 	l_struct = false;
512 	last_code = code;
513     }
514     if (buf_ptr >= buf_end)	/* check for input buffer empty */
515 	fill_buffer();
516     ps.last_u_d = unary_delim;
517     *e_token = '\0';		/* null terminate the token */
518     return (code);
519 }
520 
521 /*
522  * Add the given keyword to the keyword table, using val as the keyword type
523  */
524 addkey(key, val)
525     char       *key;
526 {
527     register struct templ *p = specials;
528     while (p->rwd)
529 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
530 	    return;
531 	else
532 	    p++;
533     if (p >= specials + sizeof specials / sizeof specials[0])
534 	return;			/* For now, table overflows are silently
535 				 * ignored */
536     p->rwd = key;
537     p->rwcode = val;
538     p[1].rwd = 0;
539     p[1].rwcode = 0;
540     return;
541 }
542