xref: /openbsd-src/usr.bin/indent/lexi.c (revision b2ea75c1b17e1a9a339660e7ed45cd24946b230e)
1 /*	$OpenBSD: lexi.c,v 1.8 2001/06/25 04:58:31 pjanzen Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.
6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7  * Copyright (c) 1985 Sun Microsystems, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #ifndef lint
40 /*static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";*/
41 static char rcsid[] = "$OpenBSD: lexi.c,v 1.8 2001/06/25 04:58:31 pjanzen Exp $";
42 #endif /* not lint */
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <stdio.h>
51 #include <ctype.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <err.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 
/*
 * Character classes stored in chartype[] and tested by lexi().
 */
enum {
    alphanum = 1,		/* may appear in an identifier or a number */
    opchar = 3			/* may appear in an operator */
};
60 
/*
 * One entry of the reserved-word table: the spelling of a keyword and the
 * class code lexi() switches on when the scanned token matches it.
 */
struct templ {
    char       *rwd;		/* keyword spelling (NUL-terminated) */
    int         rwcode;		/* keyword class, see the table below */
};

/*
 * Built-in reserved words.  The rwcode values are interpreted by lexi():
 *	0	no special handling, returned as a plain identifier
 *	1	switch
 *	2	case / default
 *	3	struct / union / enum
 *	4	declaration keywords (types, storage classes, typedef)
 *	5	if / while / for
 *	6	else / do
 *	7	sizeof
 */
struct templ specialsinit[] = {
	{ "switch", 1 },
	{ "case", 2 },
	{ "break", 0 },
	{ "struct", 3 },
	{ "union", 3 },
	{ "enum", 3 },
	{ "default", 2 },
	{ "int", 4 },
	{ "char", 4 },
	{ "float", 4 },
	{ "double", 4 },
	{ "long", 4 },
	{ "short", 4 },
	{ "typedef", 4 },	/* was misspelled "typdef" and never matched */
	{ "unsigned", 4 },
	{ "register", 4 },
	{ "static", 4 },
	{ "global", 4 },
	{ "extern", 4 },
	{ "void", 4 },
	{ "goto", 0 },
	{ "return", 0 },
	{ "if", 5 },
	{ "while", 5 },
	{ "for", 5 },
	{ "else", 6 },
	{ "do", 6 },
	{ "sizeof", 7 },
};

/*
 * The table actually searched.  It starts out as the static table above;
 * addkey() replaces it with a heap copy the first time a keyword is added.
 */
struct templ *specials = specialsinit;
/* number of valid entries in specials[] */
int	nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* allocated capacity of specials[]; only set once addkey() has reallocated */
int	maxspecials;
100 
/*
 * Character classification table, indexed by 7-bit ASCII code.  An entry is
 * 1 (alphanum) for characters that may appear in an identifier or number,
 * 3 (opchar) for characters that may appear in an operator, and 0 for
 * everything else.
 */
char chartype[128] = {
    /* 000-007  control chars      */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 010-017  control chars      */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-027  control chars      */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 030-037  control chars      */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* space ! " # $ % & '         */ 0, 3, 0, 0, 1, 3, 3, 0,
    /* ( ) * + , - . /             */ 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0 1 2 3 4 5 6 7             */ 1, 1, 1, 1, 1, 1, 1, 1,
    /* 8 9 : ; < = > ?             */ 1, 1, 0, 0, 3, 3, 3, 3,
    /* @ A B C D E F G             */ 0, 1, 1, 1, 1, 1, 1, 1,
    /* H I J K L M N O             */ 1, 1, 1, 1, 1, 1, 1, 1,
    /* P Q R S T U V W             */ 1, 1, 1, 1, 1, 1, 1, 1,
    /* X Y Z [ \ ] ^ _             */ 1, 1, 1, 0, 0, 0, 3, 1,
    /* ` a b c d e f g             */ 0, 1, 1, 1, 1, 1, 1, 1,
    /* h i j k l m n o             */ 1, 1, 1, 1, 1, 1, 1, 1,
    /* p q r s t u v w             */ 1, 1, 1, 1, 1, 1, 1, 1,
    /* x y z { | } ~ DEL           */ 1, 1, 1, 0, 3, 0, 3, 0
};
122 
123 
124 
125 
126 int
127 lexi()
128 {
129     int         unary_delim;	/* this is set to 1 if the current token
130 				 * forces a following operator to be unary */
131     static int  last_code;	/* the last token type returned */
132     static int  l_struct;	/* set to 1 if the last token was 'struct' */
133     int         code;		/* internal code to be returned */
134     char        qchar;		/* the delimiter character for a string */
135     int		i;
136 
137     e_token = s_token;		/* point to start of place to save token */
138     unary_delim = false;
139     ps.col_1 = ps.last_nl;	/* tell world that this token started in
140 				 * column 1 iff the last thing scanned was nl */
141     ps.last_nl = false;
142 
143     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
144 	ps.col_1 = false;	/* leading blanks imply token is not in column
145 				 * 1 */
146 	if (++buf_ptr >= buf_end)
147 	    fill_buffer();
148     }
149 
150     /* Scan an alphanumeric token */
151     if (chartype[(int)*buf_ptr] == alphanum ||
152 	(buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
153 	/*
154 	 * we have a character or number
155 	 */
156 	char *j;	/* used for searching thru list of
157 			 * reserved words */
158 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
159 	    int         seendot = 0,
160 	                seenexp = 0,
161 			seensfx = 0;
162 	    if (*buf_ptr == '0' &&
163 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
164 		*e_token++ = *buf_ptr++;
165 		*e_token++ = *buf_ptr++;
166 		while (isxdigit(*buf_ptr)) {
167 		    CHECK_SIZE_TOKEN;
168 		    *e_token++ = *buf_ptr++;
169 		}
170 	    }
171 	    else
172 		while (1) {
173 		    if (*buf_ptr == '.') {
174 			if (seendot)
175 			    break;
176 			else
177 			    seendot++;
178 		    }
179 		    CHECK_SIZE_TOKEN;
180 		    *e_token++ = *buf_ptr++;
181 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
182 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
183 			    break;
184 			else {
185 			    seenexp++;
186 			    seendot++;
187 			    CHECK_SIZE_TOKEN;
188 			    *e_token++ = *buf_ptr++;
189 			    if (*buf_ptr == '+' || *buf_ptr == '-')
190 				*e_token++ = *buf_ptr++;
191 			}
192 		    }
193 		}
194 	    while (1) {
195 		if (!(seensfx & 1) &&
196 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
197 		    CHECK_SIZE_TOKEN;
198 		    *e_token++ = *buf_ptr++;
199 		    seensfx |= 1;
200 		    continue;
201 		}
202         	if (!(seensfx & 2) &&
203 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
204 		    CHECK_SIZE_TOKEN;
205 		    if (buf_ptr[1] == buf_ptr[0])
206 		        *e_token++ = *buf_ptr++;
207 		    *e_token++ = *buf_ptr++;
208 		    seensfx |= 2;
209 		    continue;
210 		}
211 		break;
212 	    }
213 	}
214 	else
215 	    while (chartype[(int)*buf_ptr] == alphanum) {	/* copy it over */
216 		CHECK_SIZE_TOKEN;
217 		*e_token++ = *buf_ptr++;
218 		if (buf_ptr >= buf_end)
219 		    fill_buffer();
220 	    }
221 	*e_token++ = '\0';
222 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
223 	    if (++buf_ptr >= buf_end)
224 		fill_buffer();
225 	}
226 	ps.its_a_keyword = false;
227 	ps.sizeof_keyword = false;
228 	if (l_struct) {		/* if last token was 'struct', then this token
229 				 * should be treated as a declaration */
230 	    l_struct = false;
231 	    last_code = ident;
232 	    ps.last_u_d = true;
233 	    return (decl);
234 	}
235 	ps.last_u_d = false;	/* Operator after indentifier is binary */
236 	last_code = ident;	/* Remember that this is the code we will
237 				 * return */
238 
239 	/*
240 	 * This loop will check if the token is a keyword.
241 	 */
242 	for (i = 0; i < nspecials; i++) {
243 	    char *p = s_token;	/* point at scanned token */
244 	    j = specials[i].rwd;
245 	    if (*j++ != *p++ || *j++ != *p++)
246 		continue;	/* This test depends on the fact that
247 				 * identifiers are always at least 1 character
248 				 * long (ie. the first two bytes of the
249 				 * identifier are always meaningful) */
250 	    if (p[-1] == 0)
251 		break;		/* If its a one-character identifier */
252 	    while (*p++ == *j)
253 		if (*j++ == 0)
254 		    goto found_keyword;	/* I wish that C had a multi-level
255 					 * break... */
256 	}
257 	if (i < nspecials) {		/* we have a keyword */
258     found_keyword:
259 	    ps.its_a_keyword = true;
260 	    ps.last_u_d = true;
261 	    switch (specials[i].rwcode) {
262 	    case 1:		/* it is a switch */
263 		return (swstmt);
264 	    case 2:		/* a case or default */
265 		return (casestmt);
266 
267 	    case 3:		/* a "struct" */
268 		if (ps.p_l_follow)
269 		    break;	/* inside parens: cast */
270 		l_struct = true;
271 
272 		/*
273 		 * Next time around, we will want to know that we have had a
274 		 * 'struct'
275 		 */
276 	    case 4:		/* one of the declaration keywords */
277 		if (ps.p_l_follow) {
278 		    ps.cast_mask |= 1 << ps.p_l_follow;
279 		    break;	/* inside parens: cast */
280 		}
281 		last_code = decl;
282 		return (decl);
283 
284 	    case 5:		/* if, while, for */
285 		return (sp_paren);
286 
287 	    case 6:		/* do, else */
288 		return (sp_nparen);
289 
290 	    case 7:
291 		ps.sizeof_keyword = true;
292 	    default:		/* all others are treated like any other
293 				 * identifier */
294 		return (ident);
295 	    }			/* end of switch */
296 	}			/* end of if (found_it) */
297 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
298 	    char *tp = buf_ptr;
299 	    while (tp < buf_end)
300 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
301 		    goto not_proc;
302 	    strlcpy(ps.procname, token, sizeof ps.procname);
303 	    ps.in_parameter_declaration = 1;
304 	    rparen_count = 1;
305     not_proc:;
306 	}
307 	/*
308 	 * The following hack attempts to guess whether or not the current
309 	 * token is in fact a declaration keyword -- one that has been
310 	 * typedefd
311 	 */
312 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
313 		&& !ps.p_l_follow
314 	        && !ps.block_init
315 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
316 		    ps.last_token == decl ||
317 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
318 	    ps.its_a_keyword = true;
319 	    ps.last_u_d = true;
320 	    last_code = decl;
321 	    return decl;
322 	}
323 	if (last_code == decl)	/* if this is a declared variable, then
324 				 * following sign is unary */
325 	    ps.last_u_d = true;	/* will make "int a -1" work */
326 	last_code = ident;
327 	return (ident);		/* the ident is not in the list */
328     }				/* end of procesing for alpanum character */
329 
330     /* Scan a non-alphanumeric token */
331 
332     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
333 				 * moved here */
334     *e_token = '\0';
335     if (++buf_ptr >= buf_end)
336 	fill_buffer();
337 
338     switch (*token) {
339     case '\n':
340 	unary_delim = ps.last_u_d;
341 	ps.last_nl = true;	/* remember that we just had a newline */
342 	code = (had_eof ? 0 : newline);
343 
344 	/*
345 	 * if data has been exausted, the newline is a dummy, and we should
346 	 * return code to stop
347 	 */
348 	break;
349 
350     case '\'':			/* start of quoted character */
351     case '"':			/* start of string */
352 	qchar = *token;
353 	if (troff) {
354 	    e_token[-1] = '`';
355 	    if (qchar == '"')
356 		*e_token++ = '`';
357 	    e_token = chfont(&bodyf, &stringf, e_token);
358 	}
359 	do {			/* copy the string */
360 	    while (1) {		/* move one character or [/<char>]<char> */
361 		if (*buf_ptr == '\n') {
362 		    printf("%d: Unterminated literal\n", line_no);
363 		    goto stop_lit;
364 		}
365 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
366 					 * since CHECK_SIZE guarantees that there
367 					 * are at least 5 entries left */
368 		*e_token = *buf_ptr++;
369 		if (buf_ptr >= buf_end)
370 		    fill_buffer();
371 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
372 		    if (*buf_ptr == '\n')	/* check for escaped newline */
373 			++line_no;
374 		    if (troff) {
375 			*++e_token = BACKSLASH;
376 			if (*buf_ptr == BACKSLASH)
377 			    *++e_token = BACKSLASH;
378 		    }
379 		    *++e_token = *buf_ptr++;
380 		    ++e_token;	/* we must increment this again because we
381 				 * copied two chars */
382 		    if (buf_ptr >= buf_end)
383 			fill_buffer();
384 		}
385 		else
386 		    break;	/* we copied one character */
387 	    }			/* end of while (1) */
388 	} while (*e_token++ != qchar);
389 	if (troff) {
390 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
391 	    if (qchar == '"')
392 		*e_token++ = '\'';
393 	}
394 stop_lit:
395 	code = ident;
396 	break;
397 
398     case ('('):
399     case ('['):
400 	unary_delim = true;
401 	code = lparen;
402 	break;
403 
404     case (')'):
405     case (']'):
406 	code = rparen;
407 	break;
408 
409     case '#':
410 	unary_delim = ps.last_u_d;
411 	code = preesc;
412 	break;
413 
414     case '?':
415 	unary_delim = true;
416 	code = question;
417 	break;
418 
419     case (':'):
420 	code = colon;
421 	unary_delim = true;
422 	break;
423 
424     case (';'):
425 	unary_delim = true;
426 	code = semicolon;
427 	break;
428 
429     case ('{'):
430 	unary_delim = true;
431 
432 	/*
433 	 * if (ps.in_or_st) ps.block_init = 1;
434 	 */
435 	/* ?	code = ps.block_init ? lparen : lbrace; */
436 	code = lbrace;
437 	break;
438 
439     case ('}'):
440 	unary_delim = true;
441 	/* ?	code = ps.block_init ? rparen : rbrace; */
442 	code = rbrace;
443 	break;
444 
445     case 014:			/* a form feed */
446 	unary_delim = ps.last_u_d;
447 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
448 				 * right */
449 	code = form_feed;
450 	break;
451 
452     case (','):
453 	unary_delim = true;
454 	code = comma;
455 	break;
456 
457     case '.':
458 	unary_delim = false;
459 	code = period;
460 	break;
461 
462     case '-':
463     case '+':			/* check for -, +, --, ++ */
464 	code = (ps.last_u_d ? unary_op : binary_op);
465 	unary_delim = true;
466 
467 	if (*buf_ptr == token[0]) {
468 	    /* check for doubled character */
469 	    *e_token++ = *buf_ptr++;
470 	    /* buffer overflow will be checked at end of loop */
471 	    if (last_code == ident || last_code == rparen) {
472 		code = (ps.last_u_d ? unary_op : postop);
473 		/* check for following ++ or -- */
474 		unary_delim = false;
475 	    }
476 	}
477 	else if (*buf_ptr == '=')
478 	    /* check for operator += */
479 	    *e_token++ = *buf_ptr++;
480 	else if (*buf_ptr == '>') {
481 	    /* check for operator -> */
482 	    *e_token++ = *buf_ptr++;
483 	    if (!pointer_as_binop) {
484 		unary_delim = false;
485 		code = unary_op;
486 		ps.want_blank = false;
487 	    }
488 	}
489 	break;			/* buffer overflow will be checked at end of
490 				 * switch */
491 
492     case '=':
493 	if (ps.in_or_st)
494 	    ps.block_init = 1;
495 #ifdef undef
496 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
497 	    e_token[-1] = *buf_ptr++;
498 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
499 		*e_token++ = *buf_ptr++;
500 	    *e_token++ = '=';	/* Flip =+ to += */
501 	    *e_token = 0;
502 	}
503 #else
504 	if (*buf_ptr == '=') {/* == */
505 	    *e_token++ = '=';	/* Flip =+ to += */
506 	    buf_ptr++;
507 	    *e_token = 0;
508 	}
509 #endif
510 	code = binary_op;
511 	unary_delim = true;
512 	break;
513 	/* can drop thru!!! */
514 
515     case '>':
516     case '<':
517     case '!':			/* ops like <, <<, <=, !=, etc */
518 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
519 	    *e_token++ = *buf_ptr;
520 	    if (++buf_ptr >= buf_end)
521 		fill_buffer();
522 	}
523 	if (*buf_ptr == '=')
524 	    *e_token++ = *buf_ptr++;
525 	code = (ps.last_u_d ? unary_op : binary_op);
526 	unary_delim = true;
527 	break;
528 
529     default:
530 	if (token[0] == '/' && *buf_ptr == '*') {
531 	    /* it is start of comment */
532 	    *e_token++ = '*';
533 
534 	    if (++buf_ptr >= buf_end)
535 		fill_buffer();
536 
537 	    code = comment;
538 	    unary_delim = ps.last_u_d;
539 	    break;
540 	}
541 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
542 	    /*
543 	     * handle ||, &&, etc, and also things as in int *****i
544 	     */
545 	    *e_token++ = *buf_ptr;
546 	    if (++buf_ptr >= buf_end)
547 		fill_buffer();
548 	}
549 	code = (ps.last_u_d ? unary_op : binary_op);
550 	unary_delim = true;
551 
552 
553     }				/* end of switch */
554     if (code != newline) {
555 	l_struct = false;
556 	last_code = code;
557     }
558     if (buf_ptr >= buf_end)	/* check for input buffer empty */
559 	fill_buffer();
560     ps.last_u_d = unary_delim;
561     *e_token = '\0';		/* null terminate the token */
562     return (code);
563 }
564 
565 /*
566  * Add the given keyword to the keyword table, using val as the keyword type
567  */
568 void
569 addkey(key, val)
570     char       *key;
571     int		val;
572 {
573     struct templ *p;
574     int i = 0;
575 
576     while (i < nspecials) {
577 	p = &specials[i];
578 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
579 	    return;
580 	else
581 	    i++;
582     }
583 
584     if (specials == specialsinit) {
585 	/*
586 	 * Whoa. Must reallocate special table.
587 	 */
588 	nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
589 	maxspecials = nspecials;
590 	maxspecials += maxspecials >> 2;
591 	specials = (struct templ *)malloc(maxspecials * sizeof specials[0]);
592 	if (specials == NULL)
593 	    err(1, NULL);
594 	memmove(specials, specialsinit, sizeof specialsinit);
595     } else if (nspecials >= maxspecials) {
596 	maxspecials += maxspecials >> 2;
597 	specials = realloc(specials, maxspecials * sizeof specials[0]);
598 	if (specials == NULL)
599 	    err(1, NULL);
600     }
601 
602     p = &specials[i];
603     p->rwd = key;
604     p->rwcode = val;
605     nspecials++;
606     return;
607 }
608