xref: /openbsd-src/usr.bin/indent/lexi.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: lexi.c,v 1.14 2007/11/27 16:22:14 martynas Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.
6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7  * Copyright (c) 1985 Sun Microsystems, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #ifndef lint
36 /*static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";*/
37 static char rcsid[] = "$OpenBSD: lexi.c,v 1.14 2007/11/27 16:22:14 martynas Exp $";
38 #endif /* not lint */
39 
40 /*
41  * Here we have the token scanner for indent.  It scans off one token and puts
42  * it in the global variable "token".  It returns a code, indicating the type
43  * of token scanned.
44  */
45 
46 #include <stdio.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <err.h>
51 #include "indent_globs.h"
52 #include "indent_codes.h"
53 
/*
 * Character classes stored in the chartype table: entries equal to alphanum
 * mark characters that may appear in identifiers and numbers, entries equal
 * to opchar mark characters that may appear in operators.
 */
#define alphanum 1
#define opchar 3
56 
/*
 * One entry of the reserved-word table: the spelling of the keyword and the
 * class code that lexi() switches on when a scanned token matches it.
 */
struct templ {
    char       *rwd;
    int         rwcode;
};

/*
 * Built-in reserved words.  The class codes are interpreted by lexi():
 *   0  keyword treated like an ordinary identifier
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum
 *   4  declaration keywords (types, storage classes, typedef)
 *   5  keywords followed by a parenthesized expression (if, while, for)
 *   6  keywords not followed by a parenthesized expression (else, do)
 *   7  sizeof
 */
struct templ specialsinit[] = {
	{ "switch", 1 },
	{ "case", 2 },
	{ "break", 0 },
	{ "struct", 3 },
	{ "union", 3 },
	{ "enum", 3 },
	{ "default", 2 },
	{ "int", 4 },
	{ "char", 4 },
	{ "float", 4 },
	{ "double", 4 },
	{ "long", 4 },
	{ "short", 4 },
	{ "typedef", 4 },	/* was misspelled "typdef" and never matched */
	{ "unsigned", 4 },
	{ "register", 4 },
	{ "static", 4 },
	{ "global", 4 },
	{ "extern", 4 },
	{ "void", 4 },
	{ "goto", 0 },
	{ "return", 0 },
	{ "if", 5 },
	{ "while", 5 },
	{ "for", 5 },
	{ "else", 6 },
	{ "do", 6 },
	{ "sizeof", 7 },
};

/*
 * The active keyword table.  It starts out as the static table above and is
 * replaced by a heap copy the first time addkey() has to append an entry.
 */
struct templ *specials = specialsinit;
int	nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);	/* entries in use */
int	maxspecials;	/* capacity of the heap copy; set by addkey() */
96 
/*
 * Per-character class table, indexed by 7-bit ASCII code: 1 marks characters
 * that can appear in identifiers and numbers, 3 marks characters that can
 * appear in operators, 0 marks everything else.  Each row holds 16 codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 - 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A - O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P - Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a - o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p - z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
118 
119 
120 
121 
122 int
123 lexi(void)
124 {
125     int         unary_delim;	/* this is set to 1 if the current token
126 				 * forces a following operator to be unary */
127     static int  last_code;	/* the last token type returned */
128     static int  l_struct;	/* set to 1 if the last token was 'struct' */
129     int         code;		/* internal code to be returned */
130     char        qchar;		/* the delimiter character for a string */
131     int		i;
132 
133     e_token = s_token;		/* point to start of place to save token */
134     unary_delim = false;
135     ps.col_1 = ps.last_nl;	/* tell world that this token started in
136 				 * column 1 iff the last thing scanned was nl */
137     ps.last_nl = false;
138 
139     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
140 	ps.col_1 = false;	/* leading blanks imply token is not in column
141 				 * 1 */
142 	if (++buf_ptr >= buf_end)
143 	    fill_buffer();
144     }
145 
146     /* Scan an alphanumeric token */
147     if (chartype[(int)*buf_ptr] == alphanum ||
148 	(buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149 	/*
150 	 * we have a character or number
151 	 */
152 	char *j;	/* used for searching thru list of
153 			 * reserved words */
154 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
155 	    int         seendot = 0,
156 	                seenexp = 0,
157 			seensfx = 0;
158 	    if (*buf_ptr == '0' &&
159 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
160 		*e_token++ = *buf_ptr++;
161 		*e_token++ = *buf_ptr++;
162 		while (isxdigit(*buf_ptr)) {
163 		    CHECK_SIZE_TOKEN;
164 		    *e_token++ = *buf_ptr++;
165 		}
166 	    }
167 	    else
168 		while (1) {
169 		    if (*buf_ptr == '.') {
170 			if (seendot)
171 			    break;
172 			else
173 			    seendot++;
174 		    }
175 		    CHECK_SIZE_TOKEN;
176 		    *e_token++ = *buf_ptr++;
177 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
178 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
179 			    break;
180 			else {
181 			    seenexp++;
182 			    seendot++;
183 			    CHECK_SIZE_TOKEN;
184 			    *e_token++ = *buf_ptr++;
185 			    if (*buf_ptr == '+' || *buf_ptr == '-')
186 				*e_token++ = *buf_ptr++;
187 			}
188 		    }
189 		}
190 	    while (1) {
191 		if (!(seensfx & 1) &&
192 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
193 		    CHECK_SIZE_TOKEN;
194 		    *e_token++ = *buf_ptr++;
195 		    seensfx |= 1;
196 		    continue;
197 		}
198         	if (!(seensfx & 2) &&
199 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
200 		    CHECK_SIZE_TOKEN;
201 		    if (buf_ptr[1] == buf_ptr[0])
202 		        *e_token++ = *buf_ptr++;
203 		    *e_token++ = *buf_ptr++;
204 		    seensfx |= 2;
205 		    continue;
206 		}
207 		break;
208 	    }
209 	}
210 	else
211 	    while (chartype[(int)*buf_ptr] == alphanum) {	/* copy it over */
212 		CHECK_SIZE_TOKEN;
213 		*e_token++ = *buf_ptr++;
214 		if (buf_ptr >= buf_end)
215 		    fill_buffer();
216 	    }
217 	*e_token++ = '\0';
218 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
219 	    if (++buf_ptr >= buf_end)
220 		fill_buffer();
221 	}
222 	ps.its_a_keyword = false;
223 	ps.sizeof_keyword = false;
224 	if (l_struct) {		/* if last token was 'struct', then this token
225 				 * should be treated as a declaration */
226 	    l_struct = false;
227 	    last_code = ident;
228 	    ps.last_u_d = true;
229 	    return (decl);
230 	}
231 	ps.last_u_d = false;	/* Operator after identifier is binary */
232 	last_code = ident;	/* Remember that this is the code we will
233 				 * return */
234 
235 	/*
236 	 * This loop will check if the token is a keyword.
237 	 */
238 	for (i = 0; i < nspecials; i++) {
239 	    char *p = s_token;	/* point at scanned token */
240 	    j = specials[i].rwd;
241 	    if (*j++ != *p++ || *j++ != *p++)
242 		continue;	/* This test depends on the fact that
243 				 * identifiers are always at least 1 character
244 				 * long (ie. the first two bytes of the
245 				 * identifier are always meaningful) */
246 	    if (p[-1] == 0)
247 		break;		/* If its a one-character identifier */
248 	    while (*p++ == *j)
249 		if (*j++ == 0)
250 		    goto found_keyword;	/* I wish that C had a multi-level
251 					 * break... */
252 	}
253 	if (i < nspecials) {		/* we have a keyword */
254     found_keyword:
255 	    ps.its_a_keyword = true;
256 	    ps.last_u_d = true;
257 	    switch (specials[i].rwcode) {
258 	    case 1:		/* it is a switch */
259 		return (swstmt);
260 	    case 2:		/* a case or default */
261 		return (casestmt);
262 
263 	    case 3:		/* a "struct" */
264 		if (ps.p_l_follow)
265 		    break;	/* inside parens: cast */
266 		l_struct = true;
267 
268 		/*
269 		 * Next time around, we will want to know that we have had a
270 		 * 'struct'
271 		 */
272 	    case 4:		/* one of the declaration keywords */
273 		if (ps.p_l_follow) {
274 		    ps.cast_mask |= 1 << ps.p_l_follow;
275 		    break;	/* inside parens: cast */
276 		}
277 		last_code = decl;
278 		return (decl);
279 
280 	    case 5:		/* if, while, for */
281 		return (sp_paren);
282 
283 	    case 6:		/* do, else */
284 		return (sp_nparen);
285 
286 	    case 7:
287 		ps.sizeof_keyword = true;
288 	    default:		/* all others are treated like any other
289 				 * identifier */
290 		return (ident);
291 	    }			/* end of switch */
292 	}			/* end of if (found_it) */
293 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
294 	    char *tp = buf_ptr;
295 	    while (tp < buf_end)
296 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
297 		    goto not_proc;
298 	    strlcpy(ps.procname, token, sizeof ps.procname);
299 	    ps.in_parameter_declaration = 1;
300 	    rparen_count = 1;
301     not_proc:;
302 	}
303 	/*
304 	 * The following hack attempts to guess whether or not the current
305 	 * token is in fact a declaration keyword -- one that has been
306 	 * typedefd
307 	 */
308 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
309 		&& !ps.p_l_follow
310 	        && !ps.block_init
311 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
312 		    ps.last_token == decl ||
313 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
314 	    ps.its_a_keyword = true;
315 	    ps.last_u_d = true;
316 	    last_code = decl;
317 	    return decl;
318 	}
319 	if (last_code == decl)	/* if this is a declared variable, then
320 				 * following sign is unary */
321 	    ps.last_u_d = true;	/* will make "int a -1" work */
322 	last_code = ident;
323 	return (ident);		/* the ident is not in the list */
324     }				/* end of procesing for alpanum character */
325 
326     /* Scan a non-alphanumeric token */
327 
328     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
329 				 * moved here */
330     *e_token = '\0';
331     if (++buf_ptr >= buf_end)
332 	fill_buffer();
333 
334     switch (*token) {
335     case '\n':
336 	unary_delim = ps.last_u_d;
337 	ps.last_nl = true;	/* remember that we just had a newline */
338 	code = (had_eof ? 0 : newline);
339 
340 	/*
341 	 * if data has been exausted, the newline is a dummy, and we should
342 	 * return code to stop
343 	 */
344 	break;
345 
346     case '\'':			/* start of quoted character */
347     case '"':			/* start of string */
348 	qchar = *token;
349 	if (troff) {
350 	    e_token[-1] = '`';
351 	    if (qchar == '"')
352 		*e_token++ = '`';
353 	    e_token = chfont(&bodyf, &stringf, e_token);
354 	}
355 	do {			/* copy the string */
356 	    while (1) {		/* move one character or [/<char>]<char> */
357 		if (*buf_ptr == '\n') {
358 		    printf("%d: Unterminated literal\n", line_no);
359 		    goto stop_lit;
360 		}
361 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
362 					 * since CHECK_SIZE guarantees that there
363 					 * are at least 5 entries left */
364 		*e_token = *buf_ptr++;
365 		if (buf_ptr >= buf_end)
366 		    fill_buffer();
367 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
368 		    if (*buf_ptr == '\n')	/* check for escaped newline */
369 			++line_no;
370 		    if (troff) {
371 			*++e_token = BACKSLASH;
372 			if (*buf_ptr == BACKSLASH)
373 			    *++e_token = BACKSLASH;
374 		    }
375 		    *++e_token = *buf_ptr++;
376 		    ++e_token;	/* we must increment this again because we
377 				 * copied two chars */
378 		    if (buf_ptr >= buf_end)
379 			fill_buffer();
380 		}
381 		else
382 		    break;	/* we copied one character */
383 	    }			/* end of while (1) */
384 	} while (*e_token++ != qchar);
385 	if (troff) {
386 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
387 	    if (qchar == '"')
388 		*e_token++ = '\'';
389 	}
390 stop_lit:
391 	code = ident;
392 	break;
393 
394     case ('('):
395     case ('['):
396 	unary_delim = true;
397 	code = lparen;
398 	break;
399 
400     case (')'):
401     case (']'):
402 	code = rparen;
403 	break;
404 
405     case '#':
406 	unary_delim = ps.last_u_d;
407 	code = preesc;
408 	break;
409 
410     case '?':
411 	unary_delim = true;
412 	code = question;
413 	break;
414 
415     case (':'):
416 	code = colon;
417 	unary_delim = true;
418 	break;
419 
420     case (';'):
421 	unary_delim = true;
422 	code = semicolon;
423 	break;
424 
425     case ('{'):
426 	unary_delim = true;
427 
428 	/*
429 	 * if (ps.in_or_st) ps.block_init = 1;
430 	 */
431 	/* ?	code = ps.block_init ? lparen : lbrace; */
432 	code = lbrace;
433 	break;
434 
435     case ('}'):
436 	unary_delim = true;
437 	/* ?	code = ps.block_init ? rparen : rbrace; */
438 	code = rbrace;
439 	break;
440 
441     case 014:			/* a form feed */
442 	unary_delim = ps.last_u_d;
443 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
444 				 * right */
445 	code = form_feed;
446 	break;
447 
448     case (','):
449 	unary_delim = true;
450 	code = comma;
451 	break;
452 
453     case '.':
454 	unary_delim = false;
455 	code = period;
456 	break;
457 
458     case '-':
459     case '+':			/* check for -, +, --, ++ */
460 	code = (ps.last_u_d ? unary_op : binary_op);
461 	unary_delim = true;
462 
463 	if (*buf_ptr == token[0]) {
464 	    /* check for doubled character */
465 	    *e_token++ = *buf_ptr++;
466 	    /* buffer overflow will be checked at end of loop */
467 	    if (last_code == ident || last_code == rparen) {
468 		code = (ps.last_u_d ? unary_op : postop);
469 		/* check for following ++ or -- */
470 		unary_delim = false;
471 	    }
472 	}
473 	else if (*buf_ptr == '=')
474 	    /* check for operator += */
475 	    *e_token++ = *buf_ptr++;
476 	else if (*buf_ptr == '>') {
477 	    /* check for operator -> */
478 	    *e_token++ = *buf_ptr++;
479 	    if (!pointer_as_binop) {
480 		unary_delim = false;
481 		code = unary_op;
482 		ps.want_blank = false;
483 	    }
484 	}
485 	break;			/* buffer overflow will be checked at end of
486 				 * switch */
487 
488     case '=':
489 	if (ps.in_or_st)
490 	    ps.block_init = 1;
491 #ifdef undef
492 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
493 	    e_token[-1] = *buf_ptr++;
494 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
495 		*e_token++ = *buf_ptr++;
496 	    *e_token++ = '=';	/* Flip =+ to += */
497 	    *e_token = 0;
498 	}
499 #else
500 	if (*buf_ptr == '=') {/* == */
501 	    *e_token++ = '=';	/* Flip =+ to += */
502 	    buf_ptr++;
503 	    *e_token = 0;
504 	}
505 #endif
506 	code = binary_op;
507 	unary_delim = true;
508 	break;
509 	/* can drop thru!!! */
510 
511     case '>':
512     case '<':
513     case '!':			/* ops like <, <<, <=, !=, etc */
514 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
515 	    *e_token++ = *buf_ptr;
516 	    if (++buf_ptr >= buf_end)
517 		fill_buffer();
518 	}
519 	if (*buf_ptr == '=')
520 	    *e_token++ = *buf_ptr++;
521 	code = (ps.last_u_d ? unary_op : binary_op);
522 	unary_delim = true;
523 	break;
524 
525     default:
526 	if (token[0] == '/' && *buf_ptr == '*') {
527 	    /* it is start of comment */
528 	    *e_token++ = '*';
529 
530 	    if (++buf_ptr >= buf_end)
531 		fill_buffer();
532 
533 	    code = comment;
534 	    unary_delim = ps.last_u_d;
535 	    break;
536 	}
537 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
538 	    /*
539 	     * handle ||, &&, etc, and also things as in int *****i
540 	     */
541 	    *e_token++ = *buf_ptr;
542 	    if (++buf_ptr >= buf_end)
543 		fill_buffer();
544 	}
545 	code = (ps.last_u_d ? unary_op : binary_op);
546 	unary_delim = true;
547 
548 
549     }				/* end of switch */
550     if (code != newline) {
551 	l_struct = false;
552 	last_code = code;
553     }
554     if (buf_ptr >= buf_end)	/* check for input buffer empty */
555 	fill_buffer();
556     ps.last_u_d = unary_delim;
557     *e_token = '\0';		/* null terminate the token */
558     return (code);
559 }
560 
561 /*
562  * Add the given keyword to the keyword table, using val as the keyword type
563  */
564 void
565 addkey(char *key, int val)
566 {
567     struct templ *p;
568     int i;
569 
570     for (i = 0; i < nspecials; i++) {
571 	p = &specials[i];
572 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
573 	    return;
574     }
575 
576     if (specials == specialsinit) {
577 	/*
578 	 * Whoa. Must reallocate special table.
579 	 */
580 	nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
581 	maxspecials = nspecials + (nspecials >> 2);
582 	specials = (struct templ *)calloc(maxspecials, sizeof specials[0]);
583 	if (specials == NULL)
584 	    err(1, NULL);
585 	memcpy(specials, specialsinit, sizeof specialsinit);
586     } else if (nspecials >= maxspecials) {
587 	int newspecials = maxspecials + (maxspecials >> 2);
588 	struct templ *specials2;
589 
590 	specials2 = realloc(specials, newspecials * sizeof specials[0]);
591 	if (specials2 == NULL)
592 	    err(1, NULL);
593 	specials = specials2;
594 	maxspecials = newspecials;
595     }
596 
597     p = &specials[nspecials];
598     p->rwd = key;
599     p->rwcode = val;
600     nspecials++;
601     return;
602 }
603