xref: /openbsd-src/usr.bin/indent/lexi.c (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1 /*	$OpenBSD: lexi.c,v 1.16 2013/11/26 13:21:17 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.
6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7  * Copyright (c) 1985 Sun Microsystems, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * Here we have the token scanner for indent.  It scans off one token and puts
37  * it in the global variable "token".  It returns a code, indicating the type
38  * of token scanned.
39  */
40 
41 #include <stdio.h>
42 #include <ctype.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <err.h>
46 #include "indent_globs.h"
47 #include "indent_codes.h"
48 
/* Character classes stored in chartype[] */
#define alphanum 1	/* letters, digits, '_' and '$': starts/continues an
			 * identifier or number */
#define opchar 3	/* characters that can form an operator */
51 
/*
 * One entry of the reserved-word table: the keyword's spelling and the class
 * code that lexi() switches on when a scanned token matches it.
 */
struct templ {
    char       *rwd;		/* keyword text */
    int         rwcode;		/* keyword class (see table below) */
};

/*
 * Built-in reserved words.  The class codes are interpreted by lexi():
 *   0  no special treatment, returned as an ordinary identifier
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum
 *   4  declaration keywords (types, storage classes, typedef)
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 */
struct templ specialsinit[] = {
	{ "switch", 1 },
	{ "case", 2 },
	{ "break", 0 },
	{ "struct", 3 },
	{ "union", 3 },
	{ "enum", 3 },
	{ "default", 2 },
	{ "int", 4 },
	{ "char", 4 },
	{ "float", 4 },
	{ "double", 4 },
	{ "long", 4 },
	{ "short", 4 },
	{ "typedef", 4 },
	{ "unsigned", 4 },
	{ "register", 4 },
	{ "static", 4 },
	{ "global", 4 },
	{ "extern", 4 },
	{ "void", 4 },
	{ "goto", 0 },
	{ "return", 0 },
	{ "if", 5 },
	{ "while", 5 },
	{ "for", 5 },
	{ "else", 6 },
	{ "do", 6 },
	{ "sizeof", 7 },
};

/*
 * The live keyword table.  It starts out aliasing specialsinit[]; addkey()
 * replaces it with a heap copy the first time a keyword is added.
 */
struct templ *specials = specialsinit;
/* number of entries currently in specials[] */
int	nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* allocated capacity of specials[] once it lives on the heap */
int	maxspecials;
91 
/*
 * Character class table, indexed by ASCII code (0..127).  Each entry is
 * 0 (neither class), 1 (alphanumeric) or 3 (operator character); the
 * scanner consults it to decide how a token starts and continues.
 */
char        chartype[128] = {
    0, 0, 0, 0, 0, 0, 0, 0,	/* 000-007  control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 010-017  control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 020-027  control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 030-037  control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* space ! " # $ % & '  */
    0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) * + , - . /      */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0 1 2 3 4 5 6 7      */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ?      */
    0, 1, 1, 1, 1, 1, 1, 1,	/* @ A B C D E F G      */
    1, 1, 1, 1, 1, 1, 1, 1,	/* H I J K L M N O      */
    1, 1, 1, 1, 1, 1, 1, 1,	/* P Q R S T U V W      */
    1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ \ ] ^ _      */
    0, 1, 1, 1, 1, 1, 1, 1,	/* ` a b c d e f g      */
    1, 1, 1, 1, 1, 1, 1, 1,	/* h i j k l m n o      */
    1, 1, 1, 1, 1, 1, 1, 1,	/* p q r s t u v w      */
    1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL    */
};
113 
114 
115 
116 
117 int
118 lexi(void)
119 {
120     int         unary_delim;	/* this is set to 1 if the current token
121 				 * forces a following operator to be unary */
122     static int  last_code;	/* the last token type returned */
123     static int  l_struct;	/* set to 1 if the last token was 'struct' */
124     int         code;		/* internal code to be returned */
125     char        qchar;		/* the delimiter character for a string */
126     int		i;
127 
128     e_token = s_token;		/* point to start of place to save token */
129     unary_delim = false;
130     ps.col_1 = ps.last_nl;	/* tell world that this token started in
131 				 * column 1 iff the last thing scanned was nl */
132     ps.last_nl = false;
133 
134     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
135 	ps.col_1 = false;	/* leading blanks imply token is not in column
136 				 * 1 */
137 	if (++buf_ptr >= buf_end)
138 	    fill_buffer();
139     }
140 
141     /* Scan an alphanumeric token */
142     if (chartype[(int)*buf_ptr] == alphanum ||
143 	(buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
144 	/*
145 	 * we have a character or number
146 	 */
147 	char *j;	/* used for searching thru list of
148 			 * reserved words */
149 	if (isdigit((unsigned char)*buf_ptr) ||
150 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
151 	    int         seendot = 0,
152 	                seenexp = 0,
153 			seensfx = 0;
154 	    if (*buf_ptr == '0' &&
155 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
156 		*e_token++ = *buf_ptr++;
157 		*e_token++ = *buf_ptr++;
158 		while (isxdigit(*buf_ptr)) {
159 		    CHECK_SIZE_TOKEN;
160 		    *e_token++ = *buf_ptr++;
161 		}
162 	    }
163 	    else
164 		while (1) {
165 		    if (*buf_ptr == '.') {
166 			if (seendot)
167 			    break;
168 			else
169 			    seendot++;
170 		    }
171 		    CHECK_SIZE_TOKEN;
172 		    *e_token++ = *buf_ptr++;
173 		    if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
174 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
175 			    break;
176 			else {
177 			    seenexp++;
178 			    seendot++;
179 			    CHECK_SIZE_TOKEN;
180 			    *e_token++ = *buf_ptr++;
181 			    if (*buf_ptr == '+' || *buf_ptr == '-')
182 				*e_token++ = *buf_ptr++;
183 			}
184 		    }
185 		}
186 	    while (1) {
187 		if (!(seensfx & 1) &&
188 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
189 		    CHECK_SIZE_TOKEN;
190 		    *e_token++ = *buf_ptr++;
191 		    seensfx |= 1;
192 		    continue;
193 		}
194         	if (!(seensfx & 2) &&
195 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
196 		    CHECK_SIZE_TOKEN;
197 		    if (buf_ptr[1] == buf_ptr[0])
198 		        *e_token++ = *buf_ptr++;
199 		    *e_token++ = *buf_ptr++;
200 		    seensfx |= 2;
201 		    continue;
202 		}
203 		break;
204 	    }
205 	}
206 	else
207 	    while (chartype[(int)*buf_ptr] == alphanum) {	/* copy it over */
208 		CHECK_SIZE_TOKEN;
209 		*e_token++ = *buf_ptr++;
210 		if (buf_ptr >= buf_end)
211 		    fill_buffer();
212 	    }
213 	*e_token++ = '\0';
214 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
215 	    if (++buf_ptr >= buf_end)
216 		fill_buffer();
217 	}
218 	ps.its_a_keyword = false;
219 	ps.sizeof_keyword = false;
220 	if (l_struct) {		/* if last token was 'struct', then this token
221 				 * should be treated as a declaration */
222 	    l_struct = false;
223 	    last_code = ident;
224 	    ps.last_u_d = true;
225 	    return (decl);
226 	}
227 	ps.last_u_d = false;	/* Operator after identifier is binary */
228 	last_code = ident;	/* Remember that this is the code we will
229 				 * return */
230 
231 	/*
232 	 * This loop will check if the token is a keyword.
233 	 */
234 	for (i = 0; i < nspecials; i++) {
235 	    char *p = s_token;	/* point at scanned token */
236 	    j = specials[i].rwd;
237 	    if (*j++ != *p++ || *j++ != *p++)
238 		continue;	/* This test depends on the fact that
239 				 * identifiers are always at least 1 character
240 				 * long (ie. the first two bytes of the
241 				 * identifier are always meaningful) */
242 	    if (p[-1] == 0)
243 		break;		/* If its a one-character identifier */
244 	    while (*p++ == *j)
245 		if (*j++ == 0)
246 		    goto found_keyword;	/* I wish that C had a multi-level
247 					 * break... */
248 	}
249 	if (i < nspecials) {		/* we have a keyword */
250     found_keyword:
251 	    ps.its_a_keyword = true;
252 	    ps.last_u_d = true;
253 	    switch (specials[i].rwcode) {
254 	    case 1:		/* it is a switch */
255 		return (swstmt);
256 	    case 2:		/* a case or default */
257 		return (casestmt);
258 
259 	    case 3:		/* a "struct" */
260 		if (ps.p_l_follow)
261 		    break;	/* inside parens: cast */
262 		l_struct = true;
263 
264 		/*
265 		 * Next time around, we will want to know that we have had a
266 		 * 'struct'
267 		 */
268 	    case 4:		/* one of the declaration keywords */
269 		if (ps.p_l_follow) {
270 		    ps.cast_mask |= 1 << ps.p_l_follow;
271 		    break;	/* inside parens: cast */
272 		}
273 		last_code = decl;
274 		return (decl);
275 
276 	    case 5:		/* if, while, for */
277 		return (sp_paren);
278 
279 	    case 6:		/* do, else */
280 		return (sp_nparen);
281 
282 	    case 7:
283 		ps.sizeof_keyword = true;
284 	    default:		/* all others are treated like any other
285 				 * identifier */
286 		return (ident);
287 	    }			/* end of switch */
288 	}			/* end of if (found_it) */
289 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
290 	    char *tp = buf_ptr;
291 	    while (tp < buf_end)
292 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
293 		    goto not_proc;
294 	    strlcpy(ps.procname, token, sizeof ps.procname);
295 	    ps.in_parameter_declaration = 1;
296 	    rparen_count = 1;
297     not_proc:;
298 	}
299 	/*
300 	 * The following hack attempts to guess whether or not the current
301 	 * token is in fact a declaration keyword -- one that has been
302 	 * typedefd
303 	 */
304 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
305 	    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
306 		&& !ps.p_l_follow
307 	        && !ps.block_init
308 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
309 		    ps.last_token == decl ||
310 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
311 	    ps.its_a_keyword = true;
312 	    ps.last_u_d = true;
313 	    last_code = decl;
314 	    return decl;
315 	}
316 	if (last_code == decl)	/* if this is a declared variable, then
317 				 * following sign is unary */
318 	    ps.last_u_d = true;	/* will make "int a -1" work */
319 	last_code = ident;
320 	return (ident);		/* the ident is not in the list */
321     }				/* end of procesing for alpanum character */
322 
323     /* Scan a non-alphanumeric token */
324 
325     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
326 				 * moved here */
327     *e_token = '\0';
328     if (++buf_ptr >= buf_end)
329 	fill_buffer();
330 
331     switch (*token) {
332     case '\n':
333 	unary_delim = ps.last_u_d;
334 	ps.last_nl = true;	/* remember that we just had a newline */
335 	code = (had_eof ? 0 : newline);
336 
337 	/*
338 	 * if data has been exausted, the newline is a dummy, and we should
339 	 * return code to stop
340 	 */
341 	break;
342 
343     case '\'':			/* start of quoted character */
344     case '"':			/* start of string */
345 	qchar = *token;
346 	if (troff) {
347 	    e_token[-1] = '`';
348 	    if (qchar == '"')
349 		*e_token++ = '`';
350 	    e_token = chfont(&bodyf, &stringf, e_token);
351 	}
352 	do {			/* copy the string */
353 	    while (1) {		/* move one character or [/<char>]<char> */
354 		if (*buf_ptr == '\n') {
355 		    printf("%d: Unterminated literal\n", line_no);
356 		    goto stop_lit;
357 		}
358 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
359 					 * since CHECK_SIZE guarantees that there
360 					 * are at least 5 entries left */
361 		*e_token = *buf_ptr++;
362 		if (buf_ptr >= buf_end)
363 		    fill_buffer();
364 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
365 		    if (*buf_ptr == '\n')	/* check for escaped newline */
366 			++line_no;
367 		    if (troff) {
368 			*++e_token = BACKSLASH;
369 			if (*buf_ptr == BACKSLASH)
370 			    *++e_token = BACKSLASH;
371 		    }
372 		    *++e_token = *buf_ptr++;
373 		    ++e_token;	/* we must increment this again because we
374 				 * copied two chars */
375 		    if (buf_ptr >= buf_end)
376 			fill_buffer();
377 		}
378 		else
379 		    break;	/* we copied one character */
380 	    }			/* end of while (1) */
381 	} while (*e_token++ != qchar);
382 	if (troff) {
383 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
384 	    if (qchar == '"')
385 		*e_token++ = '\'';
386 	}
387 stop_lit:
388 	code = ident;
389 	break;
390 
391     case ('('):
392     case ('['):
393 	unary_delim = true;
394 	code = lparen;
395 	break;
396 
397     case (')'):
398     case (']'):
399 	code = rparen;
400 	break;
401 
402     case '#':
403 	unary_delim = ps.last_u_d;
404 	code = preesc;
405 	break;
406 
407     case '?':
408 	unary_delim = true;
409 	code = question;
410 	break;
411 
412     case (':'):
413 	code = colon;
414 	unary_delim = true;
415 	break;
416 
417     case (';'):
418 	unary_delim = true;
419 	code = semicolon;
420 	break;
421 
422     case ('{'):
423 	unary_delim = true;
424 
425 	/*
426 	 * if (ps.in_or_st) ps.block_init = 1;
427 	 */
428 	/* ?	code = ps.block_init ? lparen : lbrace; */
429 	code = lbrace;
430 	break;
431 
432     case ('}'):
433 	unary_delim = true;
434 	/* ?	code = ps.block_init ? rparen : rbrace; */
435 	code = rbrace;
436 	break;
437 
438     case 014:			/* a form feed */
439 	unary_delim = ps.last_u_d;
440 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
441 				 * right */
442 	code = form_feed;
443 	break;
444 
445     case (','):
446 	unary_delim = true;
447 	code = comma;
448 	break;
449 
450     case '.':
451 	unary_delim = false;
452 	code = period;
453 	break;
454 
455     case '-':
456     case '+':			/* check for -, +, --, ++ */
457 	code = (ps.last_u_d ? unary_op : binary_op);
458 	unary_delim = true;
459 
460 	if (*buf_ptr == token[0]) {
461 	    /* check for doubled character */
462 	    *e_token++ = *buf_ptr++;
463 	    /* buffer overflow will be checked at end of loop */
464 	    if (last_code == ident || last_code == rparen) {
465 		code = (ps.last_u_d ? unary_op : postop);
466 		/* check for following ++ or -- */
467 		unary_delim = false;
468 	    }
469 	}
470 	else if (*buf_ptr == '=')
471 	    /* check for operator += */
472 	    *e_token++ = *buf_ptr++;
473 	else if (*buf_ptr == '>') {
474 	    /* check for operator -> */
475 	    *e_token++ = *buf_ptr++;
476 	    if (!pointer_as_binop) {
477 		unary_delim = false;
478 		code = unary_op;
479 		ps.want_blank = false;
480 	    }
481 	}
482 	break;			/* buffer overflow will be checked at end of
483 				 * switch */
484 
485     case '=':
486 	if (ps.in_or_st)
487 	    ps.block_init = 1;
488 #ifdef undef
489 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
490 	    e_token[-1] = *buf_ptr++;
491 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
492 		*e_token++ = *buf_ptr++;
493 	    *e_token++ = '=';	/* Flip =+ to += */
494 	    *e_token = 0;
495 	}
496 #else
497 	if (*buf_ptr == '=') {/* == */
498 	    *e_token++ = '=';	/* Flip =+ to += */
499 	    buf_ptr++;
500 	    *e_token = 0;
501 	}
502 #endif
503 	code = binary_op;
504 	unary_delim = true;
505 	break;
506 	/* can drop thru!!! */
507 
508     case '>':
509     case '<':
510     case '!':			/* ops like <, <<, <=, !=, etc */
511 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
512 	    *e_token++ = *buf_ptr;
513 	    if (++buf_ptr >= buf_end)
514 		fill_buffer();
515 	}
516 	if (*buf_ptr == '=')
517 	    *e_token++ = *buf_ptr++;
518 	code = (ps.last_u_d ? unary_op : binary_op);
519 	unary_delim = true;
520 	break;
521 
522     default:
523 	if (token[0] == '/' && *buf_ptr == '*') {
524 	    /* it is start of comment */
525 	    *e_token++ = '*';
526 
527 	    if (++buf_ptr >= buf_end)
528 		fill_buffer();
529 
530 	    code = comment;
531 	    unary_delim = ps.last_u_d;
532 	    break;
533 	}
534 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
535 	    /*
536 	     * handle ||, &&, etc, and also things as in int *****i
537 	     */
538 	    *e_token++ = *buf_ptr;
539 	    if (++buf_ptr >= buf_end)
540 		fill_buffer();
541 	}
542 	code = (ps.last_u_d ? unary_op : binary_op);
543 	unary_delim = true;
544 
545 
546     }				/* end of switch */
547     if (code != newline) {
548 	l_struct = false;
549 	last_code = code;
550     }
551     if (buf_ptr >= buf_end)	/* check for input buffer empty */
552 	fill_buffer();
553     ps.last_u_d = unary_delim;
554     *e_token = '\0';		/* null terminate the token */
555     return (code);
556 }
557 
558 /*
559  * Add the given keyword to the keyword table, using val as the keyword type
560  */
561 void
562 addkey(char *key, int val)
563 {
564     struct templ *p;
565     int i;
566 
567     for (i = 0; i < nspecials; i++) {
568 	p = &specials[i];
569 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
570 	    return;
571     }
572 
573     if (specials == specialsinit) {
574 	/*
575 	 * Whoa. Must reallocate special table.
576 	 */
577 	nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
578 	maxspecials = nspecials + (nspecials >> 2);
579 	specials = (struct templ *)calloc(maxspecials, sizeof specials[0]);
580 	if (specials == NULL)
581 	    err(1, NULL);
582 	memcpy(specials, specialsinit, sizeof specialsinit);
583     } else if (nspecials >= maxspecials) {
584 	int newspecials = maxspecials + (maxspecials >> 2);
585 	struct templ *specials2;
586 
587 	specials2 = realloc(specials, newspecials * sizeof specials[0]);
588 	if (specials2 == NULL)
589 	    err(1, NULL);
590 	specials = specials2;
591 	maxspecials = newspecials;
592     }
593 
594     p = &specials[nspecials];
595     p->rwd = key;
596     p->rwcode = val;
597     nspecials++;
598     return;
599 }
600