xref: /netbsd-src/usr.bin/indent/lexi.c (revision 23c8222edbfb0f0932d88a8351d3a0cf817dfb9e)
1 /*	$NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
34  * Copyright (c) 1985 Sun Microsystems, Inc.
35  * All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  */
65 
66 #include <sys/cdefs.h>
67 #ifndef lint
68 #if 0
69 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
70 #else
71 __RCSID("$NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $");
72 #endif
73 #endif				/* not lint */
74 
75 /*
76  * Here we have the token scanner for indent.  It scans off one token and puts
77  * it in the global variable "token".  It returns a code, indicating the type
78  * of token scanned.
79  */
80 
81 #include <stdio.h>
82 #include <ctype.h>
83 #include <stdlib.h>
84 #include <string.h>
85 #include "indent_globs.h"
86 #include "indent_codes.h"
87 
/* Character classes stored in chartype[] and tested by lexi(). */
#define alphanum 1
#define opchar 3

/*
 * One entry of the keyword table: the keyword's spelling and the class
 * code that lexi() switches on when the scanned word matches it.
 */
struct templ {
	char   *rwd;		/* keyword text; a null pointer ends the table */
	int     rwcode;		/* keyword class, see the list below */
};

/*
 * Keyword table searched by lexi() and extended at run time by addkey().
 * The rwcode values are interpreted by the switch in lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	keywords followed by a parenthesised expression (if, while, for)
 *	6	keywords not followed by one (else, do)
 *	7	sizeof
 *	0	a keyword with no special handling; returned as an identifier
 * The array is deliberately oversized so that addkey() has room to append.
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef", so never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
128 
/*
 * Character classification table, indexed by 7-bit ASCII code.  It is used
 * to decide what kind of token a character can belong to:
 *	1	the character may appear in an identifier or number
 *	3	the character may appear in an operator
 *	0	anything else
 * Each row below covers sixteen consecutive codes.
 */
char    chartype[128] =
{
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space and punctuation up to the slash */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: digits, then colon through question mark */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: at sign, then upper case A to O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: upper case P to Z, brackets, caret, underscore */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: backquote, then lower case a to o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: lower case p to z, braces, bar, tilde, DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
150 
151 
152 
153 
154 int
155 lexi(void)
156 {
157 	int     unary_delim;	/* this is set to 1 if the current token
158 				 *
159 				 * forces a following operator to be unary */
160 	static int last_code;	/* the last token type returned */
161 	static int l_struct;	/* set to 1 if the last token was 'struct' */
162 	int     code;		/* internal code to be returned */
163 	char    qchar;		/* the delimiter character for a string */
164 
165 	e_token = s_token;	/* point to start of place to save token */
166 	unary_delim = false;
167 	ps.col_1 = ps.last_nl;	/* tell world that this token started in
168 				 * column 1 iff the last thing scanned was nl */
169 	ps.last_nl = false;
170 
171 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
172 		ps.col_1 = false;	/* leading blanks imply token is not
173 					 * in column 1 */
174 		if (++buf_ptr >= buf_end)
175 			fill_buffer();
176 	}
177 
178 	/* Scan an alphanumeric token */
179 	if (chartype[(int) *buf_ptr] == alphanum ||
180 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
181 		/*
182 		 * we have a character or number
183 		 */
184 		char   *j;	/* used for searching thru list of
185 				 *
186 				 * reserved words */
187 		struct templ *p;
188 
189 		if (isdigit((unsigned char)*buf_ptr) ||
190 		    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
191 			int     seendot = 0, seenexp = 0, seensfx = 0;
192 			if (*buf_ptr == '0' &&
193 			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
194 				*e_token++ = *buf_ptr++;
195 				*e_token++ = *buf_ptr++;
196 				while (isxdigit((unsigned char)*buf_ptr)) {
197 					CHECK_SIZE_TOKEN;
198 					*e_token++ = *buf_ptr++;
199 				}
200 			} else {
201 				while (1) {
202 					if (*buf_ptr == '.') {
203 						if (seendot)
204 							break;
205 						else
206 							seendot++;
207 					}
208 					CHECK_SIZE_TOKEN;
209 					*e_token++ = *buf_ptr++;
210 					if (!isdigit((unsigned char)*buf_ptr)
211 					&& *buf_ptr != '.') {
212 						if ((*buf_ptr != 'E'
213 						&& *buf_ptr != 'e') || seenexp)
214 							break;
215 						else {
216 							seenexp++;
217 							seendot++;
218 							CHECK_SIZE_TOKEN;
219 							*e_token++ = *buf_ptr++;
220 							if (*buf_ptr == '+' || *buf_ptr == '-')
221 								*e_token++ = *buf_ptr++;
222 						}
223 					}
224 				}
225 			}
226 			if (*buf_ptr == 'F' || *buf_ptr == 'f') {
227 				/* float constant */
228 				*e_token++ = *buf_ptr++;
229 			} else {
230 				/* integer constant */
231 				while (1) {
232 					if (!(seensfx & 1) &&
233 					    (*buf_ptr == 'U' ||
234 					     *buf_ptr == 'u')) {
235 						CHECK_SIZE_TOKEN;
236 						*e_token++ = *buf_ptr++;
237 						seensfx |= 1;
238 						continue;
239 					}
240 					if (!(seensfx & 2) &&
241 					    (*buf_ptr == 'L' ||
242 					     *buf_ptr == 'l')) {
243 						CHECK_SIZE_TOKEN;
244 						if (buf_ptr[1] == buf_ptr[0])
245 							*e_token++ = *buf_ptr++;
246 						*e_token++ = *buf_ptr++;
247 						seensfx |= 2;
248 						continue;
249 					}
250 					break;
251 				}
252 			}
253 		} else
254 			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
255 				CHECK_SIZE_TOKEN;
256 				*e_token++ = *buf_ptr++;
257 				if (buf_ptr >= buf_end)
258 					fill_buffer();
259 			}
260 		*e_token++ = '\0';
261 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
262 			if (++buf_ptr >= buf_end)
263 				fill_buffer();
264 		}
265 		ps.its_a_keyword = false;
266 		ps.sizeof_keyword = false;
267 		if (l_struct) {	/* if last token was 'struct', then this token
268 				 * should be treated as a declaration */
269 			l_struct = false;
270 			last_code = ident;
271 			ps.last_u_d = true;
272 			return (decl);
273 		}
274 		ps.last_u_d = false;	/* Operator after indentifier is
275 					 * binary */
276 		last_code = ident;	/* Remember that this is the code we
277 					 * will return */
278 
279 		/*
280 		 * This loop will check if the token is a keyword.
281 		 */
282 		for (p = specials; (j = p->rwd) != 0; p++) {
283 			char   *p = s_token;	/* point at scanned token */
284 			if (*j++ != *p++ || *j++ != *p++)
285 				continue;	/* This test depends on the
286 						 * fact that identifiers are
287 						 * always at least 1 character
288 						 * long (ie. the first two
289 						 * bytes of the identifier are
290 						 * always meaningful) */
291 			if (p[-1] == 0)
292 				break;	/* If its a one-character identifier */
293 			while (*p++ == *j)
294 				if (*j++ == 0)
295 					goto found_keyword;	/* I wish that C had a
296 								 * multi-level break... */
297 		}
298 		if (p->rwd) {	/* we have a keyword */
299 	found_keyword:
300 			ps.its_a_keyword = true;
301 			ps.last_u_d = true;
302 			switch (p->rwcode) {
303 			case 1:/* it is a switch */
304 				return (swstmt);
305 			case 2:/* a case or default */
306 				return (casestmt);
307 
308 			case 3:/* a "struct" */
309 				if (ps.p_l_follow)
310 					break;	/* inside parens: cast */
311 				l_struct = true;
312 
313 				/*
314 				 * Next time around, we will want to know that we have had a
315 				 * 'struct'
316 				 */
317 			case 4:/* one of the declaration keywords */
318 				if (ps.p_l_follow) {
319 					ps.cast_mask |= 1 << ps.p_l_follow;
320 					break;	/* inside parens: cast */
321 				}
322 				last_code = decl;
323 				return (decl);
324 
325 			case 5:/* if, while, for */
326 				return (sp_paren);
327 
328 			case 6:/* do, else */
329 				return (sp_nparen);
330 
331 			case 7:
332 				ps.sizeof_keyword = true;
333 			default:	/* all others are treated like any
334 					 * other identifier */
335 				return (ident);
336 			}	/* end of switch */
337 		}		/* end of if (found_it) */
338 		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
339 			char   *tp = buf_ptr;
340 			while (tp < buf_end)
341 				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
342 					goto not_proc;
343 			strncpy(ps.procname, token, sizeof ps.procname - 1);
344 			ps.in_parameter_declaration = 1;
345 			rparen_count = 1;
346 	not_proc:	;
347 		}
348 		/*
349 		 * The following hack attempts to guess whether or not the current
350 		 * token is in fact a declaration keyword -- one that has been
351 		 * typedefd
352 		 */
353 		if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
354 		    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
355 		    && !ps.p_l_follow
356 		    && !ps.block_init
357 		    && (ps.last_token == rparen || ps.last_token == semicolon ||
358 			ps.last_token == decl ||
359 			ps.last_token == lbrace || ps.last_token == rbrace)) {
360 			ps.its_a_keyword = true;
361 			ps.last_u_d = true;
362 			last_code = decl;
363 			return decl;
364 		}
365 		if (last_code == decl)	/* if this is a declared variable,
366 					 * then following sign is unary */
367 			ps.last_u_d = true;	/* will make "int a -1" work */
368 		last_code = ident;
369 		return (ident);	/* the ident is not in the list */
370 	}			/* end of procesing for alpanum character */
371 	/* Scan a non-alphanumeric token */
372 	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
373 				 * moved here */
374 	*e_token = '\0';
375 	if (++buf_ptr >= buf_end)
376 		fill_buffer();
377 
378 	switch (*token) {
379 	case '\n':
380 		unary_delim = ps.last_u_d;
381 		ps.last_nl = true;	/* remember that we just had a newline */
382 		code = (had_eof ? 0 : newline);
383 
384 		/*
385 		 * if data has been exausted, the newline is a dummy, and we should
386 		 * return code to stop
387 		 */
388 		break;
389 
390 	case '\'':		/* start of quoted character */
391 	case '"':		/* start of string */
392 		qchar = *token;
393 		if (troff) {
394 			e_token[-1] = '`';
395 			if (qchar == '"')
396 				*e_token++ = '`';
397 			e_token = chfont(&bodyf, &stringf, e_token);
398 		}
399 		do {		/* copy the string */
400 			while (1) {	/* move one character or
401 					 * [/<char>]<char> */
402 				if (*buf_ptr == '\n') {
403 					printf("%d: Unterminated literal\n", line_no);
404 					goto stop_lit;
405 				}
406 				CHECK_SIZE_TOKEN;	/* Only have to do this
407 							 * once in this loop,
408 							 * since CHECK_SIZE
409 							 * guarantees that there
410 							 * are at least 5
411 							 * entries left */
412 				*e_token = *buf_ptr++;
413 				if (buf_ptr >= buf_end)
414 					fill_buffer();
415 				if (*e_token == BACKSLASH) {	/* if escape, copy extra
416 								 * char */
417 					if (*buf_ptr == '\n')	/* check for escaped
418 								 * newline */
419 						++line_no;
420 					if (troff) {
421 						*++e_token = BACKSLASH;
422 						if (*buf_ptr == BACKSLASH)
423 							*++e_token = BACKSLASH;
424 					}
425 					*++e_token = *buf_ptr++;
426 					++e_token;	/* we must increment
427 							 * this again because we
428 							 * copied two chars */
429 					if (buf_ptr >= buf_end)
430 						fill_buffer();
431 				} else
432 					break;	/* we copied one character */
433 			}	/* end of while (1) */
434 		} while (*e_token++ != qchar);
435 		if (troff) {
436 			e_token = chfont(&stringf, &bodyf, e_token - 1);
437 			if (qchar == '"')
438 				*e_token++ = '\'';
439 		}
440 stop_lit:
441 		code = ident;
442 		break;
443 
444 	case ('('):
445 	case ('['):
446 		unary_delim = true;
447 		code = lparen;
448 		break;
449 
450 	case (')'):
451 	case (']'):
452 		code = rparen;
453 		break;
454 
455 	case '#':
456 		unary_delim = ps.last_u_d;
457 		code = preesc;
458 		break;
459 
460 	case '?':
461 		unary_delim = true;
462 		code = question;
463 		break;
464 
465 	case (':'):
466 		code = colon;
467 		unary_delim = true;
468 		break;
469 
470 	case (';'):
471 		unary_delim = true;
472 		code = semicolon;
473 		break;
474 
475 	case ('{'):
476 		unary_delim = true;
477 
478 		/*
479 		 * if (ps.in_or_st) ps.block_init = 1;
480 		 */
481 		/* ?	code = ps.block_init ? lparen : lbrace; */
482 		code = lbrace;
483 		break;
484 
485 	case ('}'):
486 		unary_delim = true;
487 		/* ?	code = ps.block_init ? rparen : rbrace; */
488 		code = rbrace;
489 		break;
490 
491 	case 014:		/* a form feed */
492 		unary_delim = ps.last_u_d;
493 		ps.last_nl = true;	/* remember this so we can set
494 					 * 'ps.col_1' right */
495 		code = form_feed;
496 		break;
497 
498 	case (','):
499 		unary_delim = true;
500 		code = comma;
501 		break;
502 
503 	case '.':
504 		unary_delim = false;
505 		code = period;
506 		break;
507 
508 	case '-':
509 	case '+':		/* check for -, +, --, ++ */
510 		code = (ps.last_u_d ? unary_op : binary_op);
511 		unary_delim = true;
512 
513 		if (*buf_ptr == token[0]) {
514 			/* check for doubled character */
515 			*e_token++ = *buf_ptr++;
516 			/* buffer overflow will be checked at end of loop */
517 			if (last_code == ident || last_code == rparen) {
518 				code = (ps.last_u_d ? unary_op : postop);
519 				/* check for following ++ or -- */
520 				unary_delim = false;
521 			}
522 		} else
523 			if (*buf_ptr == '=')
524 				/* check for operator += */
525 				*e_token++ = *buf_ptr++;
526 			else
527 				if (*buf_ptr == '>') {
528 					/* check for operator -> */
529 					*e_token++ = *buf_ptr++;
530 					if (!pointer_as_binop) {
531 						unary_delim = false;
532 						code = unary_op;
533 						ps.want_blank = false;
534 					}
535 				}
536 		break;		/* buffer overflow will be checked at end of
537 				 * switch */
538 
539 	case '=':
540 		if (ps.in_or_st)
541 			ps.block_init = 1;
542 #ifdef undef
543 		if (chartype[*buf_ptr] == opchar) {	/* we have two char
544 							 * assignment */
545 			e_token[-1] = *buf_ptr++;
546 			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
547 				*e_token++ = *buf_ptr++;
548 			*e_token++ = '=';	/* Flip =+ to += */
549 			*e_token = 0;
550 		}
551 #else
552 		if (*buf_ptr == '=') {	/* == */
553 			*e_token++ = '=';	/* Flip =+ to += */
554 			buf_ptr++;
555 			*e_token = 0;
556 		}
557 #endif
558 		code = binary_op;
559 		unary_delim = true;
560 		break;
561 		/* can drop thru!!! */
562 
563 	case '>':
564 	case '<':
565 	case '!':		/* ops like <, <<, <=, !=, etc */
566 		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
567 			*e_token++ = *buf_ptr;
568 			if (++buf_ptr >= buf_end)
569 				fill_buffer();
570 		}
571 		if (*buf_ptr == '=')
572 			*e_token++ = *buf_ptr++;
573 		code = (ps.last_u_d ? unary_op : binary_op);
574 		unary_delim = true;
575 		break;
576 
577 	default:
578 		if (token[0] == '/' && *buf_ptr == '*') {
579 			/* it is start of comment */
580 			*e_token++ = '*';
581 
582 			if (++buf_ptr >= buf_end)
583 				fill_buffer();
584 
585 			code = comment;
586 			unary_delim = ps.last_u_d;
587 			break;
588 		}
589 		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
590 			/*
591 		         * handle ||, &&, etc, and also things as in int *****i
592 		         */
593 			*e_token++ = *buf_ptr;
594 			if (++buf_ptr >= buf_end)
595 				fill_buffer();
596 		}
597 		code = (ps.last_u_d ? unary_op : binary_op);
598 		unary_delim = true;
599 
600 
601 	}			/* end of switch */
602 	if (code != newline) {
603 		l_struct = false;
604 		last_code = code;
605 	}
606 	if (buf_ptr >= buf_end)	/* check for input buffer empty */
607 		fill_buffer();
608 	ps.last_u_d = unary_delim;
609 	*e_token = '\0';	/* null terminate the token */
610 	return (code);
611 }
612 /*
613  * Add the given keyword to the keyword table, using val as the keyword type
614  */
615 void
616 addkey(char *key, int val)
617 {
618 	struct templ *p = specials;
619 	while (p->rwd)
620 		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
621 			return;
622 		else
623 			p++;
624 	if (p >= specials + sizeof specials / sizeof specials[0])
625 		return;		/* For now, table overflows are silently
626 				 * ignored */
627 	p->rwd = key;
628 	p->rwcode = val;
629 	p[1].rwd = 0;
630 	p[1].rwcode = 0;
631 }
632