xref: /netbsd-src/usr.bin/indent/lexi.c (revision 9fbd88883c38d0c0fbfcbe66d76fe6b0fab3f9de)
1 /*	$NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7  * Copyright (c) 1985 Sun Microsystems, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 #ifndef lint
41 #if 0
42 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
43 #else
44 __RCSID("$NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $");
45 #endif
46 #endif				/* not lint */
47 
48 /*
49  * Here we have the token scanner for indent.  It scans off one token and puts
50  * it in the global variable "token".  It returns a code, indicating the type
51  * of token scanned.
52  */
53 
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include "indent_globs.h"
59 #include "indent_codes.h"
60 
/* Character classes stored in chartype[] */
#define alphanum 1		/* may appear in an identifier or number */
#define opchar 3		/* may appear in an operator */

/*
 * One entry of the reserved-word table: the spelling of the word and the
 * internal class code that lexi() switches on when the word is scanned.
 */
struct templ {
	char   *rwd;		/* keyword text; 0 marks the end of the table */
	int     rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table searched by lexi() and extended by addkey().
 *
 * rwcode values as interpreted by lexi():
 *	0  no special treatment (handled like an identifier)
 *	1  switch
 *	2  case, default
 *	3  struct, union, enum
 *	4  declaration keywords
 *	5  if, while, for
 *	6  else, do
 *	7  sizeof
 *
 * The table is terminated by an entry whose rwd is 0; the remaining slots
 * are free for addkey().
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef" and never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
101 
/*
 * Character classification table, indexed by 7-bit character code.
 * Each entry is 1 (alphanumeric: letters, digits, '_' and '$'),
 * 3 (operator character) or 0 (anything else).  The lexer uses it to
 * decide whether a character starts or continues an identifier/number.
 */
char    chartype[128] =
{
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space and punctuation, '!' through '/' */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: digits, then ':' through '?' */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: '@', then 'A' through 'O' */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: 'P' through 'Z', then '[' through '_' */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: backquote, then 'a' through 'o' */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: 'p' through 'z', then '{' through DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123 
124 
125 
126 
127 int
128 lexi()
129 {
130 	int     unary_delim;	/* this is set to 1 if the current token
131 				 *
132 				 * forces a following operator to be unary */
133 	static int last_code;	/* the last token type returned */
134 	static int l_struct;	/* set to 1 if the last token was 'struct' */
135 	int     code;		/* internal code to be returned */
136 	char    qchar;		/* the delimiter character for a string */
137 
138 	e_token = s_token;	/* point to start of place to save token */
139 	unary_delim = false;
140 	ps.col_1 = ps.last_nl;	/* tell world that this token started in
141 				 * column 1 iff the last thing scanned was nl */
142 	ps.last_nl = false;
143 
144 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
145 		ps.col_1 = false;	/* leading blanks imply token is not
146 					 * in column 1 */
147 		if (++buf_ptr >= buf_end)
148 			fill_buffer();
149 	}
150 
151 	/* Scan an alphanumeric token */
152 	if (chartype[(int) *buf_ptr] == alphanum ||
153 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
154 		/*
155 		 * we have a character or number
156 		 */
157 		char   *j;	/* used for searching thru list of
158 				 *
159 				 * reserved words */
160 		struct templ *p;
161 
162 		if (isdigit((unsigned char)*buf_ptr) ||
163 		    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
164 			int     seendot = 0, seenexp = 0;
165 			if (*buf_ptr == '0' &&
166 			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
167 				*e_token++ = *buf_ptr++;
168 				*e_token++ = *buf_ptr++;
169 				while (isxdigit((unsigned char)*buf_ptr)) {
170 					CHECK_SIZE_TOKEN;
171 					*e_token++ = *buf_ptr++;
172 				}
173 			} else {
174 				while (1) {
175 					if (*buf_ptr == '.') {
176 						if (seendot)
177 							break;
178 						else
179 							seendot++;
180 					}
181 					CHECK_SIZE_TOKEN;
182 					*e_token++ = *buf_ptr++;
183 					if (!isdigit((unsigned char)*buf_ptr)
184 					&& *buf_ptr != '.') {
185 						if ((*buf_ptr != 'E'
186 						&& *buf_ptr != 'e') || seenexp)
187 							break;
188 						else {
189 							seenexp++;
190 							seendot++;
191 							CHECK_SIZE_TOKEN;
192 							*e_token++ = *buf_ptr++;
193 							if (*buf_ptr == '+' || *buf_ptr == '-')
194 								*e_token++ = *buf_ptr++;
195 						}
196 					}
197 				}
198 			}
199 			if (*buf_ptr == 'F' || *buf_ptr == 'f') {
200 				/* float constant */
201 				*e_token++ = *buf_ptr++;
202 			} else {
203 				/* integer constant (U, L, UL, LL, ULL) */
204 				if (*buf_ptr == 'U' || *buf_ptr == 'u')
205 					*e_token++ = *buf_ptr++;
206 				if (*buf_ptr == 'L' || *buf_ptr == 'l')
207 					*e_token++ = *buf_ptr++;
208 				if (*buf_ptr == 'L' || *buf_ptr == 'l')
209 					*e_token++ = *buf_ptr++;
210 			}
211 		} else
212 			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
213 				CHECK_SIZE_TOKEN;
214 				*e_token++ = *buf_ptr++;
215 				if (buf_ptr >= buf_end)
216 					fill_buffer();
217 			}
218 		*e_token++ = '\0';
219 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
220 			if (++buf_ptr >= buf_end)
221 				fill_buffer();
222 		}
223 		ps.its_a_keyword = false;
224 		ps.sizeof_keyword = false;
225 		if (l_struct) {	/* if last token was 'struct', then this token
226 				 * should be treated as a declaration */
227 			l_struct = false;
228 			last_code = ident;
229 			ps.last_u_d = true;
230 			return (decl);
231 		}
232 		ps.last_u_d = false;	/* Operator after indentifier is
233 					 * binary */
234 		last_code = ident;	/* Remember that this is the code we
235 					 * will return */
236 
237 		/*
238 		 * This loop will check if the token is a keyword.
239 		 */
240 		for (p = specials; (j = p->rwd) != 0; p++) {
241 			char   *p = s_token;	/* point at scanned token */
242 			if (*j++ != *p++ || *j++ != *p++)
243 				continue;	/* This test depends on the
244 						 * fact that identifiers are
245 						 * always at least 1 character
246 						 * long (ie. the first two
247 						 * bytes of the identifier are
248 						 * always meaningful) */
249 			if (p[-1] == 0)
250 				break;	/* If its a one-character identifier */
251 			while (*p++ == *j)
252 				if (*j++ == 0)
253 					goto found_keyword;	/* I wish that C had a
254 								 * multi-level break... */
255 		}
256 		if (p->rwd) {	/* we have a keyword */
257 	found_keyword:
258 			ps.its_a_keyword = true;
259 			ps.last_u_d = true;
260 			switch (p->rwcode) {
261 			case 1:/* it is a switch */
262 				return (swstmt);
263 			case 2:/* a case or default */
264 				return (casestmt);
265 
266 			case 3:/* a "struct" */
267 				if (ps.p_l_follow)
268 					break;	/* inside parens: cast */
269 				l_struct = true;
270 
271 				/*
272 				 * Next time around, we will want to know that we have had a
273 				 * 'struct'
274 				 */
275 			case 4:/* one of the declaration keywords */
276 				if (ps.p_l_follow) {
277 					ps.cast_mask |= 1 << ps.p_l_follow;
278 					break;	/* inside parens: cast */
279 				}
280 				last_code = decl;
281 				return (decl);
282 
283 			case 5:/* if, while, for */
284 				return (sp_paren);
285 
286 			case 6:/* do, else */
287 				return (sp_nparen);
288 
289 			case 7:
290 				ps.sizeof_keyword = true;
291 			default:	/* all others are treated like any
292 					 * other identifier */
293 				return (ident);
294 			}	/* end of switch */
295 		}		/* end of if (found_it) */
296 		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
297 			char   *tp = buf_ptr;
298 			while (tp < buf_end)
299 				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
300 					goto not_proc;
301 			strncpy(ps.procname, token, sizeof ps.procname - 1);
302 			ps.in_parameter_declaration = 1;
303 			rparen_count = 1;
304 	not_proc:	;
305 		}
306 		/*
307 		 * The following hack attempts to guess whether or not the current
308 		 * token is in fact a declaration keyword -- one that has been
309 		 * typedefd
310 		 */
311 		if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
312 		    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
313 		    && !ps.p_l_follow
314 		    && !ps.block_init
315 		    && (ps.last_token == rparen || ps.last_token == semicolon ||
316 			ps.last_token == decl ||
317 			ps.last_token == lbrace || ps.last_token == rbrace)) {
318 			ps.its_a_keyword = true;
319 			ps.last_u_d = true;
320 			last_code = decl;
321 			return decl;
322 		}
323 		if (last_code == decl)	/* if this is a declared variable,
324 					 * then following sign is unary */
325 			ps.last_u_d = true;	/* will make "int a -1" work */
326 		last_code = ident;
327 		return (ident);	/* the ident is not in the list */
328 	}			/* end of procesing for alpanum character */
329 	/* Scan a non-alphanumeric token */
330 	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
331 				 * moved here */
332 	*e_token = '\0';
333 	if (++buf_ptr >= buf_end)
334 		fill_buffer();
335 
336 	switch (*token) {
337 	case '\n':
338 		unary_delim = ps.last_u_d;
339 		ps.last_nl = true;	/* remember that we just had a newline */
340 		code = (had_eof ? 0 : newline);
341 
342 		/*
343 		 * if data has been exausted, the newline is a dummy, and we should
344 		 * return code to stop
345 		 */
346 		break;
347 
348 	case '\'':		/* start of quoted character */
349 	case '"':		/* start of string */
350 		qchar = *token;
351 		if (troff) {
352 			e_token[-1] = '`';
353 			if (qchar == '"')
354 				*e_token++ = '`';
355 			e_token = chfont(&bodyf, &stringf, e_token);
356 		}
357 		do {		/* copy the string */
358 			while (1) {	/* move one character or
359 					 * [/<char>]<char> */
360 				if (*buf_ptr == '\n') {
361 					printf("%d: Unterminated literal\n", line_no);
362 					goto stop_lit;
363 				}
364 				CHECK_SIZE_TOKEN;	/* Only have to do this
365 							 * once in this loop,
366 							 * since CHECK_SIZE
367 							 * guarantees that there
368 							 * are at least 5
369 							 * entries left */
370 				*e_token = *buf_ptr++;
371 				if (buf_ptr >= buf_end)
372 					fill_buffer();
373 				if (*e_token == BACKSLASH) {	/* if escape, copy extra
374 								 * char */
375 					if (*buf_ptr == '\n')	/* check for escaped
376 								 * newline */
377 						++line_no;
378 					if (troff) {
379 						*++e_token = BACKSLASH;
380 						if (*buf_ptr == BACKSLASH)
381 							*++e_token = BACKSLASH;
382 					}
383 					*++e_token = *buf_ptr++;
384 					++e_token;	/* we must increment
385 							 * this again because we
386 							 * copied two chars */
387 					if (buf_ptr >= buf_end)
388 						fill_buffer();
389 				} else
390 					break;	/* we copied one character */
391 			}	/* end of while (1) */
392 		} while (*e_token++ != qchar);
393 		if (troff) {
394 			e_token = chfont(&stringf, &bodyf, e_token - 1);
395 			if (qchar == '"')
396 				*e_token++ = '\'';
397 		}
398 stop_lit:
399 		code = ident;
400 		break;
401 
402 	case ('('):
403 	case ('['):
404 		unary_delim = true;
405 		code = lparen;
406 		break;
407 
408 	case (')'):
409 	case (']'):
410 		code = rparen;
411 		break;
412 
413 	case '#':
414 		unary_delim = ps.last_u_d;
415 		code = preesc;
416 		break;
417 
418 	case '?':
419 		unary_delim = true;
420 		code = question;
421 		break;
422 
423 	case (':'):
424 		code = colon;
425 		unary_delim = true;
426 		break;
427 
428 	case (';'):
429 		unary_delim = true;
430 		code = semicolon;
431 		break;
432 
433 	case ('{'):
434 		unary_delim = true;
435 
436 		/*
437 		 * if (ps.in_or_st) ps.block_init = 1;
438 		 */
439 		/* ?	code = ps.block_init ? lparen : lbrace; */
440 		code = lbrace;
441 		break;
442 
443 	case ('}'):
444 		unary_delim = true;
445 		/* ?	code = ps.block_init ? rparen : rbrace; */
446 		code = rbrace;
447 		break;
448 
449 	case 014:		/* a form feed */
450 		unary_delim = ps.last_u_d;
451 		ps.last_nl = true;	/* remember this so we can set
452 					 * 'ps.col_1' right */
453 		code = form_feed;
454 		break;
455 
456 	case (','):
457 		unary_delim = true;
458 		code = comma;
459 		break;
460 
461 	case '.':
462 		unary_delim = false;
463 		code = period;
464 		break;
465 
466 	case '-':
467 	case '+':		/* check for -, +, --, ++ */
468 		code = (ps.last_u_d ? unary_op : binary_op);
469 		unary_delim = true;
470 
471 		if (*buf_ptr == token[0]) {
472 			/* check for doubled character */
473 			*e_token++ = *buf_ptr++;
474 			/* buffer overflow will be checked at end of loop */
475 			if (last_code == ident || last_code == rparen) {
476 				code = (ps.last_u_d ? unary_op : postop);
477 				/* check for following ++ or -- */
478 				unary_delim = false;
479 			}
480 		} else
481 			if (*buf_ptr == '=')
482 				/* check for operator += */
483 				*e_token++ = *buf_ptr++;
484 			else
485 				if (*buf_ptr == '>') {
486 					/* check for operator -> */
487 					*e_token++ = *buf_ptr++;
488 					if (!pointer_as_binop) {
489 						unary_delim = false;
490 						code = unary_op;
491 						ps.want_blank = false;
492 					}
493 				}
494 		break;		/* buffer overflow will be checked at end of
495 				 * switch */
496 
497 	case '=':
498 		if (ps.in_or_st)
499 			ps.block_init = 1;
500 #ifdef undef
501 		if (chartype[*buf_ptr] == opchar) {	/* we have two char
502 							 * assignment */
503 			e_token[-1] = *buf_ptr++;
504 			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
505 				*e_token++ = *buf_ptr++;
506 			*e_token++ = '=';	/* Flip =+ to += */
507 			*e_token = 0;
508 		}
509 #else
510 		if (*buf_ptr == '=') {	/* == */
511 			*e_token++ = '=';	/* Flip =+ to += */
512 			buf_ptr++;
513 			*e_token = 0;
514 		}
515 #endif
516 		code = binary_op;
517 		unary_delim = true;
518 		break;
519 		/* can drop thru!!! */
520 
521 	case '>':
522 	case '<':
523 	case '!':		/* ops like <, <<, <=, !=, etc */
524 		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
525 			*e_token++ = *buf_ptr;
526 			if (++buf_ptr >= buf_end)
527 				fill_buffer();
528 		}
529 		if (*buf_ptr == '=')
530 			*e_token++ = *buf_ptr++;
531 		code = (ps.last_u_d ? unary_op : binary_op);
532 		unary_delim = true;
533 		break;
534 
535 	default:
536 		if (token[0] == '/' && *buf_ptr == '*') {
537 			/* it is start of comment */
538 			*e_token++ = '*';
539 
540 			if (++buf_ptr >= buf_end)
541 				fill_buffer();
542 
543 			code = comment;
544 			unary_delim = ps.last_u_d;
545 			break;
546 		}
547 		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
548 			/*
549 		         * handle ||, &&, etc, and also things as in int *****i
550 		         */
551 			*e_token++ = *buf_ptr;
552 			if (++buf_ptr >= buf_end)
553 				fill_buffer();
554 		}
555 		code = (ps.last_u_d ? unary_op : binary_op);
556 		unary_delim = true;
557 
558 
559 	}			/* end of switch */
560 	if (code != newline) {
561 		l_struct = false;
562 		last_code = code;
563 	}
564 	if (buf_ptr >= buf_end)	/* check for input buffer empty */
565 		fill_buffer();
566 	ps.last_u_d = unary_delim;
567 	*e_token = '\0';	/* null terminate the token */
568 	return (code);
569 }
570 /*
571  * Add the given keyword to the keyword table, using val as the keyword type
572  */
573 void
574 addkey(key, val)
575 	char   *key;
576 	int     val;
577 {
578 	struct templ *p = specials;
579 	while (p->rwd)
580 		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
581 			return;
582 		else
583 			p++;
584 	if (p >= specials + sizeof specials / sizeof specials[0])
585 		return;		/* For now, table overflows are silently
586 				 * ignored */
587 	p->rwd = key;
588 	p->rwcode = val;
589 	p[1].rwd = 0;
590 	p[1].rwcode = 0;
591 }
592