xref: /netbsd-src/usr.bin/indent/lexi.c (revision ce0bb6e8d2e560ecacbe865a848624f94498063b)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980 The Regents of the University of California.
4  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #ifndef lint
37 /*static char sccsid[] = "from: @(#)lexi.c	5.16 (Berkeley) 2/26/91";*/
38 static char rcsid[] = "$Id: lexi.c,v 1.2 1993/08/01 18:14:31 mycroft Exp $";
39 #endif /* not lint */
40 
41 /*
42  * Here we have the token scanner for indent.  It scans off one token and puts
43  * it in the global variable "token".  It returns a code, indicating the type
44  * of token scanned.
45  */
46 
47 #include <stdio.h>
48 #include <ctype.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include "indent_globs.h"
52 #include "indent_codes.h"
53 
54 #define alphanum 1
55 #define opchar 3
56 
/*
 * One entry of the reserved-word table: the spelling of a keyword and the
 * class code that lexi() switches on once it has matched that spelling.
 */
struct templ {
    char       *rwd;		/* keyword text; a null pointer ends the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved words known to the scanner.  The table is terminated by an entry
 * whose rwd is null; the unused slots after it leave room for addkey() to
 * append further keywords at run time.
 *
 * rwcode values, as interpreted by the switch in lexi():
 *	0	no special treatment (the word is returned as an ident)
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
94 
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * is 1 for characters that may appear in an identifier or number, 3 for
 * characters that may appear in an operator, and 0 for everything else.
 * One row per 16 codes; the comment on each row lists the printable
 * characters it covers.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
116 
117 
118 
119 
120 int
121 lexi()
122 {
123     int         unary_delim;	/* this is set to 1 if the current token
124 				 *
125 				 * forces a following operator to be unary */
126     static int  last_code;	/* the last token type returned */
127     static int  l_struct;	/* set to 1 if the last token was 'struct' */
128     int         code;		/* internal code to be returned */
129     char        qchar;		/* the delimiter character for a string */
130 
131     e_token = s_token;		/* point to start of place to save token */
132     unary_delim = false;
133     ps.col_1 = ps.last_nl;	/* tell world that this token started in
134 				 * column 1 iff the last thing scanned was nl */
135     ps.last_nl = false;
136 
137     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
138 	ps.col_1 = false;	/* leading blanks imply token is not in column
139 				 * 1 */
140 	if (++buf_ptr >= buf_end)
141 	    fill_buffer();
142     }
143 
144     /* Scan an alphanumeric token */
145     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
146 	/*
147 	 * we have a character or number
148 	 */
149 	register char *j;	/* used for searching thru list of
150 				 *
151 				 * reserved words */
152 	register struct templ *p;
153 
154 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
155 	    int         seendot = 0,
156 	                seenexp = 0;
157 	    if (*buf_ptr == '0' &&
158 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
159 		*e_token++ = *buf_ptr++;
160 		*e_token++ = *buf_ptr++;
161 		while (isxdigit(*buf_ptr)) {
162 		    CHECK_SIZE_TOKEN;
163 		    *e_token++ = *buf_ptr++;
164 		}
165 	    }
166 	    else
167 		while (1) {
168 		    if (*buf_ptr == '.')
169 			if (seendot)
170 			    break;
171 			else
172 			    seendot++;
173 		    CHECK_SIZE_TOKEN;
174 		    *e_token++ = *buf_ptr++;
175 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
176 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
177 			    break;
178 			else {
179 			    seenexp++;
180 			    seendot++;
181 			    CHECK_SIZE_TOKEN;
182 			    *e_token++ = *buf_ptr++;
183 			    if (*buf_ptr == '+' || *buf_ptr == '-')
184 				*e_token++ = *buf_ptr++;
185 			}
186 		}
187 	    if (*buf_ptr == 'L' || *buf_ptr == 'l')
188 		*e_token++ = *buf_ptr++;
189 	}
190 	else
191 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
192 		CHECK_SIZE_TOKEN;
193 		*e_token++ = *buf_ptr++;
194 		if (buf_ptr >= buf_end)
195 		    fill_buffer();
196 	    }
197 	*e_token++ = '\0';
198 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
199 	    if (++buf_ptr >= buf_end)
200 		fill_buffer();
201 	}
202 	ps.its_a_keyword = false;
203 	ps.sizeof_keyword = false;
204 	if (l_struct) {		/* if last token was 'struct', then this token
205 				 * should be treated as a declaration */
206 	    l_struct = false;
207 	    last_code = ident;
208 	    ps.last_u_d = true;
209 	    return (decl);
210 	}
211 	ps.last_u_d = false;	/* Operator after indentifier is binary */
212 	last_code = ident;	/* Remember that this is the code we will
213 				 * return */
214 
215 	/*
216 	 * This loop will check if the token is a keyword.
217 	 */
218 	for (p = specials; (j = p->rwd) != 0; p++) {
219 	    register char *p = s_token;	/* point at scanned token */
220 	    if (*j++ != *p++ || *j++ != *p++)
221 		continue;	/* This test depends on the fact that
222 				 * identifiers are always at least 1 character
223 				 * long (ie. the first two bytes of the
224 				 * identifier are always meaningful) */
225 	    if (p[-1] == 0)
226 		break;		/* If its a one-character identifier */
227 	    while (*p++ == *j)
228 		if (*j++ == 0)
229 		    goto found_keyword;	/* I wish that C had a multi-level
230 					 * break... */
231 	}
232 	if (p->rwd) {		/* we have a keyword */
233     found_keyword:
234 	    ps.its_a_keyword = true;
235 	    ps.last_u_d = true;
236 	    switch (p->rwcode) {
237 	    case 1:		/* it is a switch */
238 		return (swstmt);
239 	    case 2:		/* a case or default */
240 		return (casestmt);
241 
242 	    case 3:		/* a "struct" */
243 		if (ps.p_l_follow)
244 		    break;	/* inside parens: cast */
245 		l_struct = true;
246 
247 		/*
248 		 * Next time around, we will want to know that we have had a
249 		 * 'struct'
250 		 */
251 	    case 4:		/* one of the declaration keywords */
252 		if (ps.p_l_follow) {
253 		    ps.cast_mask |= 1 << ps.p_l_follow;
254 		    break;	/* inside parens: cast */
255 		}
256 		last_code = decl;
257 		return (decl);
258 
259 	    case 5:		/* if, while, for */
260 		return (sp_paren);
261 
262 	    case 6:		/* do, else */
263 		return (sp_nparen);
264 
265 	    case 7:
266 		ps.sizeof_keyword = true;
267 	    default:		/* all others are treated like any other
268 				 * identifier */
269 		return (ident);
270 	    }			/* end of switch */
271 	}			/* end of if (found_it) */
272 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
273 	    register char *tp = buf_ptr;
274 	    while (tp < buf_end)
275 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
276 		    goto not_proc;
277 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
278 	    ps.in_parameter_declaration = 1;
279 	    rparen_count = 1;
280     not_proc:;
281 	}
282 	/*
283 	 * The following hack attempts to guess whether or not the current
284 	 * token is in fact a declaration keyword -- one that has been
285 	 * typedefd
286 	 */
287 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
288 		&& !ps.p_l_follow
289 	        && !ps.block_init
290 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
291 		    ps.last_token == decl ||
292 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
293 	    ps.its_a_keyword = true;
294 	    ps.last_u_d = true;
295 	    last_code = decl;
296 	    return decl;
297 	}
298 	if (last_code == decl)	/* if this is a declared variable, then
299 				 * following sign is unary */
300 	    ps.last_u_d = true;	/* will make "int a -1" work */
301 	last_code = ident;
302 	return (ident);		/* the ident is not in the list */
303     }				/* end of procesing for alpanum character */
304 
305     /* Scan a non-alphanumeric token */
306 
307     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
308 				 * moved here */
309     *e_token = '\0';
310     if (++buf_ptr >= buf_end)
311 	fill_buffer();
312 
313     switch (*token) {
314     case '\n':
315 	unary_delim = ps.last_u_d;
316 	ps.last_nl = true;	/* remember that we just had a newline */
317 	code = (had_eof ? 0 : newline);
318 
319 	/*
320 	 * if data has been exausted, the newline is a dummy, and we should
321 	 * return code to stop
322 	 */
323 	break;
324 
325     case '\'':			/* start of quoted character */
326     case '"':			/* start of string */
327 	qchar = *token;
328 	if (troff) {
329 	    e_token[-1] = '`';
330 	    if (qchar == '"')
331 		*e_token++ = '`';
332 	    e_token = chfont(&bodyf, &stringf, e_token);
333 	}
334 	do {			/* copy the string */
335 	    while (1) {		/* move one character or [/<char>]<char> */
336 		if (*buf_ptr == '\n') {
337 		    printf("%d: Unterminated literal\n", line_no);
338 		    goto stop_lit;
339 		}
340 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
341 					 * since CHECK_SIZE guarantees that there
342 					 * are at least 5 entries left */
343 		*e_token = *buf_ptr++;
344 		if (buf_ptr >= buf_end)
345 		    fill_buffer();
346 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
347 		    if (*buf_ptr == '\n')	/* check for escaped newline */
348 			++line_no;
349 		    if (troff) {
350 			*++e_token = BACKSLASH;
351 			if (*buf_ptr == BACKSLASH)
352 			    *++e_token = BACKSLASH;
353 		    }
354 		    *++e_token = *buf_ptr++;
355 		    ++e_token;	/* we must increment this again because we
356 				 * copied two chars */
357 		    if (buf_ptr >= buf_end)
358 			fill_buffer();
359 		}
360 		else
361 		    break;	/* we copied one character */
362 	    }			/* end of while (1) */
363 	} while (*e_token++ != qchar);
364 	if (troff) {
365 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
366 	    if (qchar == '"')
367 		*e_token++ = '\'';
368 	}
369 stop_lit:
370 	code = ident;
371 	break;
372 
373     case ('('):
374     case ('['):
375 	unary_delim = true;
376 	code = lparen;
377 	break;
378 
379     case (')'):
380     case (']'):
381 	code = rparen;
382 	break;
383 
384     case '#':
385 	unary_delim = ps.last_u_d;
386 	code = preesc;
387 	break;
388 
389     case '?':
390 	unary_delim = true;
391 	code = question;
392 	break;
393 
394     case (':'):
395 	code = colon;
396 	unary_delim = true;
397 	break;
398 
399     case (';'):
400 	unary_delim = true;
401 	code = semicolon;
402 	break;
403 
404     case ('{'):
405 	unary_delim = true;
406 
407 	/*
408 	 * if (ps.in_or_st) ps.block_init = 1;
409 	 */
410 	/* ?	code = ps.block_init ? lparen : lbrace; */
411 	code = lbrace;
412 	break;
413 
414     case ('}'):
415 	unary_delim = true;
416 	/* ?	code = ps.block_init ? rparen : rbrace; */
417 	code = rbrace;
418 	break;
419 
420     case 014:			/* a form feed */
421 	unary_delim = ps.last_u_d;
422 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
423 				 * right */
424 	code = form_feed;
425 	break;
426 
427     case (','):
428 	unary_delim = true;
429 	code = comma;
430 	break;
431 
432     case '.':
433 	unary_delim = false;
434 	code = period;
435 	break;
436 
437     case '-':
438     case '+':			/* check for -, +, --, ++ */
439 	code = (ps.last_u_d ? unary_op : binary_op);
440 	unary_delim = true;
441 
442 	if (*buf_ptr == token[0]) {
443 	    /* check for doubled character */
444 	    *e_token++ = *buf_ptr++;
445 	    /* buffer overflow will be checked at end of loop */
446 	    if (last_code == ident || last_code == rparen) {
447 		code = (ps.last_u_d ? unary_op : postop);
448 		/* check for following ++ or -- */
449 		unary_delim = false;
450 	    }
451 	}
452 	else if (*buf_ptr == '=')
453 	    /* check for operator += */
454 	    *e_token++ = *buf_ptr++;
455 	else if (*buf_ptr == '>') {
456 	    /* check for operator -> */
457 	    *e_token++ = *buf_ptr++;
458 	    if (!pointer_as_binop) {
459 		unary_delim = false;
460 		code = unary_op;
461 		ps.want_blank = false;
462 	    }
463 	}
464 	break;			/* buffer overflow will be checked at end of
465 				 * switch */
466 
467     case '=':
468 	if (ps.in_or_st)
469 	    ps.block_init = 1;
470 #ifdef undef
471 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
472 	    e_token[-1] = *buf_ptr++;
473 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
474 		*e_token++ = *buf_ptr++;
475 	    *e_token++ = '=';	/* Flip =+ to += */
476 	    *e_token = 0;
477 	}
478 #else
479 	if (*buf_ptr == '=') {/* == */
480 	    *e_token++ = '=';	/* Flip =+ to += */
481 	    buf_ptr++;
482 	    *e_token = 0;
483 	}
484 #endif
485 	code = binary_op;
486 	unary_delim = true;
487 	break;
488 	/* can drop thru!!! */
489 
490     case '>':
491     case '<':
492     case '!':			/* ops like <, <<, <=, !=, etc */
493 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
494 	    *e_token++ = *buf_ptr;
495 	    if (++buf_ptr >= buf_end)
496 		fill_buffer();
497 	}
498 	if (*buf_ptr == '=')
499 	    *e_token++ = *buf_ptr++;
500 	code = (ps.last_u_d ? unary_op : binary_op);
501 	unary_delim = true;
502 	break;
503 
504     default:
505 	if (token[0] == '/' && *buf_ptr == '*') {
506 	    /* it is start of comment */
507 	    *e_token++ = '*';
508 
509 	    if (++buf_ptr >= buf_end)
510 		fill_buffer();
511 
512 	    code = comment;
513 	    unary_delim = ps.last_u_d;
514 	    break;
515 	}
516 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
517 	    /*
518 	     * handle ||, &&, etc, and also things as in int *****i
519 	     */
520 	    *e_token++ = *buf_ptr;
521 	    if (++buf_ptr >= buf_end)
522 		fill_buffer();
523 	}
524 	code = (ps.last_u_d ? unary_op : binary_op);
525 	unary_delim = true;
526 
527 
528     }				/* end of switch */
529     if (code != newline) {
530 	l_struct = false;
531 	last_code = code;
532     }
533     if (buf_ptr >= buf_end)	/* check for input buffer empty */
534 	fill_buffer();
535     ps.last_u_d = unary_delim;
536     *e_token = '\0';		/* null terminate the token */
537     return (code);
538 }
539 
540 /*
541  * Add the given keyword to the keyword table, using val as the keyword type
542  */
543 addkey(key, val)
544     char       *key;
545 {
546     register struct templ *p = specials;
547     while (p->rwd)
548 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
549 	    return;
550 	else
551 	    p++;
552     if (p >= specials + sizeof specials / sizeof specials[0])
553 	return;			/* For now, table overflows are silently
554 				 * ignored */
555     p->rwd = key;
556     p->rwcode = val;
557     p[1].rwd = 0;
558     p[1].rwcode = 0;
559     return;
560 }
561