xref: /netbsd-src/external/historical/nawk/dist/lex.c (revision 7788a0781fe6ff2cce37368b4578a7ade0850cb1)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #if HAVE_NBTOOL_CONFIG_H
26 #include "nbtool_config.h"
27 #endif
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33 #include "awk.h"
34 #include "awkgram.h"
35 
36 extern YYSTYPE	yylval;
37 extern int	infunc;
38 
39 int	lineno	= 1;
40 int	bracecnt = 0;
41 int	brackcnt  = 0;
42 int	parencnt = 0;
43 
/* One row of the reserved-word table that binsearch() looks through. */
struct Keyword {
	const char *word;	/* spelling, compared with strcmp() */
	int	sub;		/* stored into yylval.i by word() */
	int	type;		/* selects word()'s special handling; token returned */
};
typedef struct Keyword Keyword;
49 
50 int peek(void);
51 int gettok(char **, int *);
52 int binsearch(const char *, const Keyword *, int);
53 
54 const Keyword keywords[] ={	/* keep sorted: binary searched */
55 	{ "BEGIN",	XBEGIN,		XBEGIN },
56 	{ "END",	XEND,		XEND },
57 	{ "NF",		VARNF,		VARNF },
58 	{ "atan2",	FATAN,		BLTIN },
59 	{ "break",	BREAK,		BREAK },
60 	{ "close",	CLOSE,		CLOSE },
61 	{ "continue",	CONTINUE,	CONTINUE },
62 	{ "cos",	FCOS,		BLTIN },
63 	{ "delete",	DELETE,		DELETE },
64 	{ "do",		DO,		DO },
65 	{ "else",	ELSE,		ELSE },
66 	{ "exit",	EXIT,		EXIT },
67 	{ "exp",	FEXP,		BLTIN },
68 	{ "fflush",	FFLUSH,		BLTIN },
69 	{ "for",	FOR,		FOR },
70 	{ "func",	FUNC,		FUNC },
71 	{ "function",	FUNC,		FUNC },
72 	{ "gensub",	GENSUB,		GENSUB },
73 	{ "getline",	GETLINE,	GETLINE },
74 	{ "gsub",	GSUB,		GSUB },
75 	{ "if",		IF,		IF },
76 	{ "in",		IN,		IN },
77 	{ "index",	INDEX,		INDEX },
78 	{ "int",	FINT,		BLTIN },
79 	{ "length",	FLENGTH,	BLTIN },
80 	{ "log",	FLOG,		BLTIN },
81 	{ "match",	MATCHFCN,	MATCHFCN },
82 	{ "next",	NEXT,		NEXT },
83 	{ "nextfile",	NEXTFILE,	NEXTFILE },
84 	{ "print",	PRINT,		PRINT },
85 	{ "printf",	PRINTF,		PRINTF },
86 	{ "rand",	FRAND,		BLTIN },
87 	{ "return",	RETURN,		RETURN },
88 	{ "sin",	FSIN,		BLTIN },
89 	{ "split",	SPLIT,		SPLIT },
90 	{ "sprintf",	SPRINTF,	SPRINTF },
91 	{ "sqrt",	FSQRT,		BLTIN },
92 	{ "srand",	FSRAND,		BLTIN },
93 	{ "strftime",	FSTRFTIME,	BLTIN },
94 	{ "sub",	SUB,		SUB },
95 	{ "substr",	SUBSTR,		SUBSTR },
96 	{ "system",	FSYSTEM,	BLTIN },
97 	{ "systime",	FSYSTIME,	BLTIN },
98 	{ "tolower",	FTOLOWER,	BLTIN },
99 	{ "toupper",	FTOUPPER,	BLTIN },
100 	{ "while",	WHILE,		WHILE },
101 };
102 
/*
 * RET(x): return token x from the enclosing function, printing
 * "lex <name>" first when dbg is set.
 *
 * Wrapped in do { } while (0) so that "RET(x);" is a single statement and
 * stays correct as the body of an unbraced if/else; x is evaluated once.
 */
#define	RET(x)	do { \
		int ret_tok_ = (x); \
		if (dbg) \
			printf("lex %s\n", tokname(ret_tok_)); \
		return ret_tok_; \
	} while (0)
104 
/* peek: return the next input character without consuming it. */
int peek(void)
{
	const int next = input();

	unput(next);	/* push it straight back for the next input() */
	return next;
}
111 
112 int gettok(char **pbuf, int *psz)	/* get next input token */
113 {
114 	int c, retc;
115 	uschar *buf = (uschar *) *pbuf;
116 	int sz = *psz;
117 	uschar *bp = buf;
118 
119 	c = input();
120 	if (c == 0)
121 		return 0;
122 	buf[0] = c;
123 	buf[1] = 0;
124 	if (!isalnum(c) && c != '.' && c != '_')
125 		return c;
126 
127 	*bp++ = c;
128 	if (isalpha(c) || c == '_') {	/* it's a varname */
129 		for ( ; (c = input()) != 0; ) {
130 			if (bp-buf >= sz)
131 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132 					FATAL( "out of space for name %.10s...", buf );
133 			if (isalnum(c) || c == '_')
134 				*bp++ = c;
135 			else {
136 				*bp = 0;
137 				unput(c);
138 				break;
139 			}
140 		}
141 		*bp = 0;
142 		retc = 'a';	/* alphanumeric */
143 	} else {	/* maybe it's a number, but could be . */
144 		char *rem;
145 		/* read input until can't be a number */
146 		for ( ; (c = input()) != 0; ) {
147 			if (bp-buf >= sz)
148 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149 					FATAL( "out of space for number %.10s...", buf );
150 			if (isdigit(c) || c == 'e' || c == 'E'
151 			  || c == '.' || c == '+' || c == '-')
152 				*bp++ = c;
153 			else {
154 				unput(c);
155 				break;
156 			}
157 		}
158 		*bp = 0;
159 		strtod(buf, &rem);	/* parse the number */
160 		if (rem == (char *)buf) {	/* it wasn't a valid number at all */
161 			buf[1] = 0;	/* return one character as token */
162 			retc = buf[0];	/* character is its own type */
163 			unputstr(rem+1); /* put rest back for later */
164 		} else {	/* some prefix was a number */
165 			unputstr(rem);	/* put rest back for later */
166 			rem[0] = 0;	/* truncate buf after number part */
167 			retc = '0';	/* type is number */
168 		}
169 	}
170 	*pbuf = buf;
171 	*psz = sz;
172 	return retc;
173 }
174 
/* Helpers used by yylex(); defined below. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc  = 0;	/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
180 
181 int yylex(void)
182 {
183 	int c;
184 	static char *buf = 0;
185 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
186 
187 	if (buf == 0 && (buf = malloc(bufsize)) == NULL)
188 		FATAL( "out of space in yylex" );
189 	if (sc) {
190 		sc = 0;
191 		RET('}');
192 	}
193 	if (reg) {
194 		reg = 0;
195 		return regexpr();
196 	}
197 	for (;;) {
198 		c = gettok(&buf, &bufsize);
199 		if (c == 0)
200 			return 0;
201 		if (isalpha(c) || c == '_')
202 			return word(buf);
203 		if (isdigit(c)) {
204 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
205 			/* should this also have STR set? */
206 			RET(NUMBER);
207 		}
208 
209 		yylval.i = c;
210 		switch (c) {
211 		case '\n':	/* {EOL} */
212 			RET(NL);
213 		case '\r':	/* assume \n is coming */
214 		case ' ':	/* {WS}+ */
215 		case '\t':
216 			break;
217 		case '#':	/* #.* strip comments */
218 			while ((c = input()) != '\n' && c != 0)
219 				;
220 			unput(c);
221 			break;
222 		case ';':
223 			RET(';');
224 		case '\\':
225 			if (peek() == '\n') {
226 				input();
227 			} else if (peek() == '\r') {
228 				input(); input();	/* \n */
229 				lineno++;
230 			} else {
231 				RET(c);
232 			}
233 			break;
234 		case '&':
235 			if (peek() == '&') {
236 				input(); RET(AND);
237 			} else
238 				RET('&');
239 		case '|':
240 			if (peek() == '|') {
241 				input(); RET(BOR);
242 			} else
243 				RET('|');
244 		case '!':
245 			if (peek() == '=') {
246 				input(); yylval.i = NE; RET(NE);
247 			} else if (peek() == '~') {
248 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
249 			} else
250 				RET(NOT);
251 		case '~':
252 			yylval.i = MATCH;
253 			RET(MATCHOP);
254 		case '<':
255 			if (peek() == '=') {
256 				input(); yylval.i = LE; RET(LE);
257 			} else {
258 				yylval.i = LT; RET(LT);
259 			}
260 		case '=':
261 			if (peek() == '=') {
262 				input(); yylval.i = EQ; RET(EQ);
263 			} else {
264 				yylval.i = ASSIGN; RET(ASGNOP);
265 			}
266 		case '>':
267 			if (peek() == '=') {
268 				input(); yylval.i = GE; RET(GE);
269 			} else if (peek() == '>') {
270 				input(); yylval.i = APPEND; RET(APPEND);
271 			} else {
272 				yylval.i = GT; RET(GT);
273 			}
274 		case '+':
275 			if (peek() == '+') {
276 				input(); yylval.i = INCR; RET(INCR);
277 			} else if (peek() == '=') {
278 				input(); yylval.i = ADDEQ; RET(ASGNOP);
279 			} else
280 				RET('+');
281 		case '-':
282 			if (peek() == '-') {
283 				input(); yylval.i = DECR; RET(DECR);
284 			} else if (peek() == '=') {
285 				input(); yylval.i = SUBEQ; RET(ASGNOP);
286 			} else
287 				RET('-');
288 		case '*':
289 			if (peek() == '=') {	/* *= */
290 				input(); yylval.i = MULTEQ; RET(ASGNOP);
291 			} else if (peek() == '*') {	/* ** or **= */
292 				input();	/* eat 2nd * */
293 				if (peek() == '=') {
294 					input(); yylval.i = POWEQ; RET(ASGNOP);
295 				} else {
296 					RET(POWER);
297 				}
298 			} else
299 				RET('*');
300 		case '/':
301 			RET('/');
302 		case '%':
303 			if (peek() == '=') {
304 				input(); yylval.i = MODEQ; RET(ASGNOP);
305 			} else
306 				RET('%');
307 		case '^':
308 			if (peek() == '=') {
309 				input(); yylval.i = POWEQ; RET(ASGNOP);
310 			} else
311 				RET(POWER);
312 
313 		case '$':
314 			/* BUG: awkward, if not wrong */
315 			c = gettok(&buf, &bufsize);
316 			if (isalpha(c)) {
317 				if (strcmp(buf, "NF") == 0) {	/* very special */
318 					unputstr("(NF)");
319 					RET(INDIRECT);
320 				}
321 				c = peek();
322 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
323 					unputstr(buf);
324 					RET(INDIRECT);
325 				}
326 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
327 				RET(IVAR);
328 			} else if (c == 0) {	/*  */
329 				SYNTAX( "unexpected end of input after $" );
330 				RET(';');
331 			} else {
332 				unputstr(buf);
333 				RET(INDIRECT);
334 			}
335 
336 		case '}':
337 			if (--bracecnt < 0)
338 				SYNTAX( "extra }" );
339 			sc = 1;
340 			RET(';');
341 		case ']':
342 			if (--brackcnt < 0)
343 				SYNTAX( "extra ]" );
344 			RET(']');
345 		case ')':
346 			if (--parencnt < 0)
347 				SYNTAX( "extra )" );
348 			RET(')');
349 		case '{':
350 			bracecnt++;
351 			RET('{');
352 		case '[':
353 			brackcnt++;
354 			RET('[');
355 		case '(':
356 			parencnt++;
357 			RET('(');
358 
359 		case '"':
360 			return string();	/* BUG: should be like tran.c ? */
361 
362 		default:
363 			RET(c);
364 		}
365 	}
366 }
367 
368 int string(void)
369 {
370 	int c, n;
371 	uschar *s, *bp;
372 	static uschar *buf = 0;
373 	static int bufsz = 500;
374 
375 	if (buf == 0 && (buf = malloc(bufsz)) == NULL)
376 		FATAL("out of space for strings");
377 	for (bp = buf; (c = input()) != '"'; ) {
378 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
379 			FATAL("out of space for string %.10s...", buf);
380 		switch (c) {
381 		case '\n':
382 		case '\r':
383 		case 0:
384 			SYNTAX( "non-terminated string %.10s...", buf );
385 			lineno++;
386 			if (c == 0)	/* hopeless */
387 				FATAL( "giving up" );
388 			break;
389 		case '\\':
390 			c = input();
391 			switch (c) {
392 			case '\n': break;
393 			case '"': *bp++ = '"'; break;
394 			case 'n': *bp++ = '\n'; break;
395 			case 't': *bp++ = '\t'; break;
396 			case 'f': *bp++ = '\f'; break;
397 			case 'r': *bp++ = '\r'; break;
398 			case 'b': *bp++ = '\b'; break;
399 			case 'v': *bp++ = '\v'; break;
400 			case 'a': *bp++ = '\007'; break;
401 			case '\\': *bp++ = '\\'; break;
402 
403 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
404 			case '3': case '4': case '5': case '6': case '7':
405 				n = c - '0';
406 				if ((c = peek()) >= '0' && c < '8') {
407 					n = 8 * n + input() - '0';
408 					if ((c = peek()) >= '0' && c < '8')
409 						n = 8 * n + input() - '0';
410 				}
411 				*bp++ = n;
412 				break;
413 
414 			case 'x':	/* hex  \x0-9a-fA-F + */
415 			    {	char xbuf[100], *px;
416 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
417 					if (isdigit(c)
418 					 || (c >= 'a' && c <= 'f')
419 					 || (c >= 'A' && c <= 'F'))
420 						*px++ = c;
421 					else
422 						break;
423 				}
424 				*px = 0;
425 				unput(c);
426 	  			sscanf(xbuf, "%x", &n);
427 				*bp++ = n;
428 				break;
429 			    }
430 
431 			default:
432 				WARNING("warning: escape sequence `\\%c' "
433 				    "treated as plain `%c'", c, c);
434 				*bp++ = c;
435 				break;
436 			}
437 			break;
438 		default:
439 			*bp++ = c;
440 			break;
441 		}
442 	}
443 	*bp = 0;
444 	s = tostring(buf);
445 	*bp++ = ' '; *bp++ = 0;
446 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
447 	RET(STRING);
448 }
449 
450 
451 int binsearch(const char *w, const Keyword *kp, int n)
452 {
453 	int cond, low, mid, high;
454 
455 	low = 0;
456 	high = n - 1;
457 	while (low <= high) {
458 		mid = (low + high) / 2;
459 		if ((cond = strcmp(w, kp[mid].word)) < 0)
460 			high = mid - 1;
461 		else if (cond > 0)
462 			low = mid + 1;
463 		else
464 			return mid;
465 	}
466 	return -1;
467 }
468 
469 int word(char *w)
470 {
471 	const Keyword *kp;
472 	int c, n;
473 
474 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
475 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
476 	kp = keywords + n;
477 	if (n != -1) {	/* found in table */
478 		yylval.i = kp->sub;
479 		switch (kp->type) {	/* special handling */
480 		case BLTIN:
481 			if (kp->sub == FSYSTEM && safe)
482 				SYNTAX( "system is unsafe" );
483 			RET(kp->type);
484 		case FUNC:
485 			if (infunc)
486 				SYNTAX( "illegal nested function" );
487 			RET(kp->type);
488 		case RETURN:
489 			if (!infunc)
490 				SYNTAX( "return not in function" );
491 			RET(kp->type);
492 		case VARNF:
493 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
494 			RET(VARNF);
495 		default:
496 			RET(kp->type);
497 		}
498 	}
499 	c = peek();	/* look for '(' */
500 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
501 		yylval.i = n;
502 		RET(ARG);
503 	} else {
504 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
505 		if (c == '(') {
506 			RET(CALL);
507 		} else {
508 			RET(VAR);
509 		}
510 	}
511 }
512 
513 void startreg(void)	/* next call to yylex will return a regular expression */
514 {
515 	reg = 1;
516 }
517 
518 int regexpr(void)
519 {
520 	int c;
521 	static uschar *buf = 0;
522 	static int bufsz = 500;
523 	uschar *bp;
524 
525 	if (buf == 0 && (buf = malloc(bufsz)) == NULL)
526 		FATAL("out of space for rex expr");
527 	bp = buf;
528 	for ( ; (c = input()) != '/' && c != 0; ) {
529 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
530 			FATAL("out of space for reg expr %.10s...", buf);
531 		if (c == '\n') {
532 			SYNTAX( "newline in regular expression %.10s...", buf );
533 			unput('\n');
534 			break;
535 		} else if (c == '\\') {
536 			*bp++ = '\\';
537 			*bp++ = input();
538 		} else {
539 			*bp++ = c;
540 		}
541 	}
542 	*bp = 0;
543 	if (c == 0)
544 		SYNTAX("non-terminated regular expression %.10s...", buf);
545 	yylval.s = tostring(buf);
546 	unput('/');
547 	RET(REGEXPR);
548 }
549 
550 /* low-level lexical stuff, sort of inherited from lex */
551 
/* Circular record of characters handed out by input(). */
char	ebuf[300];
char	*ep = ebuf;		/* slot input() writes next; unput() steps it back */

/* Stack of characters pushed back by unput(); input() pops it first. */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed character */

FILE	*yyin = NULL;		/* not referenced elsewhere in this file */
557 
558 int input(void)	/* get next lexical input character */
559 {
560 	int c;
561 	extern char *lexprog;
562 
563 	if (yysptr > yysbuf)
564 		c = (uschar)*--yysptr;
565 	else if (lexprog != NULL) {	/* awk '...' */
566 		if ((c = (uschar)*lexprog) != 0)
567 			lexprog++;
568 	} else				/* awk -f ... */
569 		c = pgetc();
570 	if (c == '\n')
571 		lineno++;
572 	else if (c == EOF)
573 		c = 0;
574 	if (ep >= ebuf + sizeof ebuf)
575 		ep = ebuf;
576 	return *ep++ = c;
577 }
578 
579 void unput(int c)	/* put lexical character back on input */
580 {
581 	if (c == '\n')
582 		lineno--;
583 	if (yysptr >= yysbuf + sizeof(yysbuf))
584 		FATAL("pushed back too much: %.20s...", yysbuf);
585 	*yysptr++ = c;
586 	if (--ep < ebuf)
587 		ep = ebuf + sizeof(ebuf) - 1;
588 }
589 
/*
 * unputstr: push the string s back onto the input.
 *
 * The characters are pushed last-to-first so that input() hands them out in
 * their original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	/* walk backwards by pointer: no size_t-to-int index conversion */
	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
597