xref: /openbsd-src/usr.bin/awk/lex.c (revision 1a8dbaac879b9f3335ad7fb25429ce63ac1d6bac)
1 /*	$OpenBSD: lex.c,v 1.26 2020/08/28 16:29:16 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.tab.h"
32 
33 extern YYSTYPE	yylval;
34 extern bool	infunc;
35 
36 int	lineno	= 1;
37 int	bracecnt = 0;
38 int	brackcnt  = 0;
39 int	parencnt = 0;
40 
/*
 * One row of the reserved-word table consulted by word().
 */
struct Keyword {
	const char *word;	/* spelling matched against the scanned name */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token code word() returns */
};
typedef struct Keyword Keyword;
46 
47 const Keyword keywords[] = {	/* keep sorted: binary searched */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "and",	FAND,		BLTIN },
52 	{ "atan2",	FATAN,		BLTIN },
53 	{ "break",	BREAK,		BREAK },
54 	{ "close",	CLOSE,		CLOSE },
55 	{ "compl",	FCOMPL,		BLTIN },
56 	{ "continue",	CONTINUE,	CONTINUE },
57 	{ "cos",	FCOS,		BLTIN },
58 	{ "delete",	DELETE,		DELETE },
59 	{ "do",		DO,		DO },
60 	{ "else",	ELSE,		ELSE },
61 	{ "exit",	EXIT,		EXIT },
62 	{ "exp",	FEXP,		BLTIN },
63 	{ "fflush",	FFLUSH,		BLTIN },
64 	{ "for",	FOR,		FOR },
65 	{ "func",	FUNC,		FUNC },
66 	{ "function",	FUNC,		FUNC },
67 	{ "gensub",	GENSUB,		GENSUB },
68 	{ "getline",	GETLINE,	GETLINE },
69 	{ "gsub",	GSUB,		GSUB },
70 	{ "if",		IF,		IF },
71 	{ "in",		IN,		IN },
72 	{ "index",	INDEX,		INDEX },
73 	{ "int",	FINT,		BLTIN },
74 	{ "length",	FLENGTH,	BLTIN },
75 	{ "log",	FLOG,		BLTIN },
76 	{ "lshift",	FLSHIFT,	BLTIN },
77 	{ "match",	MATCHFCN,	MATCHFCN },
78 	{ "mktime",	FMKTIME,	BLTIN },
79 	{ "next",	NEXT,		NEXT },
80 	{ "nextfile",	NEXTFILE,	NEXTFILE },
81 	{ "or",		FFOR,		BLTIN },
82 	{ "print",	PRINT,		PRINT },
83 	{ "printf",	PRINTF,		PRINTF },
84 	{ "rand",	FRAND,		BLTIN },
85 	{ "return",	RETURN,		RETURN },
86 	{ "rshift",	FRSHIFT,	BLTIN },
87 	{ "sin",	FSIN,		BLTIN },
88 	{ "split",	SPLIT,		SPLIT },
89 	{ "sprintf",	SPRINTF,	SPRINTF },
90 	{ "sqrt",	FSQRT,		BLTIN },
91 	{ "srand",	FSRAND,		BLTIN },
92 	{ "strftime",	FSTRFTIME,	BLTIN },
93 	{ "sub",	SUB,		SUB },
94 	{ "substr",	SUBSTR,		SUBSTR },
95 	{ "system",	FSYSTEM,	BLTIN },
96 	{ "systime",	FSYSTIME,	BLTIN },
97 	{ "tolower",	FTOLOWER,	BLTIN },
98 	{ "toupper",	FTOUPPER,	BLTIN },
99 	{ "while",	WHILE,		WHILE },
100 	{ "xor",	FXOR,		BLTIN },
101 };
102 
/*
 * RET(x): return token code x from the enclosing function, printing its
 * name first when dbg is set.
 *
 * Written as do { } while (0) so that "RET(x);" is exactly one statement in
 * any context (including an unbraced if/else arm), and x is evaluated only
 * once even though it is both printed and returned.
 */
#define	RET(x)	do { \
		int ret_tok_ = (x); \
		if (dbg) \
			printf("lex %s\n", tokname(ret_tok_)); \
		return ret_tok_; \
	} while (0)
104 
/*
 * Look at the next input character without consuming it: the character is
 * read with input() and immediately pushed back with unput().
 * Returns that character (0 when input() reports end of input).
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
111 
112 static int gettok(char **pbuf, int *psz)	/* get next input token */
113 {
114 	int c, retc;
115 	char *buf = *pbuf;
116 	int sz = *psz;
117 	char *bp = buf;
118 
119 	c = input();
120 	if (c == 0)
121 		return 0;
122 	buf[0] = c;
123 	buf[1] = 0;
124 	if (!isalnum(c) && c != '.' && c != '_')
125 		return c;
126 
127 	*bp++ = c;
128 	if (isalpha(c) || c == '_') {	/* it's a varname */
129 		for ( ; (c = input()) != 0; ) {
130 			if (bp-buf >= sz)
131 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132 					FATAL( "out of space for name %.10s...", buf );
133 			if (isalnum(c) || c == '_')
134 				*bp++ = c;
135 			else {
136 				*bp = 0;
137 				unput(c);
138 				break;
139 			}
140 		}
141 		*bp = 0;
142 		retc = 'a';	/* alphanumeric */
143 	} else {	/* maybe it's a number, but could be . */
144 		char *rem;
145 		/* read input until can't be a number */
146 		for ( ; (c = input()) != 0; ) {
147 			if (bp-buf >= sz)
148 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149 					FATAL( "out of space for number %.10s...", buf );
150 			if (isdigit(c) || c == 'e' || c == 'E'
151 			  || c == '.' || c == '+' || c == '-')
152 				*bp++ = c;
153 			else {
154 				unput(c);
155 				break;
156 			}
157 		}
158 		*bp = 0;
159 		strtod(buf, &rem);	/* parse the number */
160 		if (rem == buf) {	/* it wasn't a valid number at all */
161 			buf[1] = 0;	/* return one character as token */
162 			retc = (uschar)buf[0];	/* character is its own type */
163 			unputstr(rem+1); /* put rest back for later */
164 		} else {	/* some prefix was a number */
165 			unputstr(rem);	/* put rest back for later */
166 			rem[0] = 0;	/* truncate buf after number part */
167 			retc = '0';	/* type is number */
168 		}
169 	}
170 	*pbuf = buf;
171 	*psz = sz;
172 	return retc;
173 }
174 
/* Scanner helpers defined further down in this file. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags consulted (and cleared) at the top of yylex(). */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
180 
181 int yylex(void)
182 {
183 	int c;
184 	static char *buf = NULL;
185 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
186 
187 	if (buf == NULL && (buf = malloc(bufsize)) == NULL)
188 		FATAL( "out of space in yylex" );
189 	if (sc) {
190 		sc = false;
191 		RET('}');
192 	}
193 	if (reg) {
194 		reg = false;
195 		return regexpr();
196 	}
197 	for (;;) {
198 		c = gettok(&buf, &bufsize);
199 		if (c == 0)
200 			return 0;
201 		if (isalpha(c) || c == '_')
202 			return word(buf);
203 		if (isdigit(c)) {
204 			char *cp = tostring(buf);
205 			yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
206 			free(cp);
207 			/* should this also have STR set? */
208 			RET(NUMBER);
209 		}
210 
211 		yylval.i = c;
212 		switch (c) {
213 		case '\n':	/* {EOL} */
214 			lineno++;
215 			RET(NL);
216 		case '\r':	/* assume \n is coming */
217 		case ' ':	/* {WS}+ */
218 		case '\t':
219 			break;
220 		case '#':	/* #.* strip comments */
221 			while ((c = input()) != '\n' && c != 0)
222 				;
223 			unput(c);
224 			/*
225 			 * Next line is a hack, itcompensates for
226 			 * unput's treatment of \n.
227 			 */
228 			lineno++;
229 			break;
230 		case ';':
231 			RET(';');
232 		case '\\':
233 			if (peek() == '\n') {
234 				input();
235 				lineno++;
236 			} else if (peek() == '\r') {
237 				input(); input();	/* \n */
238 				lineno++;
239 			} else {
240 				RET(c);
241 			}
242 			break;
243 		case '&':
244 			if (peek() == '&') {
245 				input(); RET(AND);
246 			} else
247 				RET('&');
248 		case '|':
249 			if (peek() == '|') {
250 				input(); RET(BOR);
251 			} else
252 				RET('|');
253 		case '!':
254 			if (peek() == '=') {
255 				input(); yylval.i = NE; RET(NE);
256 			} else if (peek() == '~') {
257 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
258 			} else
259 				RET(NOT);
260 		case '~':
261 			yylval.i = MATCH;
262 			RET(MATCHOP);
263 		case '<':
264 			if (peek() == '=') {
265 				input(); yylval.i = LE; RET(LE);
266 			} else {
267 				yylval.i = LT; RET(LT);
268 			}
269 		case '=':
270 			if (peek() == '=') {
271 				input(); yylval.i = EQ; RET(EQ);
272 			} else {
273 				yylval.i = ASSIGN; RET(ASGNOP);
274 			}
275 		case '>':
276 			if (peek() == '=') {
277 				input(); yylval.i = GE; RET(GE);
278 			} else if (peek() == '>') {
279 				input(); yylval.i = APPEND; RET(APPEND);
280 			} else {
281 				yylval.i = GT; RET(GT);
282 			}
283 		case '+':
284 			if (peek() == '+') {
285 				input(); yylval.i = INCR; RET(INCR);
286 			} else if (peek() == '=') {
287 				input(); yylval.i = ADDEQ; RET(ASGNOP);
288 			} else
289 				RET('+');
290 		case '-':
291 			if (peek() == '-') {
292 				input(); yylval.i = DECR; RET(DECR);
293 			} else if (peek() == '=') {
294 				input(); yylval.i = SUBEQ; RET(ASGNOP);
295 			} else
296 				RET('-');
297 		case '*':
298 			if (peek() == '=') {	/* *= */
299 				input(); yylval.i = MULTEQ; RET(ASGNOP);
300 			} else if (peek() == '*') {	/* ** or **= */
301 				input();	/* eat 2nd * */
302 				if (peek() == '=') {
303 					input(); yylval.i = POWEQ; RET(ASGNOP);
304 				} else {
305 					RET(POWER);
306 				}
307 			} else
308 				RET('*');
309 		case '/':
310 			RET('/');
311 		case '%':
312 			if (peek() == '=') {
313 				input(); yylval.i = MODEQ; RET(ASGNOP);
314 			} else
315 				RET('%');
316 		case '^':
317 			if (peek() == '=') {
318 				input(); yylval.i = POWEQ; RET(ASGNOP);
319 			} else
320 				RET(POWER);
321 
322 		case '$':
323 			/* BUG: awkward, if not wrong */
324 			c = gettok(&buf, &bufsize);
325 			if (isalpha(c)) {
326 				if (strcmp(buf, "NF") == 0) {	/* very special */
327 					unputstr("(NF)");
328 					RET(INDIRECT);
329 				}
330 				c = peek();
331 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
332 					unputstr(buf);
333 					RET(INDIRECT);
334 				}
335 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
336 				RET(IVAR);
337 			} else if (c == 0) {	/*  */
338 				SYNTAX( "unexpected end of input after $" );
339 				RET(';');
340 			} else {
341 				unputstr(buf);
342 				RET(INDIRECT);
343 			}
344 
345 		case '}':
346 			if (--bracecnt < 0)
347 				SYNTAX( "extra }" );
348 			sc = true;
349 			RET(';');
350 		case ']':
351 			if (--brackcnt < 0)
352 				SYNTAX( "extra ]" );
353 			RET(']');
354 		case ')':
355 			if (--parencnt < 0)
356 				SYNTAX( "extra )" );
357 			RET(')');
358 		case '{':
359 			bracecnt++;
360 			RET('{');
361 		case '[':
362 			brackcnt++;
363 			RET('[');
364 		case '(':
365 			parencnt++;
366 			RET('(');
367 
368 		case '"':
369 			return string();	/* BUG: should be like tran.c ? */
370 
371 		default:
372 			RET(c);
373 		}
374 	}
375 }
376 
377 int string(void)
378 {
379 	int c, n;
380 	char *s, *bp;
381 	static char *buf = NULL;
382 	static int bufsz = 500;
383 
384 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
385 		FATAL("out of space for strings");
386 	for (bp = buf; (c = input()) != '"'; ) {
387 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
388 			FATAL("out of space for string %.10s...", buf);
389 		switch (c) {
390 		case '\n':
391 		case '\r':
392 		case 0:
393 			*bp = '\0';
394 			SYNTAX( "non-terminated string %.10s...", buf );
395 			if (c == 0)	/* hopeless */
396 				FATAL( "giving up" );
397 			lineno++;
398 			break;
399 		case '\\':
400 			c = input();
401 			switch (c) {
402 			case '\n': break;
403 			case '"': *bp++ = '"'; break;
404 			case 'n': *bp++ = '\n'; break;
405 			case 't': *bp++ = '\t'; break;
406 			case 'f': *bp++ = '\f'; break;
407 			case 'r': *bp++ = '\r'; break;
408 			case 'b': *bp++ = '\b'; break;
409 			case 'v': *bp++ = '\v'; break;
410 			case 'a': *bp++ = '\a'; break;
411 			case '\\': *bp++ = '\\'; break;
412 
413 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
414 			case '3': case '4': case '5': case '6': case '7':
415 				n = c - '0';
416 				if ((c = peek()) >= '0' && c < '8') {
417 					n = 8 * n + input() - '0';
418 					if ((c = peek()) >= '0' && c < '8')
419 						n = 8 * n + input() - '0';
420 				}
421 				*bp++ = n;
422 				break;
423 
424 			case 'x':	/* hex  \x0-9a-fA-F + */
425 			    {	char xbuf[100], *px;
426 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
427 					if (isdigit(c)
428 					 || (c >= 'a' && c <= 'f')
429 					 || (c >= 'A' && c <= 'F'))
430 						*px++ = c;
431 					else
432 						break;
433 				}
434 				*px = 0;
435 				unput(c);
436 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
437 				*bp++ = n;
438 				break;
439 			    }
440 
441 			default:
442 				*bp++ = c;
443 				break;
444 			}
445 			break;
446 		default:
447 			*bp++ = c;
448 			break;
449 		}
450 	}
451 	*bp = 0;
452 	s = tostring(buf);
453 	*bp++ = ' '; *bp++ = '\0';
454 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
455 	free(s);
456 	RET(STRING);
457 }
458 
459 
460 static int binsearch(char *w, const Keyword *kp, int n)
461 {
462 	int cond, low, mid, high;
463 
464 	low = 0;
465 	high = n - 1;
466 	while (low <= high) {
467 		mid = (low + high) / 2;
468 		if ((cond = strcmp(w, kp[mid].word)) < 0)
469 			high = mid - 1;
470 		else if (cond > 0)
471 			low = mid + 1;
472 		else
473 			return mid;
474 	}
475 	return -1;
476 }
477 
478 int word(char *w)
479 {
480 	const Keyword *kp;
481 	int c, n;
482 
483 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
484 	if (n != -1) {	/* found in table */
485 		kp = keywords + n;
486 		yylval.i = kp->sub;
487 		switch (kp->type) {	/* special handling */
488 		case BLTIN:
489 			if (kp->sub == FSYSTEM && safe)
490 				SYNTAX( "system is unsafe" );
491 			RET(kp->type);
492 		case FUNC:
493 			if (infunc)
494 				SYNTAX( "illegal nested function" );
495 			RET(kp->type);
496 		case RETURN:
497 			if (!infunc)
498 				SYNTAX( "return not in function" );
499 			RET(kp->type);
500 		case VARNF:
501 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
502 			RET(VARNF);
503 		default:
504 			RET(kp->type);
505 		}
506 	}
507 	c = peek();	/* look for '(' */
508 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
509 		yylval.i = n;
510 		RET(ARG);
511 	} else {
512 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
513 		if (c == '(') {
514 			RET(CALL);
515 		} else {
516 			RET(VAR);
517 		}
518 	}
519 }
520 
521 void startreg(void)	/* next call to yylex will return a regular expression */
522 {
523 	reg = true;
524 }
525 
526 int regexpr(void)
527 {
528 	int c, openclass = 0;
529 	static char *buf = NULL;
530 	static int bufsz = 500;
531 	char *bp, *cstart;
532 
533 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
534 		FATAL("out of space for rex expr");
535 	bp = buf;
536 	for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
537 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
538 			FATAL("out of space for reg expr %.10s...", buf);
539 		if (c == '\n') {
540 			*bp = '\0';
541 			SYNTAX( "newline in regular expression %.10s...", buf );
542 			unput('\n');
543 			break;
544 		} else if (c == '\\') {
545 			*bp++ = '\\';
546 			*bp++ = input();
547 		} else {
548 			/*
549 			 * POSIX requires a slash in a regexp to be escaped,
550 			 * other awks don't require it to be escaped inside
551 			 * a character class.
552 			 */
553 			if (!do_posix) {
554 				if (c == '[') {
555 					int nextc = peek();
556 					if (openclass == 0 || nextc == ':' ||
557 					    nextc == '.' || nextc == '=') {
558 						if (++openclass == 1)
559 							cstart = bp;
560 					}
561 				} else if (c == ']' && openclass > 0) {
562 					/*
563 					 * A ']' as the first char in a
564 					 * class is treated literally.
565 					 */
566 					if (cstart != bp - 1 &&
567 					    (cstart != bp - 2 || bp[-1] != '^'))
568 						openclass--;
569 				}
570 			}
571 			*bp++ = c;
572 		}
573 	}
574 	*bp = 0;
575 	if (c == 0)
576 		SYNTAX("non-terminated regular expression %.10s...", buf);
577 	yylval.s = tostring(buf);
578 	unput('/');
579 	RET(REGEXPR);
580 }
581 
582 /* low-level lexical stuff, sort of inherited from lex */
583 
/*
 * ebuf records the characters handed out by input(); ep is the position of
 * the next store and wraps back to the start when it runs off the end.
 * unput() steps ep back by one.
 */
char	ebuf[300];
char	*ep = ebuf;

/*
 * yysbuf is the pushback stack: unput() pushes at yysptr, and input() pops
 * from it before reading any fresh input.
 */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;

FILE	*yyin = NULL;
589 
590 int input(void)	/* get next lexical input character */
591 {
592 	int c;
593 	extern char *lexprog;
594 
595 	if (yysptr > yysbuf)
596 		c = (uschar)*--yysptr;
597 	else if (lexprog != NULL) {	/* awk '...' */
598 		if ((c = (uschar)*lexprog) != 0)
599 			lexprog++;
600 	} else				/* awk -f ... */
601 		c = pgetc();
602 	if (c == EOF)
603 		c = 0;
604 	if (ep >= ebuf + sizeof ebuf)
605 		ep = ebuf;
606 	*ep = c;
607 	if (c != 0) {
608 		ep++;
609 	}
610 	return (c);
611 }
612 
613 void unput(int c)	/* put lexical character back on input */
614 {
615 	if (c == '\n')
616 		lineno--;
617 	if (yysptr >= yysbuf + sizeof(yysbuf))
618 		FATAL("pushed back too much: %.20s...", yysbuf);
619 	*yysptr++ = c;
620 	if (--ep < ebuf)
621 		ep = ebuf + sizeof(ebuf) - 1;
622 }
623 
/*
 * unputstr - push the whole string s back onto the input.
 *
 * The characters are handed to unput() last-to-first, so that subsequent
 * input() calls return them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
631