xref: /openbsd-src/usr.bin/awk/lex.c (revision 1ad61ae0a79a724d2d3ec69e69c8e1d1ff6b53a0)
1 /*	$OpenBSD: lex.c,v 1.31 2023/09/17 14:49:44 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.tab.h"
32 
extern YYSTYPE	yylval;		/* token value filled in by the lexer routines below */
extern bool	infunc;		/* read by yylex() and word() when deciding how to classify names */

int	lineno	= 1;	/* source line counter: bumped as newlines are lexed, undone by unput('\n') */
int	bracecnt = 0;	/* '{' seen minus '}' seen; going negative reports "extra }" */
int	brackcnt  = 0;	/* '[' seen minus ']' seen; going negative reports "extra ]" */
int	parencnt = 0;	/* '(' seen minus ')' seen; going negative reports "extra )" */
40 
/* One entry of the reserved-word table that word() searches. */
typedef struct Keyword {
	const char *word;	/* spelling as it appears in the program text */
	int	sub;		/* value word() stores in yylval.i for this entry */
	int	type;		/* token code word() returns for this entry */
} Keyword;
46 
/*
 * Reserved words and built-in function names.  word() looks names up here
 * with binsearch(), which compares using strcmp(), so the entries must stay
 * in strcmp() order (upper case sorts before lower case).
 */
const Keyword keywords[] = {	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "gensub",	GENSUB,		GENSUB },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "mktime",	FMKTIME,	BLTIN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "strftime",	FSTRFTIME,	BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "systime",	FSYSTIME,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
102 
/*
 * RET(x): return token x from the enclosing function, printing its name
 * first when dbg is set.  The do { } while (0) wrapper makes "RET(x);" a
 * single statement, so it is safe in an unbraced if/else.  x is expanded
 * twice; pass only expressions without side effects.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
104 
/*
 * peek - report the next input character without consuming it.
 * The character is read with input() and immediately pushed back with
 * unput(), so the following input() call returns the same value.
 */
static int peek(void)
{
	const int upcoming = input();

	unput(upcoming);
	return upcoming;
}
111 
112 static int gettok(char **pbuf, int *psz)	/* get next input token */
113 {
114 	int c, retc;
115 	char *buf = *pbuf;
116 	int sz = *psz;
117 	char *bp = buf;
118 
119 	c = input();
120 	if (c == 0)
121 		return 0;
122 	buf[0] = c;
123 	buf[1] = 0;
124 	if (!isalnum(c) && c != '.' && c != '_')
125 		return c;
126 
127 	*bp++ = c;
128 	if (isalpha(c) || c == '_') {	/* it's a varname */
129 		for ( ; (c = input()) != 0; ) {
130 			if (bp-buf >= sz)
131 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132 					FATAL( "out of space for name %.10s...", buf );
133 			if (isalnum(c) || c == '_')
134 				*bp++ = c;
135 			else {
136 				*bp = 0;
137 				unput(c);
138 				break;
139 			}
140 		}
141 		*bp = 0;
142 		retc = 'a';	/* alphanumeric */
143 	} else {	/* maybe it's a number, but could be . */
144 		char *rem;
145 		/* read input until can't be a number */
146 		for ( ; (c = input()) != 0; ) {
147 			if (bp-buf >= sz)
148 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149 					FATAL( "out of space for number %.10s...", buf );
150 			if (isdigit(c) || c == 'e' || c == 'E'
151 			  || c == '.' || c == '+' || c == '-')
152 				*bp++ = c;
153 			else {
154 				unput(c);
155 				break;
156 			}
157 		}
158 		*bp = 0;
159 		strtod(buf, &rem);	/* parse the number */
160 		if (rem == buf) {	/* it wasn't a valid number at all */
161 			buf[1] = 0;	/* return one character as token */
162 			retc = (uschar)buf[0];	/* character is its own type */
163 			unputstr(rem+1); /* put rest back for later */
164 		} else {	/* some prefix was a number */
165 			unputstr(rem);	/* put rest back for later */
166 			rem[0] = 0;	/* truncate buf after number part */
167 			retc = '0';	/* type is number */
168 		}
169 	}
170 	*pbuf = buf;
171 	*psz = sz;
172 	return retc;
173 }
174 
int	word(char *);	/* classify a name: keyword, built-in, argument, call or variable */
int	string(void);	/* lex the rest of a string literal after its opening quote */
int	regexpr(void);	/* lex a regular expression up to its closing slash */
bool	sc	= false;	/* true => return a } right now */
bool	reg	= false;	/* true => return a REGEXPR now */
180 
181 int yylex(void)
182 {
183 	int c;
184 	static char *buf = NULL;
185 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
186 
187 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
188 		FATAL( "out of space in yylex" );
189 	if (sc) {
190 		sc = false;
191 		RET('}');
192 	}
193 	if (reg) {
194 		reg = false;
195 		return regexpr();
196 	}
197 	for (;;) {
198 		c = gettok(&buf, &bufsize);
199 		if (c == 0)
200 			return 0;
201 		if (isalpha(c) || c == '_')
202 			return word(buf);
203 		if (isdigit(c)) {
204 			char *cp = tostring(buf);
205 			double result;
206 
207 			if (is_number(cp, & result))
208 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
209 			else
210 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
211 			free(cp);
212 			/* should this also have STR set? */
213 			RET(NUMBER);
214 		}
215 
216 		yylval.i = c;
217 		switch (c) {
218 		case '\n':	/* {EOL} */
219 			lineno++;
220 			RET(NL);
221 		case '\r':	/* assume \n is coming */
222 		case ' ':	/* {WS}+ */
223 		case '\t':
224 			break;
225 		case '#':	/* #.* strip comments */
226 			while ((c = input()) != '\n' && c != 0)
227 				;
228 			unput(c);
229 			/*
230 			 * Next line is a hack, itcompensates for
231 			 * unput's treatment of \n.
232 			 */
233 			lineno++;
234 			break;
235 		case ';':
236 			RET(';');
237 		case '\\':
238 			if (peek() == '\n') {
239 				input();
240 				lineno++;
241 			} else if (peek() == '\r') {
242 				input(); input();	/* \n */
243 				lineno++;
244 			} else {
245 				RET(c);
246 			}
247 			break;
248 		case '&':
249 			if (peek() == '&') {
250 				input(); RET(AND);
251 			} else
252 				RET('&');
253 		case '|':
254 			if (peek() == '|') {
255 				input(); RET(BOR);
256 			} else
257 				RET('|');
258 		case '!':
259 			if (peek() == '=') {
260 				input(); yylval.i = NE; RET(NE);
261 			} else if (peek() == '~') {
262 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
263 			} else
264 				RET(NOT);
265 		case '~':
266 			yylval.i = MATCH;
267 			RET(MATCHOP);
268 		case '<':
269 			if (peek() == '=') {
270 				input(); yylval.i = LE; RET(LE);
271 			} else {
272 				yylval.i = LT; RET(LT);
273 			}
274 		case '=':
275 			if (peek() == '=') {
276 				input(); yylval.i = EQ; RET(EQ);
277 			} else {
278 				yylval.i = ASSIGN; RET(ASGNOP);
279 			}
280 		case '>':
281 			if (peek() == '=') {
282 				input(); yylval.i = GE; RET(GE);
283 			} else if (peek() == '>') {
284 				input(); yylval.i = APPEND; RET(APPEND);
285 			} else {
286 				yylval.i = GT; RET(GT);
287 			}
288 		case '+':
289 			if (peek() == '+') {
290 				input(); yylval.i = INCR; RET(INCR);
291 			} else if (peek() == '=') {
292 				input(); yylval.i = ADDEQ; RET(ASGNOP);
293 			} else
294 				RET('+');
295 		case '-':
296 			if (peek() == '-') {
297 				input(); yylval.i = DECR; RET(DECR);
298 			} else if (peek() == '=') {
299 				input(); yylval.i = SUBEQ; RET(ASGNOP);
300 			} else
301 				RET('-');
302 		case '*':
303 			if (peek() == '=') {	/* *= */
304 				input(); yylval.i = MULTEQ; RET(ASGNOP);
305 			} else if (peek() == '*') {	/* ** or **= */
306 				input();	/* eat 2nd * */
307 				if (peek() == '=') {
308 					input(); yylval.i = POWEQ; RET(ASGNOP);
309 				} else {
310 					RET(POWER);
311 				}
312 			} else
313 				RET('*');
314 		case '/':
315 			RET('/');
316 		case '%':
317 			if (peek() == '=') {
318 				input(); yylval.i = MODEQ; RET(ASGNOP);
319 			} else
320 				RET('%');
321 		case '^':
322 			if (peek() == '=') {
323 				input(); yylval.i = POWEQ; RET(ASGNOP);
324 			} else
325 				RET(POWER);
326 
327 		case '$':
328 			/* BUG: awkward, if not wrong */
329 			c = gettok(&buf, &bufsize);
330 			if (isalpha(c)) {
331 				if (strcmp(buf, "NF") == 0) {	/* very special */
332 					unputstr("(NF)");
333 					RET(INDIRECT);
334 				}
335 				c = peek();
336 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
337 					unputstr(buf);
338 					RET(INDIRECT);
339 				}
340 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
341 				RET(IVAR);
342 			} else if (c == 0) {	/*  */
343 				SYNTAX( "unexpected end of input after $" );
344 				RET(';');
345 			} else {
346 				unputstr(buf);
347 				RET(INDIRECT);
348 			}
349 
350 		case '}':
351 			if (--bracecnt < 0)
352 				SYNTAX( "extra }" );
353 			sc = true;
354 			RET(';');
355 		case ']':
356 			if (--brackcnt < 0)
357 				SYNTAX( "extra ]" );
358 			RET(']');
359 		case ')':
360 			if (--parencnt < 0)
361 				SYNTAX( "extra )" );
362 			RET(')');
363 		case '{':
364 			bracecnt++;
365 			RET('{');
366 		case '[':
367 			brackcnt++;
368 			RET('[');
369 		case '(':
370 			parencnt++;
371 			RET('(');
372 
373 		case '"':
374 			return string();	/* BUG: should be like tran.c ? */
375 
376 		default:
377 			RET(c);
378 		}
379 	}
380 }
381 
382 extern int runetochar(char *str, int c);
383 
384 int string(void)
385 {
386 	int c, n;
387 	char *s, *bp;
388 	static char *buf = NULL;
389 	static int bufsz = 500;
390 
391 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
392 		FATAL("out of space for strings");
393 	for (bp = buf; (c = input()) != '"'; ) {
394 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
395 			FATAL("out of space for string %.10s...", buf);
396 		switch (c) {
397 		case '\n':
398 		case '\r':
399 		case 0:
400 			*bp = '\0';
401 			SYNTAX( "non-terminated string %.10s...", buf );
402 			if (c == 0)	/* hopeless */
403 				FATAL( "giving up" );
404 			lineno++;
405 			break;
406 		case '\\':
407 			c = input();
408 			switch (c) {
409 			case '\n': break;
410 			case '"': *bp++ = '"'; break;
411 			case 'n': *bp++ = '\n'; break;
412 			case 't': *bp++ = '\t'; break;
413 			case 'f': *bp++ = '\f'; break;
414 			case 'r': *bp++ = '\r'; break;
415 			case 'b': *bp++ = '\b'; break;
416 			case 'v': *bp++ = '\v'; break;
417 			case 'a': *bp++ = '\a'; break;
418 			case '\\': *bp++ = '\\'; break;
419 
420 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
421 			case '3': case '4': case '5': case '6': case '7':
422 				n = c - '0';
423 				if ((c = peek()) >= '0' && c < '8') {
424 					n = 8 * n + input() - '0';
425 					if ((c = peek()) >= '0' && c < '8')
426 						n = 8 * n + input() - '0';
427 				}
428 				*bp++ = n;
429 				break;
430 
431 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
432 			    {
433 				int i;
434 
435 				n = 0;
436 				for (i = 1; i <= 2; i++) {
437 					c = input();
438 					if (c == 0)
439 						break;
440 					if (isxdigit(c)) {
441 						c = tolower(c);
442 						n *= 16;
443 						if (isdigit(c))
444 							n += (c - '0');
445 						else
446 							n += 10 + (c - 'a');
447 					} else
448 						break;
449 				}
450 				if (n)
451 					*bp++ = n;
452 				else
453 					unput(c);
454 				break;
455 			    }
456 
457 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
458 			    {
459 				int i;
460 
461 				n = 0;
462 				for (i = 0; i < 8; i++) {
463 					c = input();
464 					if (!isxdigit(c) || c == 0)
465 						break;
466 					c = tolower(c);
467 					n *= 16;
468 					if (isdigit(c))
469 						n += (c - '0');
470 					else
471 						n += 10 + (c - 'a');
472 				}
473 				unput(c);
474 				bp += runetochar(bp, n);
475 				break;
476 			    }
477 
478 			default:
479 				*bp++ = c;
480 				break;
481 			}
482 			break;
483 		default:
484 			*bp++ = c;
485 			break;
486 		}
487 	}
488 	*bp = 0;
489 	s = tostring(buf);
490 	*bp++ = ' '; *bp++ = '\0';
491 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
492 	free(s);
493 	RET(STRING);
494 }
495 
496 
497 static int binsearch(char *w, const Keyword *kp, int n)
498 {
499 	int cond, low, mid, high;
500 
501 	low = 0;
502 	high = n - 1;
503 	while (low <= high) {
504 		mid = (low + high) / 2;
505 		if ((cond = strcmp(w, kp[mid].word)) < 0)
506 			high = mid - 1;
507 		else if (cond > 0)
508 			low = mid + 1;
509 		else
510 			return mid;
511 	}
512 	return -1;
513 }
514 
515 int word(char *w)
516 {
517 	const Keyword *kp;
518 	int c, n;
519 
520 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
521 	if (n != -1) {	/* found in table */
522 		kp = keywords + n;
523 		yylval.i = kp->sub;
524 		switch (kp->type) {	/* special handling */
525 		case BLTIN:
526 			if (kp->sub == FSYSTEM && safe)
527 				SYNTAX( "system is unsafe" );
528 			RET(kp->type);
529 		case FUNC:
530 			if (infunc)
531 				SYNTAX( "illegal nested function" );
532 			RET(kp->type);
533 		case RETURN:
534 			if (!infunc)
535 				SYNTAX( "return not in function" );
536 			RET(kp->type);
537 		case VARNF:
538 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
539 			RET(VARNF);
540 		default:
541 			RET(kp->type);
542 		}
543 	}
544 	c = peek();	/* look for '(' */
545 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
546 		yylval.i = n;
547 		RET(ARG);
548 	} else {
549 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
550 		if (c == '(') {
551 			RET(CALL);
552 		} else {
553 			RET(VAR);
554 		}
555 	}
556 }
557 
/*
 * startreg - arm the reg flag.  yylex() checks it on entry, clears it and
 * returns the result of regexpr() instead of lexing an ordinary token.
 */
void startreg(void)	/* next call to yylex will return a regular expression */
{
	reg = true;
}
562 
563 int regexpr(void)
564 {
565 	int c, openclass = 0;
566 	static char *buf = NULL;
567 	static int bufsz = 500;
568 	char *bp, *cstart;
569 
570 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
571 		FATAL("out of space for reg expr");
572 	bp = buf;
573 	for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
574 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
575 			FATAL("out of space for reg expr %.10s...", buf);
576 		if (c == '\n') {
577 			*bp = '\0';
578 			SYNTAX( "newline in regular expression %.10s...", buf );
579 			unput('\n');
580 			break;
581 		} else if (c == '\\') {
582 			*bp++ = '\\';
583 			*bp++ = input();
584 		} else {
585 			/*
586 			 * POSIX requires a slash in a regexp to be escaped,
587 			 * other awks don't require it to be escaped inside
588 			 * a character class.
589 			 */
590 			if (!do_posix) {
591 				if (c == '[') {
592 					int nextc = peek();
593 					if (openclass == 0 || nextc == ':' ||
594 					    nextc == '.' || nextc == '=') {
595 						if (++openclass == 1)
596 							cstart = bp;
597 					}
598 				} else if (c == ']' && openclass > 0) {
599 					/*
600 					 * A ']' as the first char in a
601 					 * class is treated literally.
602 					 */
603 					if (cstart != bp - 1 &&
604 					    (cstart != bp - 2 || bp[-1] != '^'))
605 						openclass--;
606 				}
607 			}
608 			*bp++ = c;
609 		}
610 	}
611 	*bp = 0;
612 	if (c == 0)
613 		SYNTAX("non-terminated regular expression %.10s...", buf);
614 	yylval.s = tostring(buf);
615 	unput('/');
616 	RET(REGEXPR);
617 }
618 
619 /* low-level lexical stuff, sort of inherited from lex */
620 
char	ebuf[300];	/* circular record of characters returned by input() */
char	*ep = ebuf;	/* next slot in ebuf: advanced by input(), stepped back by unput() */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;	/* NOTE(review): not referenced in this file -- confirm where it is used */
626 
627 int input(void)	/* get next lexical input character */
628 {
629 	int c;
630 	extern char *lexprog;
631 
632 	if (yysptr > yysbuf)
633 		c = (uschar)*--yysptr;
634 	else if (lexprog != NULL) {	/* awk '...' */
635 		if ((c = (uschar)*lexprog) != 0)
636 			lexprog++;
637 	} else				/* awk -f ... */
638 		c = pgetc();
639 	if (c == EOF)
640 		c = 0;
641 	if (ep >= ebuf + sizeof ebuf)
642 		ep = ebuf;
643 	*ep = c;
644 	if (c != 0) {
645 		ep++;
646 	}
647 	return (c);
648 }
649 
650 void unput(int c)	/* put lexical character back on input */
651 {
652 	if (c == '\n')
653 		lineno--;
654 	if (yysptr >= yysbuf + sizeof(yysbuf))
655 		FATAL("pushed back too much: %.20s...", yysbuf);
656 	*yysptr++ = c;
657 	if (--ep < ebuf)
658 		ep = ebuf + sizeof(ebuf) - 1;
659 }
660 
/*
 * unputstr - push the whole string s back onto the input.
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
668