xref: /openbsd-src/usr.bin/awk/lex.c (revision 8500990981f885cbe5e6a4958549cacc238b5ae6)
1 /*	$OpenBSD: lex.c,v 1.7 2003/07/02 21:04:09 deraadt Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "ytab.h"
32 
33 extern YYSTYPE	yylval;
34 extern int	infunc;
35 
36 int	lineno	= 1;
37 int	bracecnt = 0;
38 int	brackcnt  = 0;
39 int	parencnt = 0;
40 
/* One row of the reserved-word table that binsearch() searches. */
typedef struct Keyword {
	const char *word;	/* spelling of the reserved word */
	int	sub;		/* stored in yylval.i by word() */
	int	type;		/* token code word() switches on and returns */
} Keyword;
46 
47 Keyword keywords[] ={	/* keep sorted: binary searched */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "atan2",	FATAN,		BLTIN },
52 	{ "break",	BREAK,		BREAK },
53 	{ "close",	CLOSE,		CLOSE },
54 	{ "continue",	CONTINUE,	CONTINUE },
55 	{ "cos",	FCOS,		BLTIN },
56 	{ "delete",	DELETE,		DELETE },
57 	{ "do",		DO,		DO },
58 	{ "else",	ELSE,		ELSE },
59 	{ "exit",	EXIT,		EXIT },
60 	{ "exp",	FEXP,		BLTIN },
61 	{ "fflush",	FFLUSH,		BLTIN },
62 	{ "for",	FOR,		FOR },
63 	{ "func",	FUNC,		FUNC },
64 	{ "function",	FUNC,		FUNC },
65 	{ "getline",	GETLINE,	GETLINE },
66 	{ "gsub",	GSUB,		GSUB },
67 	{ "if",		IF,		IF },
68 	{ "in",		IN,		IN },
69 	{ "index",	INDEX,		INDEX },
70 	{ "int",	FINT,		BLTIN },
71 	{ "length",	FLENGTH,	BLTIN },
72 	{ "log",	FLOG,		BLTIN },
73 	{ "match",	MATCHFCN,	MATCHFCN },
74 	{ "next",	NEXT,		NEXT },
75 	{ "nextfile",	NEXTFILE,	NEXTFILE },
76 	{ "print",	PRINT,		PRINT },
77 	{ "printf",	PRINTF,		PRINTF },
78 	{ "rand",	FRAND,		BLTIN },
79 	{ "return",	RETURN,		RETURN },
80 	{ "sin",	FSIN,		BLTIN },
81 	{ "split",	SPLIT,		SPLIT },
82 	{ "sprintf",	SPRINTF,	SPRINTF },
83 	{ "sqrt",	FSQRT,		BLTIN },
84 	{ "srand",	FSRAND,		BLTIN },
85 	{ "sub",	SUB,		SUB },
86 	{ "substr",	SUBSTR,		SUBSTR },
87 	{ "system",	FSYSTEM,	BLTIN },
88 	{ "tolower",	FTOLOWER,	BLTIN },
89 	{ "toupper",	FTOUPPER,	BLTIN },
90 	{ "while",	WHILE,		WHILE },
91 };
92 
/*
 * RET(x): return token x from the lexer.  With DEBUG defined (it always
 * is, just below) the token name is printed first when dbg is nonzero.
 * Note that x is evaluated more than once in the DEBUG form.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define	RET(x)	return (x)
#endif
99 
100 int peek(void);
101 int gettok(char **, int *);
102 int binsearch(char *, Keyword *, int);
103 
/*
 * peek: return the next input character without consuming it.
 * Implemented as an input() immediately undone by unput().
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
110 
/*
 * gettok: read the next raw token from the input.
 *
 * pbuf/psz point at the caller's token buffer and its size; the buffer is
 * grown with adjbuf() as needed and both are updated on return from the
 * name/number paths.  The token text is left NUL-terminated in the buffer.
 *
 * Returns:
 *   0    at end of input;
 *   c    the character itself, when it is not alphanumeric, '.' or '_';
 *   'a'  for a name ([A-Za-z_][A-Za-z0-9_]*);
 *   '0'  for a number (the longest prefix strtod() accepts);
 *   the first character, when text that looked numeric (e.g. a lone '.')
 *   is not a number at all.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* <ctype.h> needs an unsigned char value; c may be a negative char */
	if (!isalnum((unsigned char)c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char)c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep one byte spare: if the loop ends on end of
			 * input the NUL below is stored at *bp with no
			 * further size check.
			 */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char)c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			if (bp-buf >= sz-1)	/* leave room for the NUL, as above */
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char)c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * The first character is returned as the token, so
			 * only what follows it goes back on the input;
			 * pushing back buf[0] too would make the lexer see
			 * the same character again on the next call.
			 */
			unputstr(rem+1);
			buf[1] = 0;	/* so return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* so truncate where failure started */
			retc = '0';	/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
172 
/* Token-class helpers called from yylex(). */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc	= 0;	/* 1 => return a } right now (set after ';' was faked for '}') */
int	reg	= 0;	/* 1 => return a REGEXPR now (set by startreg) */
178 
179 int yylex(void)
180 {
181 	int c;
182 	static char *buf = 0;
183 	static int bufsize = 500;
184 
185 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
186 		FATAL( "out of space in yylex" );
187 	if (sc) {
188 		sc = 0;
189 		RET('}');
190 	}
191 	if (reg) {
192 		reg = 0;
193 		return regexpr();
194 	}
195 	for (;;) {
196 		c = gettok(&buf, &bufsize);
197 		if (c == 0)
198 			return 0;
199 		if (isalpha(c) || c == '_')
200 			return word(buf);
201 		if (isdigit(c)) {
202 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
203 			/* should this also have STR set? */
204 			RET(NUMBER);
205 		}
206 
207 		yylval.i = c;
208 		switch (c) {
209 		case '\n':	/* {EOL} */
210 			RET(NL);
211 		case '\r':	/* assume \n is coming */
212 		case ' ':	/* {WS}+ */
213 		case '\t':
214 			break;
215 		case '#':	/* #.* strip comments */
216 			while ((c = input()) != '\n' && c != 0)
217 				;
218 			unput(c);
219 			break;
220 		case ';':
221 			RET(';');
222 		case '\\':
223 			if (peek() == '\n') {
224 				input();
225 			} else if (peek() == '\r') {
226 				input(); input();	/* \n */
227 				lineno++;
228 			} else {
229 				RET(c);
230 			}
231 			break;
232 		case '&':
233 			if (peek() == '&') {
234 				input(); RET(AND);
235 			} else
236 				RET('&');
237 		case '|':
238 			if (peek() == '|') {
239 				input(); RET(BOR);
240 			} else
241 				RET('|');
242 		case '!':
243 			if (peek() == '=') {
244 				input(); yylval.i = NE; RET(NE);
245 			} else if (peek() == '~') {
246 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
247 			} else
248 				RET(NOT);
249 		case '~':
250 			yylval.i = MATCH;
251 			RET(MATCHOP);
252 		case '<':
253 			if (peek() == '=') {
254 				input(); yylval.i = LE; RET(LE);
255 			} else {
256 				yylval.i = LT; RET(LT);
257 			}
258 		case '=':
259 			if (peek() == '=') {
260 				input(); yylval.i = EQ; RET(EQ);
261 			} else {
262 				yylval.i = ASSIGN; RET(ASGNOP);
263 			}
264 		case '>':
265 			if (peek() == '=') {
266 				input(); yylval.i = GE; RET(GE);
267 			} else if (peek() == '>') {
268 				input(); yylval.i = APPEND; RET(APPEND);
269 			} else {
270 				yylval.i = GT; RET(GT);
271 			}
272 		case '+':
273 			if (peek() == '+') {
274 				input(); yylval.i = INCR; RET(INCR);
275 			} else if (peek() == '=') {
276 				input(); yylval.i = ADDEQ; RET(ASGNOP);
277 			} else
278 				RET('+');
279 		case '-':
280 			if (peek() == '-') {
281 				input(); yylval.i = DECR; RET(DECR);
282 			} else if (peek() == '=') {
283 				input(); yylval.i = SUBEQ; RET(ASGNOP);
284 			} else
285 				RET('-');
286 		case '*':
287 			if (peek() == '=') {	/* *= */
288 				input(); yylval.i = MULTEQ; RET(ASGNOP);
289 			} else if (peek() == '*') {	/* ** or **= */
290 				input();	/* eat 2nd * */
291 				if (peek() == '=') {
292 					input(); yylval.i = POWEQ; RET(ASGNOP);
293 				} else {
294 					RET(POWER);
295 				}
296 			} else
297 				RET('*');
298 		case '/':
299 			RET('/');
300 		case '%':
301 			if (peek() == '=') {
302 				input(); yylval.i = MODEQ; RET(ASGNOP);
303 			} else
304 				RET('%');
305 		case '^':
306 			if (peek() == '=') {
307 				input(); yylval.i = POWEQ; RET(ASGNOP);
308 			} else
309 				RET(POWER);
310 
311 		case '$':
312 			/* BUG: awkward, if not wrong */
313 			c = gettok(&buf, &bufsize);
314 			if (isalpha(c)) {
315 				if (strcmp(buf, "NF") == 0) {	/* very special */
316 					unputstr("(NF)");
317 					RET(INDIRECT);
318 				}
319 				c = peek();
320 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321 					unputstr(buf);
322 					RET(INDIRECT);
323 				}
324 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325 				RET(IVAR);
326 			} else if (c == 0) {	/*  */
327 				SYNTAX( "unexpected end of input after $" );
328 				RET(';');
329 			} else {
330 				unputstr(buf);
331 				RET(INDIRECT);
332 			}
333 
334 		case '}':
335 			if (--bracecnt < 0)
336 				SYNTAX( "extra }" );
337 			sc = 1;
338 			RET(';');
339 		case ']':
340 			if (--brackcnt < 0)
341 				SYNTAX( "extra ]" );
342 			RET(']');
343 		case ')':
344 			if (--parencnt < 0)
345 				SYNTAX( "extra )" );
346 			RET(')');
347 		case '{':
348 			bracecnt++;
349 			RET('{');
350 		case '[':
351 			brackcnt++;
352 			RET('[');
353 		case '(':
354 			parencnt++;
355 			RET('(');
356 
357 		case '"':
358 			return string();	/* BUG: should be like tran.c ? */
359 
360 		default:
361 			RET(c);
362 		}
363 	}
364 }
365 
366 int string(void)
367 {
368 	int c, n;
369 	char *s, *bp;
370 	static char *buf = 0;
371 	static int bufsz = 500;
372 
373 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
374 		FATAL("out of space for strings");
375 	for (bp = buf; (c = input()) != '"'; ) {
376 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
377 			FATAL("out of space for string %.10s...", buf);
378 		switch (c) {
379 		case '\n':
380 		case '\r':
381 		case 0:
382 			SYNTAX( "non-terminated string %.10s...", buf );
383 			lineno++;
384 			if (c == 0)	/* hopeless */
385 				FATAL( "giving up" );
386 			break;
387 		case '\\':
388 			c = input();
389 			switch (c) {
390 			case '"': *bp++ = '"'; break;
391 			case 'n': *bp++ = '\n'; break;
392 			case 't': *bp++ = '\t'; break;
393 			case 'f': *bp++ = '\f'; break;
394 			case 'r': *bp++ = '\r'; break;
395 			case 'b': *bp++ = '\b'; break;
396 			case 'v': *bp++ = '\v'; break;
397 			case 'a': *bp++ = '\007'; break;
398 			case '\\': *bp++ = '\\'; break;
399 
400 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
401 			case '3': case '4': case '5': case '6': case '7':
402 				n = c - '0';
403 				if ((c = peek()) >= '0' && c < '8') {
404 					n = 8 * n + input() - '0';
405 					if ((c = peek()) >= '0' && c < '8')
406 						n = 8 * n + input() - '0';
407 				}
408 				*bp++ = n;
409 				break;
410 
411 			case 'x':	/* hex  \x0-9a-fA-F + */
412 			    {	char xbuf[100], *px;
413 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
414 					if (isdigit(c)
415 					 || (c >= 'a' && c <= 'f')
416 					 || (c >= 'A' && c <= 'F'))
417 						*px++ = c;
418 					else
419 						break;
420 				}
421 				*px = 0;
422 				unput(c);
423 	  			sscanf(xbuf, "%x", &n);
424 				*bp++ = n;
425 				break;
426 			    }
427 
428 			default:
429 				*bp++ = c;
430 				break;
431 			}
432 			break;
433 		default:
434 			*bp++ = c;
435 			break;
436 		}
437 	}
438 	*bp = 0;
439 	s = tostring(buf);
440 	*bp++ = ' '; *bp++ = 0;
441 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
442 	RET(STRING);
443 }
444 
445 
446 int binsearch(char *w, Keyword *kp, int n)
447 {
448 	int cond, low, mid, high;
449 
450 	low = 0;
451 	high = n - 1;
452 	while (low <= high) {
453 		mid = (low + high) / 2;
454 		if ((cond = strcmp(w, kp[mid].word)) < 0)
455 			high = mid - 1;
456 		else if (cond > 0)
457 			low = mid + 1;
458 		else
459 			return mid;
460 	}
461 	return -1;
462 }
463 
464 int word(char *w)
465 {
466 	Keyword *kp;
467 	int c, n;
468 
469 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
470 	kp = keywords + n;
471 	if (n != -1) {	/* found in table */
472 		yylval.i = kp->sub;
473 		switch (kp->type) {	/* special handling */
474 		case FSYSTEM:
475 			if (safe)
476 				SYNTAX( "system is unsafe" );
477 			RET(kp->type);
478 		case FUNC:
479 			if (infunc)
480 				SYNTAX( "illegal nested function" );
481 			RET(kp->type);
482 		case RETURN:
483 			if (!infunc)
484 				SYNTAX( "return not in function" );
485 			RET(kp->type);
486 		case VARNF:
487 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
488 			RET(VARNF);
489 		default:
490 			RET(kp->type);
491 		}
492 	}
493 	c = peek();	/* look for '(' */
494 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
495 		yylval.i = n;
496 		RET(ARG);
497 	} else {
498 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
499 		if (c == '(') {
500 			RET(CALL);
501 		} else {
502 			RET(VAR);
503 		}
504 	}
505 }
506 
507 void startreg(void)	/* next call to yylex will return a regular expression */
508 {
509 	reg = 1;
510 }
511 
512 int regexpr(void)
513 {
514 	int c;
515 	static char *buf = 0;
516 	static int bufsz = 500;
517 	char *bp;
518 
519 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
520 		FATAL("out of space for rex expr");
521 	bp = buf;
522 	for ( ; (c = input()) != '/' && c != 0; ) {
523 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
524 			FATAL("out of space for reg expr %.10s...", buf);
525 		if (c == '\n') {
526 			SYNTAX( "newline in regular expression %.10s...", buf );
527 			unput('\n');
528 			break;
529 		} else if (c == '\\') {
530 			*bp++ = '\\';
531 			*bp++ = input();
532 		} else {
533 			*bp++ = c;
534 		}
535 	}
536 	*bp = 0;
537 	yylval.s = tostring(buf);
538 	unput('/');
539 	RET(REGEXPR);
540 }
541 
542 /* low-level lexical stuff, sort of inherited from lex */
543 
/* Circular record of the characters most recently returned by input(). */
char	ebuf[300];
char	*ep = ebuf;		/* next slot in ebuf; wraps in input()/unput() */

/* Stack of characters pushed back by unput(); input() pops it first. */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the top of the pushback stack */

FILE	*yyin = 0;
549 
550 int input(void)	/* get next lexical input character */
551 {
552 	int c;
553 	extern char *lexprog;
554 
555 	if (yysptr > yysbuf)
556 		c = *--yysptr;
557 	else if (lexprog != NULL) {	/* awk '...' */
558 		if ((c = *lexprog) != 0)
559 			lexprog++;
560 	} else				/* awk -f ... */
561 		c = pgetc();
562 	if (c == '\n')
563 		lineno++;
564 	else if (c == EOF)
565 		c = 0;
566 	if (ep >= ebuf + sizeof ebuf)
567 		ep = ebuf;
568 	return *ep++ = c;
569 }
570 
571 void unput(int c)	/* put lexical character back on input */
572 {
573 	if (c == '\n')
574 		lineno--;
575 	if (yysptr >= yysbuf + sizeof(yysbuf))
576 		FATAL("pushed back too much: %.20s...", yysbuf);
577 	*yysptr++ = c;
578 	if (--ep < ebuf)
579 		ep = ebuf + sizeof(ebuf) - 1;
580 }
581 
/*
 * unputstr: push the whole string s back on the input.  The characters
 * are pushed last-to-first so that input() will return them in their
 * original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
589