xref: /openbsd-src/usr.bin/awk/lex.c (revision 850e275390052b330d93020bf619a739a3c277ac)
1 /*	$OpenBSD: lex.c,v 1.10 2008/06/04 14:04:42 pyr Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "ytab.h"
32 
/* Parser interface and bookkeeping counters shared by the lexer routines. */
33 extern YYSTYPE	yylval;	/* value slot the lexer fills (.i, .cp or .s) before returning a token */
34 extern int	infunc;	/* tested by word() and yylex(); presumably nonzero inside a function definition -- confirm in the grammar */
35 
36 int	lineno	= 1;	/* current line: input() adds one per '\n', unput() takes it back */
37 int	bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" when it goes negative */
38 int	brackcnt  = 0;	/* same bookkeeping for '[' and ']' */
39 int	parencnt = 0;	/* same bookkeeping for '(' and ')' */
40 
/* One entry of the keywords[] table searched by word(). */
41 typedef struct Keyword {
42 	const char *word;	/* spelling; binsearch() compares it with strcmp() */
43 	int	sub;	/* stored in yylval.i by word() when the name matches */
44 	int	type;	/* token code word() returns for this name */
45 } Keyword;
46 
/*
 * Reserved words and built-in function names.
 * Columns: spelling, value placed in yylval.i, token code returned.
 * Entries must stay in strcmp() order because binsearch() bisects the
 * table; upper-case names therefore come before lower-case ones.
 */
47 Keyword keywords[] ={	/* keep sorted: binary searched */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "and",	FAND,		BLTIN },
52 	{ "atan2",	FATAN,		BLTIN },
53 	{ "break",	BREAK,		BREAK },
54 	{ "close",	CLOSE,		CLOSE },
55 	{ "compl",	FCOMPL,		BLTIN },
56 	{ "continue",	CONTINUE,	CONTINUE },
57 	{ "cos",	FCOS,		BLTIN },
58 	{ "delete",	DELETE,		DELETE },
59 	{ "do",		DO,		DO },
60 	{ "else",	ELSE,		ELSE },
61 	{ "exit",	EXIT,		EXIT },
62 	{ "exp",	FEXP,		BLTIN },
63 	{ "fflush",	FFLUSH,		BLTIN },
64 	{ "for",	FOR,		FOR },
65 	{ "func",	FUNC,		FUNC },
66 	{ "function",	FUNC,		FUNC },
67 	{ "getline",	GETLINE,	GETLINE },
68 	{ "gsub",	GSUB,		GSUB },
69 	{ "if",		IF,		IF },
70 	{ "in",		IN,		IN },
71 	{ "index",	INDEX,		INDEX },
72 	{ "int",	FINT,		BLTIN },
73 	{ "length",	FLENGTH,	BLTIN },
74 	{ "log",	FLOG,		BLTIN },
75 	{ "lshift",	FLSHIFT,	BLTIN },
76 	{ "match",	MATCHFCN,	MATCHFCN },
77 	{ "next",	NEXT,		NEXT },
78 	{ "nextfile",	NEXTFILE,	NEXTFILE },
79 	{ "or",		FFOR,		BLTIN },
80 	{ "print",	PRINT,		PRINT },
81 	{ "printf",	PRINTF,		PRINTF },
82 	{ "rand",	FRAND,		BLTIN },
83 	{ "return",	RETURN,		RETURN },
84 	{ "rshift",	FRSHIFT,	BLTIN },
85 	{ "sin",	FSIN,		BLTIN },
86 	{ "split",	SPLIT,		SPLIT },
87 	{ "sprintf",	SPRINTF,	SPRINTF },
88 	{ "sqrt",	FSQRT,		BLTIN },
89 	{ "srand",	FSRAND,		BLTIN },
90 	{ "sub",	SUB,		SUB },
91 	{ "substr",	SUBSTR,		SUBSTR },
92 	{ "system",	FSYSTEM,	BLTIN },
93 	{ "tolower",	FTOLOWER,	BLTIN },
94 	{ "toupper",	FTOUPPER,	BLTIN },
95 	{ "while",	WHILE,		WHILE },
96 	{ "xor",	FXOR,		BLTIN },
97 };
98 
/*
 * RET(x): return token x from a lexer routine.  With DEBUG defined the
 * token name is printed first whenever dbg is nonzero.
 *
 * The debug form is wrapped in do { } while (0) so that "RET(x);" is a
 * single statement; the former bare-brace form broke when used as the
 * unbraced body of an if that has an else.  Note that x is evaluated
 * twice in the debug form, so it must have no side effects.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define	RET(x)	return (x)
#endif
105 
/* Forward declarations for helpers defined later in this file. */
106 int peek(void);	/* look at the next input character without consuming it */
107 int gettok(char **, int *);	/* args: address of token buffer, address of its size */
108 int binsearch(char *, Keyword *, int);	/* args: word, table, number of entries */
109 
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and handed straight back with
 * unput(), so the following input() call returns the same value
 * (0 at end of input).
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
116 
/*
 * gettok: read the next raw token into the buffer at *pbuf.
 *
 * pbuf, psz: address of the token buffer and of its size; both are
 *            updated when the buffer had to be enlarged.
 *
 * Returns 0 at end of input; the character itself for anything that is
 * not a letter, digit, '.' or '_'; 'a' for a name; '0' for a number; or,
 * when digit-like text turns out not to be a number, its first character.
 * In every non-zero case the token text is left NUL-terminated in *pbuf.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* <ctype.h> arguments must be representable as unsigned char */
	if (!isalnum((unsigned char)c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char)c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminator.
			 * Testing only "bp-buf >= sz" let the buffer fill
			 * completely, so a name that ended at end of input
			 * had its NUL written one byte past the buffer.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char)c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same headroom rule as for names */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char)c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		unputstr(rem);		/* put rest back for later */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* so return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			rem[0] = 0;	/* so truncate where failure started */
			retc = '0';	/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
179 
/* Token-level helpers called from yylex(), plus its two one-shot flags. */
180 int	word(char *);	/* classify a name: keyword, ARG, CALL or VAR */
181 int	string(void);	/* scan a string literal after its opening quote */
182 int	regexpr(void);	/* scan a regular expression body */
183 int	sc	= 0;	/* 1 => return a } right now (set after yylex returns ';' for '}') */
184 int	reg	= 0;	/* 1 => return a REGEXPR now (set by startreg()) */
185 
186 int yylex(void)
187 {
188 	int c;
189 	static char *buf = 0;
190 	static int bufsize = 500;
191 
192 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
193 		FATAL( "out of space in yylex" );
194 	if (sc) {
195 		sc = 0;
196 		RET('}');
197 	}
198 	if (reg) {
199 		reg = 0;
200 		return regexpr();
201 	}
202 /* printf("top\n"); */
203 	for (;;) {
204 		c = gettok(&buf, &bufsize);
205 /* printf("gettok [%s]\n", buf); */
206 		if (c == 0)
207 			return 0;
208 		if (isalpha(c) || c == '_')
209 			return word(buf);
210 		if (isdigit(c)) {
211 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
212 			/* should this also have STR set? */
213 			RET(NUMBER);
214 		}
215 
216 		yylval.i = c;
217 		switch (c) {
218 		case '\n':	/* {EOL} */
219 			RET(NL);
220 		case '\r':	/* assume \n is coming */
221 		case ' ':	/* {WS}+ */
222 		case '\t':
223 			break;
224 		case '#':	/* #.* strip comments */
225 			while ((c = input()) != '\n' && c != 0)
226 				;
227 			unput(c);
228 			break;
229 		case ';':
230 			RET(';');
231 		case '\\':
232 			if (peek() == '\n') {
233 				input();
234 			} else if (peek() == '\r') {
235 				input(); input();	/* \n */
236 				lineno++;
237 			} else {
238 				RET(c);
239 			}
240 			break;
241 		case '&':
242 			if (peek() == '&') {
243 				input(); RET(AND);
244 			} else
245 				RET('&');
246 		case '|':
247 			if (peek() == '|') {
248 				input(); RET(BOR);
249 			} else
250 				RET('|');
251 		case '!':
252 			if (peek() == '=') {
253 				input(); yylval.i = NE; RET(NE);
254 			} else if (peek() == '~') {
255 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
256 			} else
257 				RET(NOT);
258 		case '~':
259 			yylval.i = MATCH;
260 			RET(MATCHOP);
261 		case '<':
262 			if (peek() == '=') {
263 				input(); yylval.i = LE; RET(LE);
264 			} else {
265 				yylval.i = LT; RET(LT);
266 			}
267 		case '=':
268 			if (peek() == '=') {
269 				input(); yylval.i = EQ; RET(EQ);
270 			} else {
271 				yylval.i = ASSIGN; RET(ASGNOP);
272 			}
273 		case '>':
274 			if (peek() == '=') {
275 				input(); yylval.i = GE; RET(GE);
276 			} else if (peek() == '>') {
277 				input(); yylval.i = APPEND; RET(APPEND);
278 			} else {
279 				yylval.i = GT; RET(GT);
280 			}
281 		case '+':
282 			if (peek() == '+') {
283 				input(); yylval.i = INCR; RET(INCR);
284 			} else if (peek() == '=') {
285 				input(); yylval.i = ADDEQ; RET(ASGNOP);
286 			} else
287 				RET('+');
288 		case '-':
289 			if (peek() == '-') {
290 				input(); yylval.i = DECR; RET(DECR);
291 			} else if (peek() == '=') {
292 				input(); yylval.i = SUBEQ; RET(ASGNOP);
293 			} else
294 				RET('-');
295 		case '*':
296 			if (peek() == '=') {	/* *= */
297 				input(); yylval.i = MULTEQ; RET(ASGNOP);
298 			} else if (peek() == '*') {	/* ** or **= */
299 				input();	/* eat 2nd * */
300 				if (peek() == '=') {
301 					input(); yylval.i = POWEQ; RET(ASGNOP);
302 				} else {
303 					RET(POWER);
304 				}
305 			} else
306 				RET('*');
307 		case '/':
308 			RET('/');
309 		case '%':
310 			if (peek() == '=') {
311 				input(); yylval.i = MODEQ; RET(ASGNOP);
312 			} else
313 				RET('%');
314 		case '^':
315 			if (peek() == '=') {
316 				input(); yylval.i = POWEQ; RET(ASGNOP);
317 			} else
318 				RET(POWER);
319 
320 		case '$':
321 			/* BUG: awkward, if not wrong */
322 			c = gettok(&buf, &bufsize);
323 			if (isalpha(c)) {
324 				if (strcmp(buf, "NF") == 0) {	/* very special */
325 					unputstr("(NF)");
326 					RET(INDIRECT);
327 				}
328 				c = peek();
329 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
330 					unputstr(buf);
331 					RET(INDIRECT);
332 				}
333 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
334 				RET(IVAR);
335 			} else if (c == 0) {	/*  */
336 				SYNTAX( "unexpected end of input after $" );
337 				RET(';');
338 			} else {
339 				unputstr(buf);
340 				RET(INDIRECT);
341 			}
342 
343 		case '}':
344 			if (--bracecnt < 0)
345 				SYNTAX( "extra }" );
346 			sc = 1;
347 			RET(';');
348 		case ']':
349 			if (--brackcnt < 0)
350 				SYNTAX( "extra ]" );
351 			RET(']');
352 		case ')':
353 			if (--parencnt < 0)
354 				SYNTAX( "extra )" );
355 			RET(')');
356 		case '{':
357 			bracecnt++;
358 			RET('{');
359 		case '[':
360 			brackcnt++;
361 			RET('[');
362 		case '(':
363 			parencnt++;
364 			RET('(');
365 
366 		case '"':
367 			return string();	/* BUG: should be like tran.c ? */
368 
369 		default:
370 			RET(c);
371 		}
372 	}
373 }
374 
375 int string(void)
376 {
377 	int c, n;
378 	char *s, *bp;
379 	static char *buf = 0;
380 	static int bufsz = 500;
381 
382 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
383 		FATAL("out of space for strings");
384 	for (bp = buf; (c = input()) != '"'; ) {
385 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
386 			FATAL("out of space for string %.10s...", buf);
387 		switch (c) {
388 		case '\n':
389 		case '\r':
390 		case 0:
391 			SYNTAX( "non-terminated string %.10s...", buf );
392 			lineno++;
393 			if (c == 0)	/* hopeless */
394 				FATAL( "giving up" );
395 			break;
396 		case '\\':
397 			c = input();
398 			switch (c) {
399 			case '"': *bp++ = '"'; break;
400 			case 'n': *bp++ = '\n'; break;
401 			case 't': *bp++ = '\t'; break;
402 			case 'f': *bp++ = '\f'; break;
403 			case 'r': *bp++ = '\r'; break;
404 			case 'b': *bp++ = '\b'; break;
405 			case 'v': *bp++ = '\v'; break;
406 			case 'a': *bp++ = '\007'; break;
407 			case '\\': *bp++ = '\\'; break;
408 
409 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
410 			case '3': case '4': case '5': case '6': case '7':
411 				n = c - '0';
412 				if ((c = peek()) >= '0' && c < '8') {
413 					n = 8 * n + input() - '0';
414 					if ((c = peek()) >= '0' && c < '8')
415 						n = 8 * n + input() - '0';
416 				}
417 				*bp++ = n;
418 				break;
419 
420 			case 'x':	/* hex  \x0-9a-fA-F + */
421 			    {	char xbuf[100], *px;
422 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
423 					if (isdigit(c)
424 					 || (c >= 'a' && c <= 'f')
425 					 || (c >= 'A' && c <= 'F'))
426 						*px++ = c;
427 					else
428 						break;
429 				}
430 				*px = 0;
431 				unput(c);
432 	  			sscanf(xbuf, "%x", &n);
433 				*bp++ = n;
434 				break;
435 			    }
436 
437 			default:
438 				*bp++ = c;
439 				break;
440 			}
441 			break;
442 		default:
443 			*bp++ = c;
444 			break;
445 		}
446 	}
447 	*bp = 0;
448 	s = tostring(buf);
449 	*bp++ = ' '; *bp++ = 0;
450 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
451 	RET(STRING);
452 }
453 
454 
455 int binsearch(char *w, Keyword *kp, int n)
456 {
457 	int cond, low, mid, high;
458 
459 	low = 0;
460 	high = n - 1;
461 	while (low <= high) {
462 		mid = (low + high) / 2;
463 		if ((cond = strcmp(w, kp[mid].word)) < 0)
464 			high = mid - 1;
465 		else if (cond > 0)
466 			low = mid + 1;
467 		else
468 			return mid;
469 	}
470 	return -1;
471 }
472 
473 int word(char *w)
474 {
475 	Keyword *kp;
476 	int c, n;
477 
478 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
479 	kp = keywords + n;
480 	if (n != -1) {	/* found in table */
481 		yylval.i = kp->sub;
482 		switch (kp->type) {	/* special handling */
483 		case FSYSTEM:
484 			if (safe)
485 				SYNTAX( "system is unsafe" );
486 			RET(kp->type);
487 		case FUNC:
488 			if (infunc)
489 				SYNTAX( "illegal nested function" );
490 			RET(kp->type);
491 		case RETURN:
492 			if (!infunc)
493 				SYNTAX( "return not in function" );
494 			RET(kp->type);
495 		case VARNF:
496 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
497 			RET(VARNF);
498 		default:
499 			RET(kp->type);
500 		}
501 	}
502 	c = peek();	/* look for '(' */
503 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
504 		yylval.i = n;
505 		RET(ARG);
506 	} else {
507 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
508 		if (c == '(') {
509 			RET(CALL);
510 		} else {
511 			RET(VAR);
512 		}
513 	}
514 }
515 
/*
 * startreg: arm the one-shot reg flag.  yylex() tests it on entry,
 * clears it, and returns the result of regexpr() instead of scanning
 * an ordinary token.
 */
516 void startreg(void)	/* next call to yylex will return a regular expression */
517 {
518 	reg = 1;
519 }
520 
521 int regexpr(void)
522 {
523 	int c, openclass = 0;
524 	static char *buf = 0;
525 	static int bufsz = 500;
526 	char *bp;
527 
528 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
529 		FATAL("out of space for rex expr");
530 	bp = buf;
531 	for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
532 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
533 			FATAL("out of space for reg expr %.10s...", buf);
534 		if (c == '\n') {
535 			SYNTAX( "newline in regular expression %.10s...", buf );
536 			unput('\n');
537 			break;
538 		} else if (c == '\\') {
539 			*bp++ = '\\';
540 			*bp++ = input();
541 		} else {
542 			if (c == '[')
543 				openclass = 1;
544 			else if (c == ']')
545 				openclass = 0;
546 			*bp++ = c;
547 		}
548 	}
549 	*bp = 0;
550 	if (c == 0)
551 		SYNTAX("non-terminated regular expression %.10s...", buf);
552 	yylval.s = tostring(buf);
553 	unput('/');
554 	RET(REGEXPR);
555 }
556 
557 /* low-level lexical stuff, sort of inherited from lex */
558 
/* State shared by input(), unput() and unputstr(). */
559 char	ebuf[300];	/* input() records each character it returns here, wrapping at the end */
560 char	*ep = ebuf;	/* next slot in ebuf; unput() steps it back */
561 char	yysbuf[100];	/* pushback buffer */
562 char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
563 FILE	*yyin = 0;	/* not referenced by the routines in this part of the file */
564 
565 int input(void)	/* get next lexical input character */
566 {
567 	int c;
568 	extern char *lexprog;
569 
570 	if (yysptr > yysbuf)
571 		c = (uschar)*--yysptr;
572 	else if (lexprog != NULL) {	/* awk '...' */
573 		if ((c = (uschar)*lexprog) != 0)
574 			lexprog++;
575 	} else				/* awk -f ... */
576 		c = pgetc();
577 	if (c == '\n')
578 		lineno++;
579 	else if (c == EOF)
580 		c = 0;
581 	if (ep >= ebuf + sizeof ebuf)
582 		ep = ebuf;
583 	return *ep++ = c;
584 }
585 
586 void unput(int c)	/* put lexical character back on input */
587 {
588 	if (c == '\n')
589 		lineno--;
590 	if (yysptr >= yysbuf + sizeof(yysbuf))
591 		FATAL("pushed back too much: %.20s...", yysbuf);
592 	*yysptr++ = c;
593 	if (--ep < ebuf)
594 		ep = ebuf + sizeof(ebuf) - 1;
595 }
596 
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are handed to unput() last-to-first, so subsequent input()
 * calls deliver s in its original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
604