xref: /plan9-contrib/sys/src/cmd/awk/lex.c (revision a2c41696452f8a895ad2951a6355034fbc3034ed)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "y.tab.h"
31 
32 extern YYSTYPE	yylval;
33 extern int	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
/*
 * One entry of the reserved-word table:
 *	word	spelling of the keyword or built-in name
 *	sub	value word() stores in yylval.i when the name is recognized
 *	type	token code word() returns to the parser
 */
struct Keyword {
	const char *word;
	int	sub;
	int	type;
};
typedef struct Keyword Keyword;
45 
46 Keyword keywords[] ={	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "utf",	FUTF,		BLTIN },
90 	{ "while",	WHILE,		WHILE },
91 };
92 
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name (via tokname()) when dbg is set.  Wrapped in do { } while (0) so
 * that "RET(x);" is exactly one statement and stays correct as the
 * unbraced arm of an if/else.  x is evaluated more than once, so pass
 * only side-effect-free expressions.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
94 
/* peek: report the next input character without consuming it. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* push it straight back for the following input() */
	return next;
}
101 
/*
 * gettok: read the next raw token from the input into the caller's buffer.
 *
 *	pbuf, psz	addresses of the buffer pointer and its size.  The
 *			buffer must hold at least 2 bytes on entry; it may be
 *			enlarged through adjbuf(), in which case *pbuf and
 *			*psz are updated before a name or number is returned.
 *
 * Returns
 *	0	at end of input (the buffer is not touched);
 *	'a'	when the buffer holds a name ([A-Za-z_][A-Za-z0-9_]*);
 *	'0'	when the buffer holds a number accepted by strtod();
 *	else	the single character read, also left in the buffer as a
 *		one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Reserve room for this character AND the NUL that
			 * follows the loop.  Growing only once bp had reached
			 * buf+sz let a name that exactly filled the buffer and
			 * was ended by end of input write its NUL at buf[sz].
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);	/* first character past the name */
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character plus NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		/*
		 * The loop above over-collects (e.g. "1+2" or "1e"); strtod
		 * decides how much of it really is a number.
		 */
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
			unputstr(rem+1); /* put rest back for later */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
164 
/* Helpers defined further down; yylex() calls them before their definitions. */
int	regexpr(void);
int	string(void);
int	word(char *);

/* One-shot flags that yylex() tests, and clears, on entry. */
int	reg = 0;	/* 1 => return a REGEXPR now */
int	sc = 0;		/* 1 => return a } right now */
170 
yylex(void)171 int yylex(void)
172 {
173 	int c;
174 	static char *buf = 0;
175 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
176 
177 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
178 		FATAL( "out of space in yylex" );
179 	if (sc) {
180 		sc = 0;
181 		RET('}');
182 	}
183 	if (reg) {
184 		reg = 0;
185 		return regexpr();
186 	}
187 	for (;;) {
188 		c = gettok(&buf, &bufsize);
189 		if (c == 0)
190 			return 0;
191 		if (isalpha(c) || c == '_')
192 			return word(buf);
193 		if (isdigit(c)) {
194 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
195 			/* should this also have STR set? */
196 			RET(NUMBER);
197 		}
198 
199 		yylval.i = c;
200 		switch (c) {
201 		case '\n':	/* {EOL} */
202 			RET(NL);
203 		case '\r':	/* assume \n is coming */
204 		case ' ':	/* {WS}+ */
205 		case '\t':
206 			break;
207 		case '#':	/* #.* strip comments */
208 			while ((c = input()) != '\n' && c != 0)
209 				;
210 			unput(c);
211 			break;
212 		case ';':
213 			RET(';');
214 		case '\\':
215 			if (peek() == '\n') {
216 				input();
217 			} else if (peek() == '\r') {
218 				input(); input();	/* \n */
219 				lineno++;
220 			} else {
221 				RET(c);
222 			}
223 			break;
224 		case '&':
225 			if (peek() == '&') {
226 				input(); RET(AND);
227 			} else
228 				RET('&');
229 		case '|':
230 			if (peek() == '|') {
231 				input(); RET(BOR);
232 			} else
233 				RET('|');
234 		case '!':
235 			if (peek() == '=') {
236 				input(); yylval.i = NE; RET(NE);
237 			} else if (peek() == '~') {
238 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
239 			} else
240 				RET(NOT);
241 		case '~':
242 			yylval.i = MATCH;
243 			RET(MATCHOP);
244 		case '<':
245 			if (peek() == '=') {
246 				input(); yylval.i = LE; RET(LE);
247 			} else {
248 				yylval.i = LT; RET(LT);
249 			}
250 		case '=':
251 			if (peek() == '=') {
252 				input(); yylval.i = EQ; RET(EQ);
253 			} else {
254 				yylval.i = ASSIGN; RET(ASGNOP);
255 			}
256 		case '>':
257 			if (peek() == '=') {
258 				input(); yylval.i = GE; RET(GE);
259 			} else if (peek() == '>') {
260 				input(); yylval.i = APPEND; RET(APPEND);
261 			} else {
262 				yylval.i = GT; RET(GT);
263 			}
264 		case '+':
265 			if (peek() == '+') {
266 				input(); yylval.i = INCR; RET(INCR);
267 			} else if (peek() == '=') {
268 				input(); yylval.i = ADDEQ; RET(ASGNOP);
269 			} else
270 				RET('+');
271 		case '-':
272 			if (peek() == '-') {
273 				input(); yylval.i = DECR; RET(DECR);
274 			} else if (peek() == '=') {
275 				input(); yylval.i = SUBEQ; RET(ASGNOP);
276 			} else
277 				RET('-');
278 		case '*':
279 			if (peek() == '=') {	/* *= */
280 				input(); yylval.i = MULTEQ; RET(ASGNOP);
281 			} else if (peek() == '*') {	/* ** or **= */
282 				input();	/* eat 2nd * */
283 				if (peek() == '=') {
284 					input(); yylval.i = POWEQ; RET(ASGNOP);
285 				} else {
286 					RET(POWER);
287 				}
288 			} else
289 				RET('*');
290 		case '/':
291 			RET('/');
292 		case '%':
293 			if (peek() == '=') {
294 				input(); yylval.i = MODEQ; RET(ASGNOP);
295 			} else
296 				RET('%');
297 		case '^':
298 			if (peek() == '=') {
299 				input(); yylval.i = POWEQ; RET(ASGNOP);
300 			} else
301 				RET(POWER);
302 
303 		case '$':
304 			/* BUG: awkward, if not wrong */
305 			c = gettok(&buf, &bufsize);
306 			if (isalpha(c)) {
307 				if (strcmp(buf, "NF") == 0) {	/* very special */
308 					unputstr("(NF)");
309 					RET(INDIRECT);
310 				}
311 				c = peek();
312 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
313 					unputstr(buf);
314 					RET(INDIRECT);
315 				}
316 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
317 				RET(IVAR);
318 			} else if (c == 0) {	/*  */
319 				SYNTAX( "unexpected end of input after $" );
320 				RET(';');
321 			} else {
322 				unputstr(buf);
323 				RET(INDIRECT);
324 			}
325 
326 		case '}':
327 			if (--bracecnt < 0)
328 				SYNTAX( "extra }" );
329 			sc = 1;
330 			RET(';');
331 		case ']':
332 			if (--brackcnt < 0)
333 				SYNTAX( "extra ]" );
334 			RET(']');
335 		case ')':
336 			if (--parencnt < 0)
337 				SYNTAX( "extra )" );
338 			RET(')');
339 		case '{':
340 			bracecnt++;
341 			RET('{');
342 		case '[':
343 			brackcnt++;
344 			RET('[');
345 		case '(':
346 			parencnt++;
347 			RET('(');
348 
349 		case '"':
350 			return string();	/* BUG: should be like tran.c ? */
351 
352 		default:
353 			RET(c);
354 		}
355 	}
356 }
357 
string(void)358 int string(void)
359 {
360 	int c, n;
361 	char *s, *bp;
362 	static char *buf = 0;
363 	static int bufsz = 500;
364 
365 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
366 		FATAL("out of space for strings");
367 	for (bp = buf; (c = input()) != '"'; ) {
368 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
369 			FATAL("out of space for string %.10s...", buf);
370 		switch (c) {
371 		case '\n':
372 		case '\r':
373 		case 0:
374 			SYNTAX( "non-terminated string %.10s...", buf );
375 			lineno++;
376 			if (c == 0)	/* hopeless */
377 				FATAL( "giving up" );
378 			break;
379 		case '\\':
380 			c = input();
381 			switch (c) {
382 			case '"': *bp++ = '"'; break;
383 			case 'n': *bp++ = '\n'; break;
384 			case 't': *bp++ = '\t'; break;
385 			case 'f': *bp++ = '\f'; break;
386 			case 'r': *bp++ = '\r'; break;
387 			case 'b': *bp++ = '\b'; break;
388 			case 'v': *bp++ = '\v'; break;
389 			case 'a': *bp++ = '\007'; break;
390 			case '\\': *bp++ = '\\'; break;
391 
392 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
393 			case '3': case '4': case '5': case '6': case '7':
394 				n = c - '0';
395 				if ((c = peek()) >= '0' && c < '8') {
396 					n = 8 * n + input() - '0';
397 					if ((c = peek()) >= '0' && c < '8')
398 						n = 8 * n + input() - '0';
399 				}
400 				*bp++ = n;
401 				break;
402 
403 			case 'x':	/* hex  \x0-9a-fA-F + */
404 			    {	char xbuf[100], *px;
405 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
406 					if (isdigit(c)
407 					 || (c >= 'a' && c <= 'f')
408 					 || (c >= 'A' && c <= 'F'))
409 						*px++ = c;
410 					else
411 						break;
412 				}
413 				*px = 0;
414 				unput(c);
415 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
416 				*bp++ = n;
417 				break;
418 			    }
419 
420 			default:
421 				*bp++ = c;
422 				break;
423 			}
424 			break;
425 		default:
426 			*bp++ = c;
427 			break;
428 		}
429 	}
430 	*bp = 0;
431 	s = tostring(buf);
432 	*bp++ = ' '; *bp++ = 0;
433 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
434 	RET(STRING);
435 }
436 
437 
/*
 * binsearch: look w up among the n entries of kp, which must be sorted
 * by word in strcmp() order.  Returns the index of the matching entry,
 * or -1 when w is not present.
 */
int binsearch(char *w, Keyword *kp, int n)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;
		int order = strcmp(w, kp[probe].word);

		if (order == 0)
			return probe;
		if (order < 0)
			hi = probe - 1;	/* w sorts before the probe */
		else
			lo = probe + 1;	/* w sorts after the probe */
	}
	return -1;
}
455 
word(char * w)456 int word(char *w)
457 {
458 	Keyword *kp;
459 	int c, n;
460 
461 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
462 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
463 	kp = keywords + n;
464 	if (n != -1) {	/* found in table */
465 		yylval.i = kp->sub;
466 		switch (kp->type) {	/* special handling */
467 		case BLTIN:
468 			if (kp->sub == FSYSTEM && safe)
469 				SYNTAX( "system is unsafe" );
470 			RET(kp->type);
471 		case FUNC:
472 			if (infunc)
473 				SYNTAX( "illegal nested function" );
474 			RET(kp->type);
475 		case RETURN:
476 			if (!infunc)
477 				SYNTAX( "return not in function" );
478 			RET(kp->type);
479 		case VARNF:
480 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
481 			RET(VARNF);
482 		default:
483 			RET(kp->type);
484 		}
485 	}
486 	c = peek();	/* look for '(' */
487 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
488 		yylval.i = n;
489 		RET(ARG);
490 	} else {
491 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
492 		if (c == '(') {
493 			RET(CALL);
494 		} else {
495 			RET(VAR);
496 		}
497 	}
498 }
499 
startreg(void)500 void startreg(void)	/* next call to yylex will return a regular expression */
501 {
502 	reg = 1;
503 }
504 
regexpr(void)505 int regexpr(void)
506 {
507 	int c;
508 	static char *buf = 0;
509 	static int bufsz = 500;
510 	char *bp;
511 
512 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
513 		FATAL("out of space for rex expr");
514 	bp = buf;
515 	for ( ; (c = input()) != '/' && c != 0; ) {
516 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
517 			FATAL("out of space for reg expr %.10s...", buf);
518 		if (c == '\n') {
519 			SYNTAX( "newline in regular expression %.10s...", buf );
520 			unput('\n');
521 			break;
522 		} else if (c == '\\') {
523 			*bp++ = '\\';
524 			*bp++ = input();
525 		} else {
526 			*bp++ = c;
527 		}
528 	}
529 	*bp = 0;
530 	if (c == 0)
531 		SYNTAX("non-terminated regular expression %.10s...", buf);
532 	yylval.s = tostring(buf);
533 	unput('/');
534 	RET(REGEXPR);
535 }
536 
537 /* low-level lexical stuff, sort of inherited from lex */
538 
/* Ring of recently read characters: input() stores at ep, unput() steps ep back. */
char	ebuf[300], *ep = ebuf;

/* Stack of pushed-back characters; input() pops from it before reading anything new. */
char	yysbuf[100], *yysptr = yysbuf;	/* pushback buffer */

FILE	*yyin = NULL;	/* not referenced by the code in this file */
544 
input(void)545 int input(void)	/* get next lexical input character */
546 {
547 	int c;
548 	extern char *lexprog;
549 
550 	if (yysptr > yysbuf)
551 		c = (uschar)*--yysptr;
552 	else if (lexprog != NULL) {	/* awk '...' */
553 		if ((c = (uschar)*lexprog) != 0)
554 			lexprog++;
555 	} else				/* awk -f ... */
556 		c = pgetc();
557 	if (c == '\n')
558 		lineno++;
559 	else if (c == EOF)
560 		c = 0;
561 	if (ep >= ebuf + sizeof ebuf)
562 		ep = ebuf;
563 	return *ep++ = c;
564 }
565 
unput(int c)566 void unput(int c)	/* put lexical character back on input */
567 {
568 	if (c == '\n')
569 		lineno--;
570 	if (yysptr >= yysbuf + sizeof(yysbuf))
571 		FATAL("pushed back too much: %.20s...", yysbuf);
572 	*yysptr++ = c;
573 	if (--ep < ebuf)
574 		ep = ebuf + sizeof(ebuf) - 1;
575 }
576 
/*
 * unputstr: push the string s back onto the input.  Characters are
 * pushed last-to-first, so later input() calls read s in its original
 * order; an empty string pushes nothing.
 */
void unputstr(const char *s)
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
584