xref: /openbsd-src/usr.bin/awk/lex.c (revision b2ea75c1b17e1a9a339660e7ed45cd24946b230e)
1 /*	$OpenBSD: lex.c,v 1.4 1999/12/08 23:09:45 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "ytab.h"
32 
33 extern YYSTYPE	yylval;
34 extern int	infunc;
35 
/* Position and nesting state maintained by the lexer. */
int	lineno = 1;	/* current source line: input() adds one per '\n', unput() takes it back */
int	bracecnt = 0;	/* '{' seen so far minus '}' seen so far */
int	brackcnt = 0;	/* '[' seen so far minus ']' seen so far */
int	parencnt = 0;	/* '(' seen so far minus ')' seen so far */
40 
/*
 * One entry of the reserved-word table.
 *	word	spelling of the keyword or built-in as written in a program
 *	sub	value stored in yylval.i when the word is recognized
 *	type	token code the lexer returns for the word
 */
typedef struct Keyword {
	const char	*word;	/* always a string literal; never written through */
	int	sub;
	int	type;
} Keyword;
46 
47 Keyword keywords[] ={	/* keep sorted: binary searched */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "atan2",	FATAN,		BLTIN },
52 	{ "break",	BREAK,		BREAK },
53 	{ "close",	CLOSE,		CLOSE },
54 	{ "continue",	CONTINUE,	CONTINUE },
55 	{ "cos",	FCOS,		BLTIN },
56 	{ "delete",	DELETE,		DELETE },
57 	{ "do",		DO,		DO },
58 	{ "else",	ELSE,		ELSE },
59 	{ "exit",	EXIT,		EXIT },
60 	{ "exp",	FEXP,		BLTIN },
61 	{ "fflush",	FFLUSH,		BLTIN },
62 	{ "for",	FOR,		FOR },
63 	{ "func",	FUNC,		FUNC },
64 	{ "function",	FUNC,		FUNC },
65 	{ "getline",	GETLINE,	GETLINE },
66 	{ "gsub",	GSUB,		GSUB },
67 	{ "if",		IF,		IF },
68 	{ "in",		IN,		IN },
69 	{ "index",	INDEX,		INDEX },
70 	{ "int",	FINT,		BLTIN },
71 	{ "length",	FLENGTH,	BLTIN },
72 	{ "log",	FLOG,		BLTIN },
73 	{ "match",	MATCHFCN,	MATCHFCN },
74 	{ "next",	NEXT,		NEXT },
75 	{ "nextfile",	NEXTFILE,	NEXTFILE },
76 	{ "print",	PRINT,		PRINT },
77 	{ "printf",	PRINTF,		PRINTF },
78 	{ "rand",	FRAND,		BLTIN },
79 	{ "return",	RETURN,		RETURN },
80 	{ "sin",	FSIN,		BLTIN },
81 	{ "split",	SPLIT,		SPLIT },
82 	{ "sprintf",	SPRINTF,	SPRINTF },
83 	{ "sqrt",	FSQRT,		BLTIN },
84 	{ "srand",	FSRAND,		BLTIN },
85 	{ "sub",	SUB,		SUB },
86 	{ "substr",	SUBSTR,		SUBSTR },
87 	{ "system",	FSYSTEM,	BLTIN },
88 	{ "tolower",	FTOLOWER,	BLTIN },
89 	{ "toupper",	FTOUPPER,	BLTIN },
90 	{ "while",	WHILE,		WHILE },
91 };
92 
/*
 * RET(x): return token x from the enclosing lexer function.
 *
 * With DEBUG defined, the token's name is printed first whenever dbg is
 * non-zero.  The debug form is wrapped in do { } while (0) so that
 * "RET(x);" is exactly one statement and can be used as the body of an
 * unbraced if/else.  x is expanded twice in the debug form, so it must
 * not have side effects.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define	RET(x)	return (x)
#endif
99 
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately handed back
 * with unput(), so the following input() returns it again.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
106 
/*
 * gettok: collect the next raw token into the buffer *pbuf.
 *
 *	pbuf	address of the token buffer; rewritten on return because
 *		adjbuf() is given the chance to replace the buffer
 *	psz	address of the buffer's size; rewritten likewise
 *
 * Returns 0 at end of input (the buffer is left untouched in that case).
 * For a character that cannot start a name or number, the buffer holds
 * just that character and the character itself is returned.  Otherwise
 * a name ([A-Za-z_][A-Za-z0-9_]*) or a number is gathered and the first
 * character of the buffer is returned.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* <ctype.h> needs an unsigned char value; c may be a negative char */
	if (!isalnum((unsigned char)c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char)c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the final NUL */
			if (bp-buf+2 > sz) {
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			}
			if (isalnum((unsigned char)c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
	} else {	/* it's a number, or a '.' that is not one */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the final NUL */
			if (bp-buf+2 > sz) {
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			}
			if (isdigit((unsigned char)c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {
			/*
			 * Nothing was converted (e.g. a lone "." or ".e5").
			 * Emptying the buffer here would make us return 0,
			 * which the caller takes for end of input, so report
			 * the problem and hand back just the first character.
			 */
			SYNTAX( "illegal number %.10s...", buf );
			unputstr(buf+1);	/* put rest back for later */
			buf[1] = 0;
		} else {
			unputstr(rem);		/* put rest back for later */
			rem[0] = 0;		/* keep only the numeric prefix */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return buf[0];
}
161 
/* helpers that yylex() hands off to; defined further down */
int	word(char *);
int	string(void);
int	regexpr(void);

/* one-shot flags examined at the top of yylex() */
int	sc = 0;		/* non-zero: the next call returns '}' and clears the flag */
int	reg = 0;	/* non-zero: the next call returns regexpr() and clears the flag */
167 
168 int yylex(void)
169 {
170 	int c;
171 	static char *buf = 0;
172 	static int bufsize = 500;
173 
174 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
175 		FATAL( "out of space in yylex" );
176 	if (sc) {
177 		sc = 0;
178 		RET('}');
179 	}
180 	if (reg) {
181 		reg = 0;
182 		return regexpr();
183 	}
184 	for (;;) {
185 		c = gettok(&buf, &bufsize);
186 		if (c == 0)
187 			return 0;
188 		if (isalpha(c) || c == '_')
189 			return word(buf);
190 		if (isdigit(c) || c == '.') {
191 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
192 			/* should this also have STR set? */
193 			RET(NUMBER);
194 		}
195 
196 		yylval.i = c;
197 		switch (c) {
198 		case '\n':	/* {EOL} */
199 			RET(NL);
200 		case '\r':	/* assume \n is coming */
201 		case ' ':	/* {WS}+ */
202 		case '\t':
203 			break;
204 		case '#':	/* #.* strip comments */
205 			while ((c = input()) != '\n' && c != 0)
206 				;
207 			unput(c);
208 			break;
209 		case ';':
210 			RET(';');
211 		case '\\':
212 			if (peek() == '\n') {
213 				input();
214 			} else if (peek() == '\r') {
215 				input(); input();	/* \n */
216 				lineno++;
217 			} else {
218 				RET(c);
219 			}
220 			break;
221 		case '&':
222 			if (peek() == '&') {
223 				input(); RET(AND);
224 			} else
225 				RET('&');
226 		case '|':
227 			if (peek() == '|') {
228 				input(); RET(BOR);
229 			} else
230 				RET('|');
231 		case '!':
232 			if (peek() == '=') {
233 				input(); yylval.i = NE; RET(NE);
234 			} else if (peek() == '~') {
235 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
236 			} else
237 				RET(NOT);
238 		case '~':
239 			yylval.i = MATCH;
240 			RET(MATCHOP);
241 		case '<':
242 			if (peek() == '=') {
243 				input(); yylval.i = LE; RET(LE);
244 			} else {
245 				yylval.i = LT; RET(LT);
246 			}
247 		case '=':
248 			if (peek() == '=') {
249 				input(); yylval.i = EQ; RET(EQ);
250 			} else {
251 				yylval.i = ASSIGN; RET(ASGNOP);
252 			}
253 		case '>':
254 			if (peek() == '=') {
255 				input(); yylval.i = GE; RET(GE);
256 			} else if (peek() == '>') {
257 				input(); yylval.i = APPEND; RET(APPEND);
258 			} else {
259 				yylval.i = GT; RET(GT);
260 			}
261 		case '+':
262 			if (peek() == '+') {
263 				input(); yylval.i = INCR; RET(INCR);
264 			} else if (peek() == '=') {
265 				input(); yylval.i = ADDEQ; RET(ASGNOP);
266 			} else
267 				RET('+');
268 		case '-':
269 			if (peek() == '-') {
270 				input(); yylval.i = DECR; RET(DECR);
271 			} else if (peek() == '=') {
272 				input(); yylval.i = SUBEQ; RET(ASGNOP);
273 			} else
274 				RET('-');
275 		case '*':
276 			if (peek() == '=') {	/* *= */
277 				input(); yylval.i = MULTEQ; RET(ASGNOP);
278 			} else if (peek() == '*') {	/* ** or **= */
279 				input();	/* eat 2nd * */
280 				if (peek() == '=') {
281 					input(); yylval.i = POWEQ; RET(ASGNOP);
282 				} else {
283 					RET(POWER);
284 				}
285 			} else
286 				RET('*');
287 		case '/':
288 			RET('/');
289 		case '%':
290 			if (peek() == '=') {
291 				input(); yylval.i = MODEQ; RET(ASGNOP);
292 			} else
293 				RET('%');
294 		case '^':
295 			if (peek() == '=') {
296 				input(); yylval.i = POWEQ; RET(ASGNOP);
297 			} else
298 				RET(POWER);
299 
300 		case '$':
301 			/* BUG: awkward, if not wrong */
302 			c = gettok(&buf, &bufsize);
303 			if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
304 				unputstr(buf);
305 				RET(INDIRECT);
306 			} else if (isalpha(c)) {
307 				if (strcmp(buf, "NF") == 0) {	/* very special */
308 					unputstr("(NF)");
309 					RET(INDIRECT);
310 				}
311 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
312 				RET(IVAR);
313 			} else {
314 				unputstr(buf);
315 				RET(INDIRECT);
316 			}
317 
318 		case '}':
319 			if (--bracecnt < 0)
320 				SYNTAX( "extra }" );
321 			sc = 1;
322 			RET(';');
323 		case ']':
324 			if (--brackcnt < 0)
325 				SYNTAX( "extra ]" );
326 			RET(']');
327 		case ')':
328 			if (--parencnt < 0)
329 				SYNTAX( "extra )" );
330 			RET(')');
331 		case '{':
332 			bracecnt++;
333 			RET('{');
334 		case '[':
335 			brackcnt++;
336 			RET('[');
337 		case '(':
338 			parencnt++;
339 			RET('(');
340 
341 		case '"':
342 			return string();	/* BUG: should be like tran.c ? */
343 
344 		default:
345 			RET(c);
346 		}
347 	}
348 }
349 
350 int string(void)
351 {
352 	int c, n;
353 	char *s, *bp;
354 	static char *buf = 0;
355 	static int bufsz = 500;
356 
357 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
358 		FATAL("out of space for strings");
359 	for (bp = buf; (c = input()) != '"'; ) {
360 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
361 			FATAL("out of space for string %.10s...", buf);
362 		switch (c) {
363 		case '\n':
364 		case '\r':
365 		case 0:
366 			SYNTAX( "non-terminated string %.10s...", buf );
367 			lineno++;
368 			break;
369 		case '\\':
370 			c = input();
371 			switch (c) {
372 			case '"': *bp++ = '"'; break;
373 			case 'n': *bp++ = '\n'; break;
374 			case 't': *bp++ = '\t'; break;
375 			case 'f': *bp++ = '\f'; break;
376 			case 'r': *bp++ = '\r'; break;
377 			case 'b': *bp++ = '\b'; break;
378 			case 'v': *bp++ = '\v'; break;
379 			case 'a': *bp++ = '\007'; break;
380 			case '\\': *bp++ = '\\'; break;
381 
382 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
383 			case '3': case '4': case '5': case '6': case '7':
384 				n = c - '0';
385 				if ((c = peek()) >= '0' && c < '8') {
386 					n = 8 * n + input() - '0';
387 					if ((c = peek()) >= '0' && c < '8')
388 						n = 8 * n + input() - '0';
389 				}
390 				*bp++ = n;
391 				break;
392 
393 			case 'x':	/* hex  \x0-9a-fA-F + */
394 			    {	char xbuf[100], *px;
395 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
396 					if (isdigit(c)
397 					 || (c >= 'a' && c <= 'f')
398 					 || (c >= 'A' && c <= 'F'))
399 						*px++ = c;
400 					else
401 						break;
402 				}
403 				*px = 0;
404 				unput(c);
405 	  			sscanf(xbuf, "%x", &n);
406 				*bp++ = n;
407 				break;
408 			    }
409 
410 			default:
411 				*bp++ = c;
412 				break;
413 			}
414 			break;
415 		default:
416 			*bp++ = c;
417 			break;
418 		}
419 	}
420 	*bp = 0;
421 	s = tostring(buf);
422 	*bp++ = ' '; *bp++ = 0;
423 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
424 	RET(STRING);
425 }
426 
427 
428 int binsearch(char *w, Keyword *kp, int n)
429 {
430 	int cond, low, mid, high;
431 
432 	low = 0;
433 	high = n - 1;
434 	while (low <= high) {
435 		mid = (low + high) / 2;
436 		if ((cond = strcmp(w, kp[mid].word)) < 0)
437 			high = mid - 1;
438 		else if (cond > 0)
439 			low = mid + 1;
440 		else
441 			return mid;
442 	}
443 	return -1;
444 }
445 
446 int word(char *w)
447 {
448 	Keyword *kp;
449 	int c, n;
450 
451 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
452 	kp = keywords + n;
453 	if (n != -1) {	/* found in table */
454 		yylval.i = kp->sub;
455 		switch (kp->type) {	/* special handling */
456 		case FSYSTEM:
457 			if (safe)
458 				SYNTAX( "system is unsafe" );
459 			RET(kp->type);
460 		case FUNC:
461 			if (infunc)
462 				SYNTAX( "illegal nested function" );
463 			RET(kp->type);
464 		case RETURN:
465 			if (!infunc)
466 				SYNTAX( "return not in function" );
467 			RET(kp->type);
468 		case VARNF:
469 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
470 			RET(VARNF);
471 		default:
472 			RET(kp->type);
473 		}
474 	}
475 	c = peek();	/* look for '(' */
476 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
477 		yylval.i = n;
478 		RET(ARG);
479 	} else {
480 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
481 		if (c == '(') {
482 			RET(CALL);
483 		} else {
484 			RET(VAR);
485 		}
486 	}
487 }
488 
489 void startreg(void)	/* next call to yyles will return a regular expression */
490 {
491 	reg = 1;
492 }
493 
494 int regexpr(void)
495 {
496 	int c;
497 	static char *buf = 0;
498 	static int bufsz = 500;
499 	char *bp;
500 
501 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
502 		FATAL("out of space for rex expr");
503 	bp = buf;
504 	for ( ; (c = input()) != '/' && c != 0; ) {
505 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
506 			FATAL("out of space for reg expr %.10s...", buf);
507 		if (c == '\n') {
508 			SYNTAX( "newline in regular expression %.10s...", buf );
509 			unput('\n');
510 			break;
511 		} else if (c == '\\') {
512 			*bp++ = '\\';
513 			*bp++ = input();
514 		} else {
515 			*bp++ = c;
516 		}
517 	}
518 	*bp = 0;
519 	yylval.s = tostring(buf);
520 	unput('/');
521 	RET(REGEXPR);
522 }
523 
524 /* low-level lexical stuff, sort of inherited from lex */
525 
char	ebuf[300];		/* ring of the characters most recently returned by input() */
char	*ep = ebuf;		/* slot in ebuf that the next character goes into */

char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the newest pushed-back character */

FILE	*yyin = NULL;
531 
532 int input(void)	/* get next lexical input character */
533 {
534 	int c;
535 	extern char *lexprog;
536 
537 	if (yysptr > yysbuf)
538 		c = *--yysptr;
539 	else if (lexprog != NULL) {	/* awk '...' */
540 		if ((c = *lexprog) != 0)
541 			lexprog++;
542 	} else				/* awk -f ... */
543 		c = pgetc();
544 	if (c == '\n')
545 		lineno++;
546 	else if (c == EOF)
547 		c = 0;
548 	if (ep >= ebuf + sizeof ebuf)
549 		ep = ebuf;
550 	return *ep++ = c;
551 }
552 
553 void unput(int c)	/* put lexical character back on input */
554 {
555 	if (c == '\n')
556 		lineno--;
557 	if (yysptr >= yysbuf + sizeof(yysbuf))
558 		FATAL("pushed back too much: %.20s...", yysbuf);
559 	*yysptr++ = c;
560 	if (--ep < ebuf)
561 		ep = ebuf + sizeof(ebuf) - 1;
562 }
563 
/*
 * unputstr: push the whole string s back on the input.  The characters
 * are handed to unput() last one first, so input() will deliver them
 * again in their original order.
 */
void unputstr(char *s)
{
	char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
571