1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "y.tab.h"
31
32 extern YYSTYPE yylval;
33 extern int infunc;
34
int	lineno = 1;	/* source line counter: input() adds one per '\n', unput() takes it back */
int	bracecnt = 0;	/* yylex: +1 on '{', -1 on '}'; going negative reports "extra }" */
int	brackcnt = 0;	/* yylex: +1 on '[', -1 on ']'; going negative reports "extra ]" */
int	parencnt = 0;	/* yylex: +1 on '(', -1 on ')'; going negative reports "extra )" */
39
/* One entry of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling matched against the identifier */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token type word() switches on and returns */
} Keyword;
45
46 Keyword keywords[] ={ /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "utf", FUTF, BLTIN },
90 { "while", WHILE, WHILE },
91 };
92
/*
 * RET(x): return token x from the enclosing lexer function; when dbg is
 * set, first print the token's name (via tokname) on stdout.
 * Wrapped in do { } while (0) so that "RET(x);" is exactly one statement
 * (safe in an unbraced if/else), and x is evaluated only once.
 */
#define	RET(x)	do { \
		int ret_tok_ = (x); \
		if (dbg) \
			printf("lex %s\n", tokname(ret_tok_)); \
		return ret_tok_; \
	} while (0)
94
/*
 * peek: return the next input character without consuming it.
 * The character is read with input() and immediately pushed back with
 * unput(), so the following input() call yields the same value.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
101
/*
 * gettok: read the next raw token from input() into the buffer *pbuf.
 *
 *   pbuf, psz  address of the caller's buffer pointer and its size; both are
 *              updated on return if the buffer had to be grown with adjbuf().
 *              The buffer must hold at least 2 bytes on entry, because the
 *              first character and a NUL are stored before any size check.
 *
 * Returns
 *   0    at end of input (input() returned 0);
 *   'a'  for a name ([A-Za-z_][A-Za-z0-9_]*), text in the buffer;
 *   '0'  for a number, text in the buffer;
 *   c    any other character, returned as itself (buffer holds "c").
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for c AND the terminating NUL: the loop can
			 * also end on end-of-input, after which *bp = 0 below
			 * must still land inside the buffer.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same invariant as above: room for c plus NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		(void) strtod(buf, &rem);	/* parse the number; only rem is needed */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: rem+1 is buf+1,
			 * so cutting the buffer first would push back "".
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
164
/* helpers defined further down, used by yylex() */
int	word(char *);
int	string(void);
int	regexpr(void);

/* one-shot flags tested (and cleared) at the top of yylex() */
int	sc = 0;		/* nonzero: the next yylex() call returns '}' */
int	reg = 0;	/* nonzero: the next yylex() call returns regexpr(); set by startreg() */
170
yylex(void)171 int yylex(void)
172 {
173 int c;
174 static char *buf = 0;
175 static int bufsize = 5; /* BUG: setting this small causes core dump! */
176
177 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
178 FATAL( "out of space in yylex" );
179 if (sc) {
180 sc = 0;
181 RET('}');
182 }
183 if (reg) {
184 reg = 0;
185 return regexpr();
186 }
187 for (;;) {
188 c = gettok(&buf, &bufsize);
189 if (c == 0)
190 return 0;
191 if (isalpha(c) || c == '_')
192 return word(buf);
193 if (isdigit(c)) {
194 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
195 /* should this also have STR set? */
196 RET(NUMBER);
197 }
198
199 yylval.i = c;
200 switch (c) {
201 case '\n': /* {EOL} */
202 RET(NL);
203 case '\r': /* assume \n is coming */
204 case ' ': /* {WS}+ */
205 case '\t':
206 break;
207 case '#': /* #.* strip comments */
208 while ((c = input()) != '\n' && c != 0)
209 ;
210 unput(c);
211 break;
212 case ';':
213 RET(';');
214 case '\\':
215 if (peek() == '\n') {
216 input();
217 } else if (peek() == '\r') {
218 input(); input(); /* \n */
219 lineno++;
220 } else {
221 RET(c);
222 }
223 break;
224 case '&':
225 if (peek() == '&') {
226 input(); RET(AND);
227 } else
228 RET('&');
229 case '|':
230 if (peek() == '|') {
231 input(); RET(BOR);
232 } else
233 RET('|');
234 case '!':
235 if (peek() == '=') {
236 input(); yylval.i = NE; RET(NE);
237 } else if (peek() == '~') {
238 input(); yylval.i = NOTMATCH; RET(MATCHOP);
239 } else
240 RET(NOT);
241 case '~':
242 yylval.i = MATCH;
243 RET(MATCHOP);
244 case '<':
245 if (peek() == '=') {
246 input(); yylval.i = LE; RET(LE);
247 } else {
248 yylval.i = LT; RET(LT);
249 }
250 case '=':
251 if (peek() == '=') {
252 input(); yylval.i = EQ; RET(EQ);
253 } else {
254 yylval.i = ASSIGN; RET(ASGNOP);
255 }
256 case '>':
257 if (peek() == '=') {
258 input(); yylval.i = GE; RET(GE);
259 } else if (peek() == '>') {
260 input(); yylval.i = APPEND; RET(APPEND);
261 } else {
262 yylval.i = GT; RET(GT);
263 }
264 case '+':
265 if (peek() == '+') {
266 input(); yylval.i = INCR; RET(INCR);
267 } else if (peek() == '=') {
268 input(); yylval.i = ADDEQ; RET(ASGNOP);
269 } else
270 RET('+');
271 case '-':
272 if (peek() == '-') {
273 input(); yylval.i = DECR; RET(DECR);
274 } else if (peek() == '=') {
275 input(); yylval.i = SUBEQ; RET(ASGNOP);
276 } else
277 RET('-');
278 case '*':
279 if (peek() == '=') { /* *= */
280 input(); yylval.i = MULTEQ; RET(ASGNOP);
281 } else if (peek() == '*') { /* ** or **= */
282 input(); /* eat 2nd * */
283 if (peek() == '=') {
284 input(); yylval.i = POWEQ; RET(ASGNOP);
285 } else {
286 RET(POWER);
287 }
288 } else
289 RET('*');
290 case '/':
291 RET('/');
292 case '%':
293 if (peek() == '=') {
294 input(); yylval.i = MODEQ; RET(ASGNOP);
295 } else
296 RET('%');
297 case '^':
298 if (peek() == '=') {
299 input(); yylval.i = POWEQ; RET(ASGNOP);
300 } else
301 RET(POWER);
302
303 case '$':
304 /* BUG: awkward, if not wrong */
305 c = gettok(&buf, &bufsize);
306 if (isalpha(c)) {
307 if (strcmp(buf, "NF") == 0) { /* very special */
308 unputstr("(NF)");
309 RET(INDIRECT);
310 }
311 c = peek();
312 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
313 unputstr(buf);
314 RET(INDIRECT);
315 }
316 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
317 RET(IVAR);
318 } else if (c == 0) { /* */
319 SYNTAX( "unexpected end of input after $" );
320 RET(';');
321 } else {
322 unputstr(buf);
323 RET(INDIRECT);
324 }
325
326 case '}':
327 if (--bracecnt < 0)
328 SYNTAX( "extra }" );
329 sc = 1;
330 RET(';');
331 case ']':
332 if (--brackcnt < 0)
333 SYNTAX( "extra ]" );
334 RET(']');
335 case ')':
336 if (--parencnt < 0)
337 SYNTAX( "extra )" );
338 RET(')');
339 case '{':
340 bracecnt++;
341 RET('{');
342 case '[':
343 brackcnt++;
344 RET('[');
345 case '(':
346 parencnt++;
347 RET('(');
348
349 case '"':
350 return string(); /* BUG: should be like tran.c ? */
351
352 default:
353 RET(c);
354 }
355 }
356 }
357
string(void)358 int string(void)
359 {
360 int c, n;
361 char *s, *bp;
362 static char *buf = 0;
363 static int bufsz = 500;
364
365 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
366 FATAL("out of space for strings");
367 for (bp = buf; (c = input()) != '"'; ) {
368 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
369 FATAL("out of space for string %.10s...", buf);
370 switch (c) {
371 case '\n':
372 case '\r':
373 case 0:
374 SYNTAX( "non-terminated string %.10s...", buf );
375 lineno++;
376 if (c == 0) /* hopeless */
377 FATAL( "giving up" );
378 break;
379 case '\\':
380 c = input();
381 switch (c) {
382 case '"': *bp++ = '"'; break;
383 case 'n': *bp++ = '\n'; break;
384 case 't': *bp++ = '\t'; break;
385 case 'f': *bp++ = '\f'; break;
386 case 'r': *bp++ = '\r'; break;
387 case 'b': *bp++ = '\b'; break;
388 case 'v': *bp++ = '\v'; break;
389 case 'a': *bp++ = '\007'; break;
390 case '\\': *bp++ = '\\'; break;
391
392 case '0': case '1': case '2': /* octal: \d \dd \ddd */
393 case '3': case '4': case '5': case '6': case '7':
394 n = c - '0';
395 if ((c = peek()) >= '0' && c < '8') {
396 n = 8 * n + input() - '0';
397 if ((c = peek()) >= '0' && c < '8')
398 n = 8 * n + input() - '0';
399 }
400 *bp++ = n;
401 break;
402
403 case 'x': /* hex \x0-9a-fA-F + */
404 { char xbuf[100], *px;
405 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
406 if (isdigit(c)
407 || (c >= 'a' && c <= 'f')
408 || (c >= 'A' && c <= 'F'))
409 *px++ = c;
410 else
411 break;
412 }
413 *px = 0;
414 unput(c);
415 sscanf(xbuf, "%x", (unsigned int *) &n);
416 *bp++ = n;
417 break;
418 }
419
420 default:
421 *bp++ = c;
422 break;
423 }
424 break;
425 default:
426 *bp++ = c;
427 break;
428 }
429 }
430 *bp = 0;
431 s = tostring(buf);
432 *bp++ = ' '; *bp++ = 0;
433 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
434 RET(STRING);
435 }
436
437
/*
 * binsearch: look w up in the first n entries of kp, which must be sorted
 * by strcmp() order of their word fields.
 * Returns the index of the matching entry, or -1 if there is none.
 */
int binsearch(char *w, Keyword *kp, int n)
{
	int lo = 0;
	int hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;
		int order = strcmp(w, kp[probe].word);

		if (order == 0)
			return probe;
		if (order < 0)
			hi = probe - 1;	/* w sorts before the probe */
		else
			lo = probe + 1;	/* w sorts after the probe */
	}
	return -1;
}
455
word(char * w)456 int word(char *w)
457 {
458 Keyword *kp;
459 int c, n;
460
461 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
462 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
463 kp = keywords + n;
464 if (n != -1) { /* found in table */
465 yylval.i = kp->sub;
466 switch (kp->type) { /* special handling */
467 case BLTIN:
468 if (kp->sub == FSYSTEM && safe)
469 SYNTAX( "system is unsafe" );
470 RET(kp->type);
471 case FUNC:
472 if (infunc)
473 SYNTAX( "illegal nested function" );
474 RET(kp->type);
475 case RETURN:
476 if (!infunc)
477 SYNTAX( "return not in function" );
478 RET(kp->type);
479 case VARNF:
480 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
481 RET(VARNF);
482 default:
483 RET(kp->type);
484 }
485 }
486 c = peek(); /* look for '(' */
487 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
488 yylval.i = n;
489 RET(ARG);
490 } else {
491 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
492 if (c == '(') {
493 RET(CALL);
494 } else {
495 RET(VAR);
496 }
497 }
498 }
499
startreg(void)500 void startreg(void) /* next call to yylex will return a regular expression */
501 {
502 reg = 1;
503 }
504
regexpr(void)505 int regexpr(void)
506 {
507 int c;
508 static char *buf = 0;
509 static int bufsz = 500;
510 char *bp;
511
512 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
513 FATAL("out of space for rex expr");
514 bp = buf;
515 for ( ; (c = input()) != '/' && c != 0; ) {
516 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
517 FATAL("out of space for reg expr %.10s...", buf);
518 if (c == '\n') {
519 SYNTAX( "newline in regular expression %.10s...", buf );
520 unput('\n');
521 break;
522 } else if (c == '\\') {
523 *bp++ = '\\';
524 *bp++ = input();
525 } else {
526 *bp++ = c;
527 }
528 }
529 *bp = 0;
530 if (c == 0)
531 SYNTAX("non-terminated regular expression %.10s...", buf);
532 yylval.s = tostring(buf);
533 unput('/');
534 RET(REGEXPR);
535 }
536
537 /* low-level lexical stuff, sort of inherited from lex */
538
char	ebuf[300];		/* circular record of the characters handed out by input() */
char	*ep = ebuf;		/* next slot in ebuf; input() advances it, unput() backs it up */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character */
FILE	*yyin = NULL;
544
input(void)545 int input(void) /* get next lexical input character */
546 {
547 int c;
548 extern char *lexprog;
549
550 if (yysptr > yysbuf)
551 c = (uschar)*--yysptr;
552 else if (lexprog != NULL) { /* awk '...' */
553 if ((c = (uschar)*lexprog) != 0)
554 lexprog++;
555 } else /* awk -f ... */
556 c = pgetc();
557 if (c == '\n')
558 lineno++;
559 else if (c == EOF)
560 c = 0;
561 if (ep >= ebuf + sizeof ebuf)
562 ep = ebuf;
563 return *ep++ = c;
564 }
565
unput(int c)566 void unput(int c) /* put lexical character back on input */
567 {
568 if (c == '\n')
569 lineno--;
570 if (yysptr >= yysbuf + sizeof(yysbuf))
571 FATAL("pushed back too much: %.20s...", yysbuf);
572 *yysptr++ = c;
573 if (--ep < ebuf)
574 ep = ebuf + sizeof(ebuf) - 1;
575 }
576
/*
 * unputstr: put the whole string s back on the input.
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
584