1 /* $OpenBSD: lex.c,v 1.10 2008/06/04 14:04:42 pyr Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "ytab.h" 32 33 extern YYSTYPE yylval; 34 extern int infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 Keyword keywords[] ={ /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "and", FAND, BLTIN }, 52 { "atan2", FATAN, BLTIN }, 53 { "break", BREAK, BREAK }, 54 { "close", CLOSE, CLOSE }, 55 { "compl", FCOMPL, BLTIN }, 56 { "continue", CONTINUE, CONTINUE }, 57 { "cos", FCOS, BLTIN }, 58 { "delete", DELETE, DELETE }, 59 { "do", DO, DO }, 60 { "else", ELSE, ELSE }, 61 { "exit", EXIT, EXIT }, 62 { "exp", FEXP, BLTIN }, 63 { "fflush", FFLUSH, BLTIN }, 64 { "for", FOR, FOR }, 65 { "func", FUNC, FUNC }, 66 { "function", FUNC, FUNC }, 67 { "getline", GETLINE, GETLINE }, 68 { "gsub", GSUB, GSUB }, 69 { "if", IF, IF }, 70 { "in", IN, IN }, 71 { "index", INDEX, INDEX }, 72 { "int", FINT, BLTIN }, 73 { "length", FLENGTH, BLTIN }, 74 { "log", FLOG, BLTIN }, 75 { "lshift", FLSHIFT, BLTIN }, 76 { "match", MATCHFCN, MATCHFCN }, 77 { "next", NEXT, NEXT }, 78 { "nextfile", NEXTFILE, NEXTFILE }, 79 { "or", FFOR, BLTIN }, 80 { "print", PRINT, PRINT }, 81 { "printf", PRINTF, PRINTF }, 82 { "rand", FRAND, BLTIN }, 83 { "return", RETURN, RETURN }, 84 { "rshift", FRSHIFT, BLTIN }, 85 { "sin", FSIN, BLTIN }, 86 { "split", SPLIT, SPLIT }, 87 { "sprintf", SPRINTF, SPRINTF }, 88 { "sqrt", FSQRT, BLTIN }, 89 { "srand", FSRAND, BLTIN }, 90 { "sub", SUB, SUB }, 91 { "substr", SUBSTR, SUBSTR }, 92 { "system", FSYSTEM, BLTIN }, 93 { "tolower", FTOLOWER, BLTIN }, 94 { "toupper", FTOUPPER, BLTIN }, 95 { "while", WHILE, WHILE }, 96 { "xor", FXOR, BLTIN }, 97 }; 98 99 #define DEBUG 100 #ifdef DEBUG 101 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 102 #else 103 #define RET(x) return(x) 104 #endif 105 106 int peek(void); 107 int gettok(char **, int *); 108 int binsearch(char *, Keyword *, int); 109 110 int peek(void) 111 { 112 int c = input(); 113 unput(c); 114 return c; 115 } 116 117 int gettok(char **pbuf, int *psz) /* get next input token */ 118 { 119 int c, retc; 120 char *buf = *pbuf; 121 int sz = *psz; 122 char *bp = buf; 123 124 c = input(); 125 if (c == 0) 126 return 0; 127 buf[0] = c; 128 buf[1] = 0; 129 if (!isalnum(c) && c != '.' && c != '_') 130 return c; 131 132 *bp++ = c; 133 if (isalpha(c) || c == '_') { /* it's a varname */ 134 for ( ; (c = input()) != 0; ) { 135 if (bp-buf >= sz) 136 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 137 FATAL( "out of space for name %.10s...", buf ); 138 if (isalnum(c) || c == '_') 139 *bp++ = c; 140 else { 141 *bp = 0; 142 unput(c); 143 break; 144 } 145 } 146 *bp = 0; 147 retc = 'a'; /* alphanumeric */ 148 } else { /* it's a number */ 149 char *rem; 150 /* read input until can't be a number */ 151 for ( ; (c = input()) != 0; ) { 152 if (bp-buf >= sz) 153 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 154 FATAL( "out of space for number %.10s...", buf ); 155 if (isdigit(c) || c == 'e' || c == 'E' 156 || c == '.' || c == '+' || c == '-') 157 *bp++ = c; 158 else { 159 unput(c); 160 break; 161 } 162 } 163 *bp = 0; 164 strtod(buf, &rem); /* parse the number */ 165 unputstr(rem); /* put rest back for later */ 166 /* printf("unputstr [%s], buf [%s]\n", rem, buf); */ 167 if (rem == buf) { /* it wasn't a valid number at all */ 168 buf[1] = 0; /* so return one character as token */ 169 retc = buf[0]; /* character is its own type */ 170 } else { /* some prefix was a number */ 171 rem[0] = 0; /* so truncate where failure started */ 172 retc = '0'; /* number */ 173 } 174 } 175 *pbuf = buf; 176 *psz = sz; 177 return retc; 178 } 179 180 int word(char *); 181 int string(void); 182 int regexpr(void); 183 int sc = 0; /* 1 => return a } right now */ 184 int reg = 0; /* 1 => return a REGEXPR now */ 185 186 int yylex(void) 187 { 188 int c; 189 static char *buf = 0; 190 static int bufsize = 500; 191 192 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 193 FATAL( "out of space in yylex" ); 194 if (sc) { 195 sc = 0; 196 RET('}'); 197 } 198 if (reg) { 199 reg = 0; 200 return regexpr(); 201 } 202 /* printf("top\n"); */ 203 for (;;) { 204 c = gettok(&buf, &bufsize); 205 /* printf("gettok [%s]\n", buf); */ 206 if (c == 0) 207 return 0; 208 if (isalpha(c) || c == '_') 209 return word(buf); 210 if (isdigit(c)) { 211 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 212 /* should this also have STR set? */ 213 RET(NUMBER); 214 } 215 216 yylval.i = c; 217 switch (c) { 218 case '\n': /* {EOL} */ 219 RET(NL); 220 case '\r': /* assume \n is coming */ 221 case ' ': /* {WS}+ */ 222 case '\t': 223 break; 224 case '#': /* #.* strip comments */ 225 while ((c = input()) != '\n' && c != 0) 226 ; 227 unput(c); 228 break; 229 case ';': 230 RET(';'); 231 case '\\': 232 if (peek() == '\n') { 233 input(); 234 } else if (peek() == '\r') { 235 input(); input(); /* \n */ 236 lineno++; 237 } else { 238 RET(c); 239 } 240 break; 241 case '&': 242 if (peek() == '&') { 243 input(); RET(AND); 244 } else 245 RET('&'); 246 case '|': 247 if (peek() == '|') { 248 input(); RET(BOR); 249 } else 250 RET('|'); 251 case '!': 252 if (peek() == '=') { 253 input(); yylval.i = NE; RET(NE); 254 } else if (peek() == '~') { 255 input(); yylval.i = NOTMATCH; RET(MATCHOP); 256 } else 257 RET(NOT); 258 case '~': 259 yylval.i = MATCH; 260 RET(MATCHOP); 261 case '<': 262 if (peek() == '=') { 263 input(); yylval.i = LE; RET(LE); 264 } else { 265 yylval.i = LT; RET(LT); 266 } 267 case '=': 268 if (peek() == '=') { 269 input(); yylval.i = EQ; RET(EQ); 270 } else { 271 yylval.i = ASSIGN; RET(ASGNOP); 272 } 273 case '>': 274 if (peek() == '=') { 275 input(); yylval.i = GE; RET(GE); 276 } else if (peek() == '>') { 277 input(); yylval.i = APPEND; RET(APPEND); 278 } else { 279 yylval.i = GT; RET(GT); 280 } 281 case '+': 282 if (peek() == '+') { 283 input(); yylval.i = INCR; RET(INCR); 284 } else if (peek() == '=') { 285 input(); yylval.i = ADDEQ; RET(ASGNOP); 286 } else 287 RET('+'); 288 case '-': 289 if (peek() == '-') { 290 input(); yylval.i = DECR; RET(DECR); 291 } else if (peek() == '=') { 292 input(); yylval.i = SUBEQ; RET(ASGNOP); 293 } else 294 RET('-'); 295 case '*': 296 if (peek() == '=') { /* *= */ 297 input(); yylval.i = MULTEQ; RET(ASGNOP); 298 } else if (peek() == '*') { /* ** or **= */ 299 input(); /* eat 2nd * */ 300 if (peek() == '=') { 301 input(); yylval.i = POWEQ; RET(ASGNOP); 302 } else { 303 RET(POWER); 304 } 305 } else 306 RET('*'); 307 case '/': 308 RET('/'); 309 case '%': 310 if (peek() == '=') { 311 input(); yylval.i = MODEQ; RET(ASGNOP); 312 } else 313 RET('%'); 314 case '^': 315 if (peek() == '=') { 316 input(); yylval.i = POWEQ; RET(ASGNOP); 317 } else 318 RET(POWER); 319 320 case '$': 321 /* BUG: awkward, if not wrong */ 322 c = gettok(&buf, &bufsize); 323 if (isalpha(c)) { 324 if (strcmp(buf, "NF") == 0) { /* very special */ 325 unputstr("(NF)"); 326 RET(INDIRECT); 327 } 328 c = peek(); 329 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 330 unputstr(buf); 331 RET(INDIRECT); 332 } 333 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 334 RET(IVAR); 335 } else if (c == 0) { /* */ 336 SYNTAX( "unexpected end of input after $" ); 337 RET(';'); 338 } else { 339 unputstr(buf); 340 RET(INDIRECT); 341 } 342 343 case '}': 344 if (--bracecnt < 0) 345 SYNTAX( "extra }" ); 346 sc = 1; 347 RET(';'); 348 case ']': 349 if (--brackcnt < 0) 350 SYNTAX( "extra ]" ); 351 RET(']'); 352 case ')': 353 if (--parencnt < 0) 354 SYNTAX( "extra )" ); 355 RET(')'); 356 case '{': 357 bracecnt++; 358 RET('{'); 359 case '[': 360 brackcnt++; 361 RET('['); 362 case '(': 363 parencnt++; 364 RET('('); 365 366 case '"': 367 return string(); /* BUG: should be like tran.c ? */ 368 369 default: 370 RET(c); 371 } 372 } 373 } 374 375 int string(void) 376 { 377 int c, n; 378 char *s, *bp; 379 static char *buf = 0; 380 static int bufsz = 500; 381 382 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 383 FATAL("out of space for strings"); 384 for (bp = buf; (c = input()) != '"'; ) { 385 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 386 FATAL("out of space for string %.10s...", buf); 387 switch (c) { 388 case '\n': 389 case '\r': 390 case 0: 391 SYNTAX( "non-terminated string %.10s...", buf ); 392 lineno++; 393 if (c == 0) /* hopeless */ 394 FATAL( "giving up" ); 395 break; 396 case '\\': 397 c = input(); 398 switch (c) { 399 case '"': *bp++ = '"'; break; 400 case 'n': *bp++ = '\n'; break; 401 case 't': *bp++ = '\t'; break; 402 case 'f': *bp++ = '\f'; break; 403 case 'r': *bp++ = '\r'; break; 404 case 'b': *bp++ = '\b'; break; 405 case 'v': *bp++ = '\v'; break; 406 case 'a': *bp++ = '\007'; break; 407 case '\\': *bp++ = '\\'; break; 408 409 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 410 case '3': case '4': case '5': case '6': case '7': 411 n = c - '0'; 412 if ((c = peek()) >= '0' && c < '8') { 413 n = 8 * n + input() - '0'; 414 if ((c = peek()) >= '0' && c < '8') 415 n = 8 * n + input() - '0'; 416 } 417 *bp++ = n; 418 break; 419 420 case 'x': /* hex \x0-9a-fA-F + */ 421 { char xbuf[100], *px; 422 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 423 if (isdigit(c) 424 || (c >= 'a' && c <= 'f') 425 || (c >= 'A' && c <= 'F')) 426 *px++ = c; 427 else 428 break; 429 } 430 *px = 0; 431 unput(c); 432 sscanf(xbuf, "%x", &n); 433 *bp++ = n; 434 break; 435 } 436 437 default: 438 *bp++ = c; 439 break; 440 } 441 break; 442 default: 443 *bp++ = c; 444 break; 445 } 446 } 447 *bp = 0; 448 s = tostring(buf); 449 *bp++ = ' '; *bp++ = 0; 450 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 451 RET(STRING); 452 } 453 454 455 int binsearch(char *w, Keyword *kp, int n) 456 { 457 int cond, low, mid, high; 458 459 low = 0; 460 high = n - 1; 461 while (low <= high) { 462 mid = (low + high) / 2; 463 if ((cond = strcmp(w, kp[mid].word)) < 0) 464 high = mid - 1; 465 else if (cond > 0) 466 low = mid + 1; 467 else 468 return mid; 469 } 470 return -1; 471 } 472 473 int word(char *w) 474 { 475 Keyword *kp; 476 int c, n; 477 478 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 479 kp = keywords + n; 480 if (n != -1) { /* found in table */ 481 yylval.i = kp->sub; 482 switch (kp->type) { /* special handling */ 483 case FSYSTEM: 484 if (safe) 485 SYNTAX( "system is unsafe" ); 486 RET(kp->type); 487 case FUNC: 488 if (infunc) 489 SYNTAX( "illegal nested function" ); 490 RET(kp->type); 491 case RETURN: 492 if (!infunc) 493 SYNTAX( "return not in function" ); 494 RET(kp->type); 495 case VARNF: 496 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 497 RET(VARNF); 498 default: 499 RET(kp->type); 500 } 501 } 502 c = peek(); /* look for '(' */ 503 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 504 yylval.i = n; 505 RET(ARG); 506 } else { 507 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 508 if (c == '(') { 509 RET(CALL); 510 } else { 511 RET(VAR); 512 } 513 } 514 } 515 516 void startreg(void) /* next call to yylex will return a regular expression */ 517 { 518 reg = 1; 519 } 520 521 int regexpr(void) 522 { 523 int c, openclass = 0; 524 static char *buf = 0; 525 static int bufsz = 500; 526 char *bp; 527 528 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 529 FATAL("out of space for rex expr"); 530 bp = buf; 531 for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) { 532 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 533 FATAL("out of space for reg expr %.10s...", buf); 534 if (c == '\n') { 535 SYNTAX( "newline in regular expression %.10s...", buf ); 536 unput('\n'); 537 break; 538 } else if (c == '\\') { 539 *bp++ = '\\'; 540 *bp++ = input(); 541 } else { 542 if (c == '[') 543 openclass = 1; 544 else if (c == ']') 545 openclass = 0; 546 *bp++ = c; 547 } 548 } 549 *bp = 0; 550 if (c == 0) 551 SYNTAX("non-terminated regular expression %.10s...", buf); 552 yylval.s = tostring(buf); 553 unput('/'); 554 RET(REGEXPR); 555 } 556 557 /* low-level lexical stuff, sort of inherited from lex */ 558 559 char ebuf[300]; 560 char *ep = ebuf; 561 char yysbuf[100]; /* pushback buffer */ 562 char *yysptr = yysbuf; 563 FILE *yyin = 0; 564 565 int input(void) /* get next lexical input character */ 566 { 567 int c; 568 extern char *lexprog; 569 570 if (yysptr > yysbuf) 571 c = (uschar)*--yysptr; 572 else if (lexprog != NULL) { /* awk '...' */ 573 if ((c = (uschar)*lexprog) != 0) 574 lexprog++; 575 } else /* awk -f ... */ 576 c = pgetc(); 577 if (c == '\n') 578 lineno++; 579 else if (c == EOF) 580 c = 0; 581 if (ep >= ebuf + sizeof ebuf) 582 ep = ebuf; 583 return *ep++ = c; 584 } 585 586 void unput(int c) /* put lexical character back on input */ 587 { 588 if (c == '\n') 589 lineno--; 590 if (yysptr >= yysbuf + sizeof(yysbuf)) 591 FATAL("pushed back too much: %.20s...", yysbuf); 592 *yysptr++ = c; 593 if (--ep < ebuf) 594 ep = ebuf + sizeof(ebuf) - 1; 595 } 596 597 void unputstr(const char *s) /* put a string back on input */ 598 { 599 int i; 600 601 for (i = strlen(s)-1; i >= 0; i--) 602 unput(s[i]); 603 } 604