1 /* $OpenBSD: lex.c,v 1.26 2020/08/28 16:29:16 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "awkgram.tab.h" 32 33 extern YYSTYPE yylval; 34 extern bool infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 const Keyword keywords[] = { /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "and", FAND, BLTIN }, 52 { "atan2", FATAN, BLTIN }, 53 { "break", BREAK, BREAK }, 54 { "close", CLOSE, CLOSE }, 55 { "compl", FCOMPL, BLTIN }, 56 { "continue", CONTINUE, CONTINUE }, 57 { "cos", FCOS, BLTIN }, 58 { "delete", DELETE, DELETE }, 59 { "do", DO, DO }, 60 { "else", ELSE, ELSE }, 61 { "exit", EXIT, EXIT }, 62 { "exp", FEXP, BLTIN }, 63 { "fflush", FFLUSH, BLTIN }, 64 { "for", FOR, FOR }, 65 { "func", FUNC, FUNC }, 66 { "function", FUNC, FUNC }, 67 { "gensub", GENSUB, GENSUB }, 68 { "getline", GETLINE, GETLINE }, 69 { "gsub", GSUB, GSUB }, 70 { "if", IF, IF }, 71 { "in", IN, IN }, 72 { "index", INDEX, INDEX }, 73 { "int", FINT, BLTIN }, 74 { "length", FLENGTH, BLTIN }, 75 { "log", FLOG, BLTIN }, 76 { "lshift", FLSHIFT, BLTIN }, 77 { "match", MATCHFCN, MATCHFCN }, 78 { "mktime", FMKTIME, BLTIN }, 79 { "next", NEXT, NEXT }, 80 { "nextfile", NEXTFILE, NEXTFILE }, 81 { "or", FFOR, BLTIN }, 82 { "print", PRINT, PRINT }, 83 { "printf", PRINTF, PRINTF }, 84 { "rand", FRAND, BLTIN }, 85 { "return", RETURN, RETURN }, 86 { "rshift", FRSHIFT, BLTIN }, 87 { "sin", FSIN, BLTIN }, 88 { "split", SPLIT, SPLIT }, 89 { "sprintf", SPRINTF, SPRINTF }, 90 { "sqrt", FSQRT, BLTIN }, 91 { "srand", FSRAND, BLTIN }, 92 { "strftime", FSTRFTIME, BLTIN }, 93 { "sub", SUB, SUB }, 94 { "substr", SUBSTR, SUBSTR }, 95 { "system", FSYSTEM, BLTIN }, 96 { "systime", FSYSTIME, BLTIN }, 97 { "tolower", FTOLOWER, BLTIN }, 98 { "toupper", FTOUPPER, BLTIN }, 99 { "while", WHILE, WHILE }, 100 { "xor", FXOR, BLTIN }, 101 }; 102 103 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 104 105 static int peek(void) 106 { 107 int c = input(); 108 unput(c); 109 return c; 110 } 111 112 static int gettok(char **pbuf, int *psz) /* get next input token */ 113 { 114 int c, retc; 115 char *buf = *pbuf; 116 int sz = *psz; 117 char *bp = buf; 118 119 c = input(); 120 if (c == 0) 121 return 0; 122 buf[0] = c; 123 buf[1] = 0; 124 if (!isalnum(c) && c != '.' && c != '_') 125 return c; 126 127 *bp++ = c; 128 if (isalpha(c) || c == '_') { /* it's a varname */ 129 for ( ; (c = input()) != 0; ) { 130 if (bp-buf >= sz) 131 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 132 FATAL( "out of space for name %.10s...", buf ); 133 if (isalnum(c) || c == '_') 134 *bp++ = c; 135 else { 136 *bp = 0; 137 unput(c); 138 break; 139 } 140 } 141 *bp = 0; 142 retc = 'a'; /* alphanumeric */ 143 } else { /* maybe it's a number, but could be . */ 144 char *rem; 145 /* read input until can't be a number */ 146 for ( ; (c = input()) != 0; ) { 147 if (bp-buf >= sz) 148 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 149 FATAL( "out of space for number %.10s...", buf ); 150 if (isdigit(c) || c == 'e' || c == 'E' 151 || c == '.' || c == '+' || c == '-') 152 *bp++ = c; 153 else { 154 unput(c); 155 break; 156 } 157 } 158 *bp = 0; 159 strtod(buf, &rem); /* parse the number */ 160 if (rem == buf) { /* it wasn't a valid number at all */ 161 buf[1] = 0; /* return one character as token */ 162 retc = (uschar)buf[0]; /* character is its own type */ 163 unputstr(rem+1); /* put rest back for later */ 164 } else { /* some prefix was a number */ 165 unputstr(rem); /* put rest back for later */ 166 rem[0] = 0; /* truncate buf after number part */ 167 retc = '0'; /* type is number */ 168 } 169 } 170 *pbuf = buf; 171 *psz = sz; 172 return retc; 173 } 174 175 int word(char *); 176 int string(void); 177 int regexpr(void); 178 bool sc = false; /* true => return a } right now */ 179 bool reg = false; /* true => return a REGEXPR now */ 180 181 int yylex(void) 182 { 183 int c; 184 static char *buf = NULL; 185 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 186 187 if (buf == NULL && (buf = malloc(bufsize)) == NULL) 188 FATAL( "out of space in yylex" ); 189 if (sc) { 190 sc = false; 191 RET('}'); 192 } 193 if (reg) { 194 reg = false; 195 return regexpr(); 196 } 197 for (;;) { 198 c = gettok(&buf, &bufsize); 199 if (c == 0) 200 return 0; 201 if (isalpha(c) || c == '_') 202 return word(buf); 203 if (isdigit(c)) { 204 char *cp = tostring(buf); 205 yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab); 206 free(cp); 207 /* should this also have STR set? */ 208 RET(NUMBER); 209 } 210 211 yylval.i = c; 212 switch (c) { 213 case '\n': /* {EOL} */ 214 lineno++; 215 RET(NL); 216 case '\r': /* assume \n is coming */ 217 case ' ': /* {WS}+ */ 218 case '\t': 219 break; 220 case '#': /* #.* strip comments */ 221 while ((c = input()) != '\n' && c != 0) 222 ; 223 unput(c); 224 /* 225 * Next line is a hack, itcompensates for 226 * unput's treatment of \n. 227 */ 228 lineno++; 229 break; 230 case ';': 231 RET(';'); 232 case '\\': 233 if (peek() == '\n') { 234 input(); 235 lineno++; 236 } else if (peek() == '\r') { 237 input(); input(); /* \n */ 238 lineno++; 239 } else { 240 RET(c); 241 } 242 break; 243 case '&': 244 if (peek() == '&') { 245 input(); RET(AND); 246 } else 247 RET('&'); 248 case '|': 249 if (peek() == '|') { 250 input(); RET(BOR); 251 } else 252 RET('|'); 253 case '!': 254 if (peek() == '=') { 255 input(); yylval.i = NE; RET(NE); 256 } else if (peek() == '~') { 257 input(); yylval.i = NOTMATCH; RET(MATCHOP); 258 } else 259 RET(NOT); 260 case '~': 261 yylval.i = MATCH; 262 RET(MATCHOP); 263 case '<': 264 if (peek() == '=') { 265 input(); yylval.i = LE; RET(LE); 266 } else { 267 yylval.i = LT; RET(LT); 268 } 269 case '=': 270 if (peek() == '=') { 271 input(); yylval.i = EQ; RET(EQ); 272 } else { 273 yylval.i = ASSIGN; RET(ASGNOP); 274 } 275 case '>': 276 if (peek() == '=') { 277 input(); yylval.i = GE; RET(GE); 278 } else if (peek() == '>') { 279 input(); yylval.i = APPEND; RET(APPEND); 280 } else { 281 yylval.i = GT; RET(GT); 282 } 283 case '+': 284 if (peek() == '+') { 285 input(); yylval.i = INCR; RET(INCR); 286 } else if (peek() == '=') { 287 input(); yylval.i = ADDEQ; RET(ASGNOP); 288 } else 289 RET('+'); 290 case '-': 291 if (peek() == '-') { 292 input(); yylval.i = DECR; RET(DECR); 293 } else if (peek() == '=') { 294 input(); yylval.i = SUBEQ; RET(ASGNOP); 295 } else 296 RET('-'); 297 case '*': 298 if (peek() == '=') { /* *= */ 299 input(); yylval.i = MULTEQ; RET(ASGNOP); 300 } else if (peek() == '*') { /* ** or **= */ 301 input(); /* eat 2nd * */ 302 if (peek() == '=') { 303 input(); yylval.i = POWEQ; RET(ASGNOP); 304 } else { 305 RET(POWER); 306 } 307 } else 308 RET('*'); 309 case '/': 310 RET('/'); 311 case '%': 312 if (peek() == '=') { 313 input(); yylval.i = MODEQ; RET(ASGNOP); 314 } else 315 RET('%'); 316 case '^': 317 if (peek() == '=') { 318 input(); yylval.i = POWEQ; RET(ASGNOP); 319 } else 320 RET(POWER); 321 322 case '$': 323 /* BUG: awkward, if not wrong */ 324 c = gettok(&buf, &bufsize); 325 if (isalpha(c)) { 326 if (strcmp(buf, "NF") == 0) { /* very special */ 327 unputstr("(NF)"); 328 RET(INDIRECT); 329 } 330 c = peek(); 331 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 332 unputstr(buf); 333 RET(INDIRECT); 334 } 335 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 336 RET(IVAR); 337 } else if (c == 0) { /* */ 338 SYNTAX( "unexpected end of input after $" ); 339 RET(';'); 340 } else { 341 unputstr(buf); 342 RET(INDIRECT); 343 } 344 345 case '}': 346 if (--bracecnt < 0) 347 SYNTAX( "extra }" ); 348 sc = true; 349 RET(';'); 350 case ']': 351 if (--brackcnt < 0) 352 SYNTAX( "extra ]" ); 353 RET(']'); 354 case ')': 355 if (--parencnt < 0) 356 SYNTAX( "extra )" ); 357 RET(')'); 358 case '{': 359 bracecnt++; 360 RET('{'); 361 case '[': 362 brackcnt++; 363 RET('['); 364 case '(': 365 parencnt++; 366 RET('('); 367 368 case '"': 369 return string(); /* BUG: should be like tran.c ? */ 370 371 default: 372 RET(c); 373 } 374 } 375 } 376 377 int string(void) 378 { 379 int c, n; 380 char *s, *bp; 381 static char *buf = NULL; 382 static int bufsz = 500; 383 384 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 385 FATAL("out of space for strings"); 386 for (bp = buf; (c = input()) != '"'; ) { 387 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 388 FATAL("out of space for string %.10s...", buf); 389 switch (c) { 390 case '\n': 391 case '\r': 392 case 0: 393 *bp = '\0'; 394 SYNTAX( "non-terminated string %.10s...", buf ); 395 if (c == 0) /* hopeless */ 396 FATAL( "giving up" ); 397 lineno++; 398 break; 399 case '\\': 400 c = input(); 401 switch (c) { 402 case '\n': break; 403 case '"': *bp++ = '"'; break; 404 case 'n': *bp++ = '\n'; break; 405 case 't': *bp++ = '\t'; break; 406 case 'f': *bp++ = '\f'; break; 407 case 'r': *bp++ = '\r'; break; 408 case 'b': *bp++ = '\b'; break; 409 case 'v': *bp++ = '\v'; break; 410 case 'a': *bp++ = '\a'; break; 411 case '\\': *bp++ = '\\'; break; 412 413 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 414 case '3': case '4': case '5': case '6': case '7': 415 n = c - '0'; 416 if ((c = peek()) >= '0' && c < '8') { 417 n = 8 * n + input() - '0'; 418 if ((c = peek()) >= '0' && c < '8') 419 n = 8 * n + input() - '0'; 420 } 421 *bp++ = n; 422 break; 423 424 case 'x': /* hex \x0-9a-fA-F + */ 425 { char xbuf[100], *px; 426 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 427 if (isdigit(c) 428 || (c >= 'a' && c <= 'f') 429 || (c >= 'A' && c <= 'F')) 430 *px++ = c; 431 else 432 break; 433 } 434 *px = 0; 435 unput(c); 436 sscanf(xbuf, "%x", (unsigned int *) &n); 437 *bp++ = n; 438 break; 439 } 440 441 default: 442 *bp++ = c; 443 break; 444 } 445 break; 446 default: 447 *bp++ = c; 448 break; 449 } 450 } 451 *bp = 0; 452 s = tostring(buf); 453 *bp++ = ' '; *bp++ = '\0'; 454 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 455 free(s); 456 RET(STRING); 457 } 458 459 460 static int binsearch(char *w, const Keyword *kp, int n) 461 { 462 int cond, low, mid, high; 463 464 low = 0; 465 high = n - 1; 466 while (low <= high) { 467 mid = (low + high) / 2; 468 if ((cond = strcmp(w, kp[mid].word)) < 0) 469 high = mid - 1; 470 else if (cond > 0) 471 low = mid + 1; 472 else 473 return mid; 474 } 475 return -1; 476 } 477 478 int word(char *w) 479 { 480 const Keyword *kp; 481 int c, n; 482 483 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 484 if (n != -1) { /* found in table */ 485 kp = keywords + n; 486 yylval.i = kp->sub; 487 switch (kp->type) { /* special handling */ 488 case BLTIN: 489 if (kp->sub == FSYSTEM && safe) 490 SYNTAX( "system is unsafe" ); 491 RET(kp->type); 492 case FUNC: 493 if (infunc) 494 SYNTAX( "illegal nested function" ); 495 RET(kp->type); 496 case RETURN: 497 if (!infunc) 498 SYNTAX( "return not in function" ); 499 RET(kp->type); 500 case VARNF: 501 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 502 RET(VARNF); 503 default: 504 RET(kp->type); 505 } 506 } 507 c = peek(); /* look for '(' */ 508 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 509 yylval.i = n; 510 RET(ARG); 511 } else { 512 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 513 if (c == '(') { 514 RET(CALL); 515 } else { 516 RET(VAR); 517 } 518 } 519 } 520 521 void startreg(void) /* next call to yylex will return a regular expression */ 522 { 523 reg = true; 524 } 525 526 int regexpr(void) 527 { 528 int c, openclass = 0; 529 static char *buf = NULL; 530 static int bufsz = 500; 531 char *bp, *cstart; 532 533 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 534 FATAL("out of space for rex expr"); 535 bp = buf; 536 for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) { 537 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 538 FATAL("out of space for reg expr %.10s...", buf); 539 if (c == '\n') { 540 *bp = '\0'; 541 SYNTAX( "newline in regular expression %.10s...", buf ); 542 unput('\n'); 543 break; 544 } else if (c == '\\') { 545 *bp++ = '\\'; 546 *bp++ = input(); 547 } else { 548 /* 549 * POSIX requires a slash in a regexp to be escaped, 550 * other awks don't require it to be escaped inside 551 * a character class. 552 */ 553 if (!do_posix) { 554 if (c == '[') { 555 int nextc = peek(); 556 if (openclass == 0 || nextc == ':' || 557 nextc == '.' || nextc == '=') { 558 if (++openclass == 1) 559 cstart = bp; 560 } 561 } else if (c == ']' && openclass > 0) { 562 /* 563 * A ']' as the first char in a 564 * class is treated literally. 565 */ 566 if (cstart != bp - 1 && 567 (cstart != bp - 2 || bp[-1] != '^')) 568 openclass--; 569 } 570 } 571 *bp++ = c; 572 } 573 } 574 *bp = 0; 575 if (c == 0) 576 SYNTAX("non-terminated regular expression %.10s...", buf); 577 yylval.s = tostring(buf); 578 unput('/'); 579 RET(REGEXPR); 580 } 581 582 /* low-level lexical stuff, sort of inherited from lex */ 583 584 char ebuf[300]; 585 char *ep = ebuf; 586 char yysbuf[100]; /* pushback buffer */ 587 char *yysptr = yysbuf; 588 FILE *yyin = NULL; 589 590 int input(void) /* get next lexical input character */ 591 { 592 int c; 593 extern char *lexprog; 594 595 if (yysptr > yysbuf) 596 c = (uschar)*--yysptr; 597 else if (lexprog != NULL) { /* awk '...' */ 598 if ((c = (uschar)*lexprog) != 0) 599 lexprog++; 600 } else /* awk -f ... */ 601 c = pgetc(); 602 if (c == EOF) 603 c = 0; 604 if (ep >= ebuf + sizeof ebuf) 605 ep = ebuf; 606 *ep = c; 607 if (c != 0) { 608 ep++; 609 } 610 return (c); 611 } 612 613 void unput(int c) /* put lexical character back on input */ 614 { 615 if (c == '\n') 616 lineno--; 617 if (yysptr >= yysbuf + sizeof(yysbuf)) 618 FATAL("pushed back too much: %.20s...", yysbuf); 619 *yysptr++ = c; 620 if (--ep < ebuf) 621 ep = ebuf + sizeof(ebuf) - 1; 622 } 623 624 void unputstr(const char *s) /* put a string back on input */ 625 { 626 int i; 627 628 for (i = strlen(s)-1; i >= 0; i--) 629 unput(s[i]); 630 } 631