1 /* $OpenBSD: lex.c,v 1.31 2023/09/17 14:49:44 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "awkgram.tab.h" 32 33 extern YYSTYPE yylval; 34 extern bool infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 const Keyword keywords[] = { /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "and", FAND, BLTIN }, 52 { "atan2", FATAN, BLTIN }, 53 { "break", BREAK, BREAK }, 54 { "close", CLOSE, CLOSE }, 55 { "compl", FCOMPL, BLTIN }, 56 { "continue", CONTINUE, CONTINUE }, 57 { "cos", FCOS, BLTIN }, 58 { "delete", DELETE, DELETE }, 59 { "do", DO, DO }, 60 { "else", ELSE, ELSE }, 61 { "exit", EXIT, EXIT }, 62 { "exp", FEXP, BLTIN }, 63 { "fflush", FFLUSH, BLTIN }, 64 { "for", FOR, FOR }, 65 { "func", FUNC, FUNC }, 66 { "function", FUNC, FUNC }, 67 { "gensub", GENSUB, GENSUB }, 68 { "getline", GETLINE, GETLINE }, 69 { "gsub", GSUB, GSUB }, 70 { "if", IF, IF }, 71 { "in", IN, IN }, 72 { "index", INDEX, INDEX }, 73 { "int", FINT, BLTIN }, 74 { "length", FLENGTH, BLTIN }, 75 { "log", FLOG, BLTIN }, 76 { "lshift", FLSHIFT, BLTIN }, 77 { "match", MATCHFCN, MATCHFCN }, 78 { "mktime", FMKTIME, BLTIN }, 79 { "next", NEXT, NEXT }, 80 { "nextfile", NEXTFILE, NEXTFILE }, 81 { "or", FFOR, BLTIN }, 82 { "print", PRINT, PRINT }, 83 { "printf", PRINTF, PRINTF }, 84 { "rand", FRAND, BLTIN }, 85 { "return", RETURN, RETURN }, 86 { "rshift", FRSHIFT, BLTIN }, 87 { "sin", FSIN, BLTIN }, 88 { "split", SPLIT, SPLIT }, 89 { "sprintf", SPRINTF, SPRINTF }, 90 { "sqrt", FSQRT, BLTIN }, 91 { "srand", FSRAND, BLTIN }, 92 { "strftime", FSTRFTIME, BLTIN }, 93 { "sub", SUB, SUB }, 94 { "substr", SUBSTR, SUBSTR }, 95 { "system", FSYSTEM, BLTIN }, 96 { "systime", FSYSTIME, BLTIN }, 97 { "tolower", FTOLOWER, BLTIN }, 98 { "toupper", FTOUPPER, BLTIN }, 99 { "while", WHILE, WHILE }, 100 { "xor", FXOR, BLTIN }, 101 }; 102 103 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 104 105 static int peek(void) 106 { 107 int c = input(); 108 unput(c); 109 return c; 110 } 111 112 static int gettok(char **pbuf, int *psz) /* get next input token */ 113 { 114 int c, retc; 115 char *buf = *pbuf; 116 int sz = *psz; 117 char *bp = buf; 118 119 c = input(); 120 if (c == 0) 121 return 0; 122 buf[0] = c; 123 buf[1] = 0; 124 if (!isalnum(c) && c != '.' && c != '_') 125 return c; 126 127 *bp++ = c; 128 if (isalpha(c) || c == '_') { /* it's a varname */ 129 for ( ; (c = input()) != 0; ) { 130 if (bp-buf >= sz) 131 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 132 FATAL( "out of space for name %.10s...", buf ); 133 if (isalnum(c) || c == '_') 134 *bp++ = c; 135 else { 136 *bp = 0; 137 unput(c); 138 break; 139 } 140 } 141 *bp = 0; 142 retc = 'a'; /* alphanumeric */ 143 } else { /* maybe it's a number, but could be . */ 144 char *rem; 145 /* read input until can't be a number */ 146 for ( ; (c = input()) != 0; ) { 147 if (bp-buf >= sz) 148 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 149 FATAL( "out of space for number %.10s...", buf ); 150 if (isdigit(c) || c == 'e' || c == 'E' 151 || c == '.' || c == '+' || c == '-') 152 *bp++ = c; 153 else { 154 unput(c); 155 break; 156 } 157 } 158 *bp = 0; 159 strtod(buf, &rem); /* parse the number */ 160 if (rem == buf) { /* it wasn't a valid number at all */ 161 buf[1] = 0; /* return one character as token */ 162 retc = (uschar)buf[0]; /* character is its own type */ 163 unputstr(rem+1); /* put rest back for later */ 164 } else { /* some prefix was a number */ 165 unputstr(rem); /* put rest back for later */ 166 rem[0] = 0; /* truncate buf after number part */ 167 retc = '0'; /* type is number */ 168 } 169 } 170 *pbuf = buf; 171 *psz = sz; 172 return retc; 173 } 174 175 int word(char *); 176 int string(void); 177 int regexpr(void); 178 bool sc = false; /* true => return a } right now */ 179 bool reg = false; /* true => return a REGEXPR now */ 180 181 int yylex(void) 182 { 183 int c; 184 static char *buf = NULL; 185 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 186 187 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 188 FATAL( "out of space in yylex" ); 189 if (sc) { 190 sc = false; 191 RET('}'); 192 } 193 if (reg) { 194 reg = false; 195 return regexpr(); 196 } 197 for (;;) { 198 c = gettok(&buf, &bufsize); 199 if (c == 0) 200 return 0; 201 if (isalpha(c) || c == '_') 202 return word(buf); 203 if (isdigit(c)) { 204 char *cp = tostring(buf); 205 double result; 206 207 if (is_number(cp, & result)) 208 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 209 else 210 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 211 free(cp); 212 /* should this also have STR set? */ 213 RET(NUMBER); 214 } 215 216 yylval.i = c; 217 switch (c) { 218 case '\n': /* {EOL} */ 219 lineno++; 220 RET(NL); 221 case '\r': /* assume \n is coming */ 222 case ' ': /* {WS}+ */ 223 case '\t': 224 break; 225 case '#': /* #.* strip comments */ 226 while ((c = input()) != '\n' && c != 0) 227 ; 228 unput(c); 229 /* 230 * Next line is a hack, itcompensates for 231 * unput's treatment of \n. 232 */ 233 lineno++; 234 break; 235 case ';': 236 RET(';'); 237 case '\\': 238 if (peek() == '\n') { 239 input(); 240 lineno++; 241 } else if (peek() == '\r') { 242 input(); input(); /* \n */ 243 lineno++; 244 } else { 245 RET(c); 246 } 247 break; 248 case '&': 249 if (peek() == '&') { 250 input(); RET(AND); 251 } else 252 RET('&'); 253 case '|': 254 if (peek() == '|') { 255 input(); RET(BOR); 256 } else 257 RET('|'); 258 case '!': 259 if (peek() == '=') { 260 input(); yylval.i = NE; RET(NE); 261 } else if (peek() == '~') { 262 input(); yylval.i = NOTMATCH; RET(MATCHOP); 263 } else 264 RET(NOT); 265 case '~': 266 yylval.i = MATCH; 267 RET(MATCHOP); 268 case '<': 269 if (peek() == '=') { 270 input(); yylval.i = LE; RET(LE); 271 } else { 272 yylval.i = LT; RET(LT); 273 } 274 case '=': 275 if (peek() == '=') { 276 input(); yylval.i = EQ; RET(EQ); 277 } else { 278 yylval.i = ASSIGN; RET(ASGNOP); 279 } 280 case '>': 281 if (peek() == '=') { 282 input(); yylval.i = GE; RET(GE); 283 } else if (peek() == '>') { 284 input(); yylval.i = APPEND; RET(APPEND); 285 } else { 286 yylval.i = GT; RET(GT); 287 } 288 case '+': 289 if (peek() == '+') { 290 input(); yylval.i = INCR; RET(INCR); 291 } else if (peek() == '=') { 292 input(); yylval.i = ADDEQ; RET(ASGNOP); 293 } else 294 RET('+'); 295 case '-': 296 if (peek() == '-') { 297 input(); yylval.i = DECR; RET(DECR); 298 } else if (peek() == '=') { 299 input(); yylval.i = SUBEQ; RET(ASGNOP); 300 } else 301 RET('-'); 302 case '*': 303 if (peek() == '=') { /* *= */ 304 input(); yylval.i = MULTEQ; RET(ASGNOP); 305 } else if (peek() == '*') { /* ** or **= */ 306 input(); /* eat 2nd * */ 307 if (peek() == '=') { 308 input(); yylval.i = POWEQ; RET(ASGNOP); 309 } else { 310 RET(POWER); 311 } 312 } else 313 RET('*'); 314 case '/': 315 RET('/'); 316 case '%': 317 if (peek() == '=') { 318 input(); yylval.i = MODEQ; RET(ASGNOP); 319 } else 320 RET('%'); 321 case '^': 322 if (peek() == '=') { 323 input(); yylval.i = POWEQ; RET(ASGNOP); 324 } else 325 RET(POWER); 326 327 case '$': 328 /* BUG: awkward, if not wrong */ 329 c = gettok(&buf, &bufsize); 330 if (isalpha(c)) { 331 if (strcmp(buf, "NF") == 0) { /* very special */ 332 unputstr("(NF)"); 333 RET(INDIRECT); 334 } 335 c = peek(); 336 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 337 unputstr(buf); 338 RET(INDIRECT); 339 } 340 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 341 RET(IVAR); 342 } else if (c == 0) { /* */ 343 SYNTAX( "unexpected end of input after $" ); 344 RET(';'); 345 } else { 346 unputstr(buf); 347 RET(INDIRECT); 348 } 349 350 case '}': 351 if (--bracecnt < 0) 352 SYNTAX( "extra }" ); 353 sc = true; 354 RET(';'); 355 case ']': 356 if (--brackcnt < 0) 357 SYNTAX( "extra ]" ); 358 RET(']'); 359 case ')': 360 if (--parencnt < 0) 361 SYNTAX( "extra )" ); 362 RET(')'); 363 case '{': 364 bracecnt++; 365 RET('{'); 366 case '[': 367 brackcnt++; 368 RET('['); 369 case '(': 370 parencnt++; 371 RET('('); 372 373 case '"': 374 return string(); /* BUG: should be like tran.c ? */ 375 376 default: 377 RET(c); 378 } 379 } 380 } 381 382 extern int runetochar(char *str, int c); 383 384 int string(void) 385 { 386 int c, n; 387 char *s, *bp; 388 static char *buf = NULL; 389 static int bufsz = 500; 390 391 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 392 FATAL("out of space for strings"); 393 for (bp = buf; (c = input()) != '"'; ) { 394 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 395 FATAL("out of space for string %.10s...", buf); 396 switch (c) { 397 case '\n': 398 case '\r': 399 case 0: 400 *bp = '\0'; 401 SYNTAX( "non-terminated string %.10s...", buf ); 402 if (c == 0) /* hopeless */ 403 FATAL( "giving up" ); 404 lineno++; 405 break; 406 case '\\': 407 c = input(); 408 switch (c) { 409 case '\n': break; 410 case '"': *bp++ = '"'; break; 411 case 'n': *bp++ = '\n'; break; 412 case 't': *bp++ = '\t'; break; 413 case 'f': *bp++ = '\f'; break; 414 case 'r': *bp++ = '\r'; break; 415 case 'b': *bp++ = '\b'; break; 416 case 'v': *bp++ = '\v'; break; 417 case 'a': *bp++ = '\a'; break; 418 case '\\': *bp++ = '\\'; break; 419 420 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 421 case '3': case '4': case '5': case '6': case '7': 422 n = c - '0'; 423 if ((c = peek()) >= '0' && c < '8') { 424 n = 8 * n + input() - '0'; 425 if ((c = peek()) >= '0' && c < '8') 426 n = 8 * n + input() - '0'; 427 } 428 *bp++ = n; 429 break; 430 431 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 432 { 433 int i; 434 435 n = 0; 436 for (i = 1; i <= 2; i++) { 437 c = input(); 438 if (c == 0) 439 break; 440 if (isxdigit(c)) { 441 c = tolower(c); 442 n *= 16; 443 if (isdigit(c)) 444 n += (c - '0'); 445 else 446 n += 10 + (c - 'a'); 447 } else 448 break; 449 } 450 if (n) 451 *bp++ = n; 452 else 453 unput(c); 454 break; 455 } 456 457 case 'u': /* utf \u0-9a-fA-F (1..8) */ 458 { 459 int i; 460 461 n = 0; 462 for (i = 0; i < 8; i++) { 463 c = input(); 464 if (!isxdigit(c) || c == 0) 465 break; 466 c = tolower(c); 467 n *= 16; 468 if (isdigit(c)) 469 n += (c - '0'); 470 else 471 n += 10 + (c - 'a'); 472 } 473 unput(c); 474 bp += runetochar(bp, n); 475 break; 476 } 477 478 default: 479 *bp++ = c; 480 break; 481 } 482 break; 483 default: 484 *bp++ = c; 485 break; 486 } 487 } 488 *bp = 0; 489 s = tostring(buf); 490 *bp++ = ' '; *bp++ = '\0'; 491 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 492 free(s); 493 RET(STRING); 494 } 495 496 497 static int binsearch(char *w, const Keyword *kp, int n) 498 { 499 int cond, low, mid, high; 500 501 low = 0; 502 high = n - 1; 503 while (low <= high) { 504 mid = (low + high) / 2; 505 if ((cond = strcmp(w, kp[mid].word)) < 0) 506 high = mid - 1; 507 else if (cond > 0) 508 low = mid + 1; 509 else 510 return mid; 511 } 512 return -1; 513 } 514 515 int word(char *w) 516 { 517 const Keyword *kp; 518 int c, n; 519 520 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 521 if (n != -1) { /* found in table */ 522 kp = keywords + n; 523 yylval.i = kp->sub; 524 switch (kp->type) { /* special handling */ 525 case BLTIN: 526 if (kp->sub == FSYSTEM && safe) 527 SYNTAX( "system is unsafe" ); 528 RET(kp->type); 529 case FUNC: 530 if (infunc) 531 SYNTAX( "illegal nested function" ); 532 RET(kp->type); 533 case RETURN: 534 if (!infunc) 535 SYNTAX( "return not in function" ); 536 RET(kp->type); 537 case VARNF: 538 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 539 RET(VARNF); 540 default: 541 RET(kp->type); 542 } 543 } 544 c = peek(); /* look for '(' */ 545 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 546 yylval.i = n; 547 RET(ARG); 548 } else { 549 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 550 if (c == '(') { 551 RET(CALL); 552 } else { 553 RET(VAR); 554 } 555 } 556 } 557 558 void startreg(void) /* next call to yylex will return a regular expression */ 559 { 560 reg = true; 561 } 562 563 int regexpr(void) 564 { 565 int c, openclass = 0; 566 static char *buf = NULL; 567 static int bufsz = 500; 568 char *bp, *cstart; 569 570 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 571 FATAL("out of space for reg expr"); 572 bp = buf; 573 for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) { 574 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 575 FATAL("out of space for reg expr %.10s...", buf); 576 if (c == '\n') { 577 *bp = '\0'; 578 SYNTAX( "newline in regular expression %.10s...", buf ); 579 unput('\n'); 580 break; 581 } else if (c == '\\') { 582 *bp++ = '\\'; 583 *bp++ = input(); 584 } else { 585 /* 586 * POSIX requires a slash in a regexp to be escaped, 587 * other awks don't require it to be escaped inside 588 * a character class. 589 */ 590 if (!do_posix) { 591 if (c == '[') { 592 int nextc = peek(); 593 if (openclass == 0 || nextc == ':' || 594 nextc == '.' || nextc == '=') { 595 if (++openclass == 1) 596 cstart = bp; 597 } 598 } else if (c == ']' && openclass > 0) { 599 /* 600 * A ']' as the first char in a 601 * class is treated literally. 602 */ 603 if (cstart != bp - 1 && 604 (cstart != bp - 2 || bp[-1] != '^')) 605 openclass--; 606 } 607 } 608 *bp++ = c; 609 } 610 } 611 *bp = 0; 612 if (c == 0) 613 SYNTAX("non-terminated regular expression %.10s...", buf); 614 yylval.s = tostring(buf); 615 unput('/'); 616 RET(REGEXPR); 617 } 618 619 /* low-level lexical stuff, sort of inherited from lex */ 620 621 char ebuf[300]; 622 char *ep = ebuf; 623 char yysbuf[100]; /* pushback buffer */ 624 char *yysptr = yysbuf; 625 FILE *yyin = NULL; 626 627 int input(void) /* get next lexical input character */ 628 { 629 int c; 630 extern char *lexprog; 631 632 if (yysptr > yysbuf) 633 c = (uschar)*--yysptr; 634 else if (lexprog != NULL) { /* awk '...' */ 635 if ((c = (uschar)*lexprog) != 0) 636 lexprog++; 637 } else /* awk -f ... */ 638 c = pgetc(); 639 if (c == EOF) 640 c = 0; 641 if (ep >= ebuf + sizeof ebuf) 642 ep = ebuf; 643 *ep = c; 644 if (c != 0) { 645 ep++; 646 } 647 return (c); 648 } 649 650 void unput(int c) /* put lexical character back on input */ 651 { 652 if (c == '\n') 653 lineno--; 654 if (yysptr >= yysbuf + sizeof(yysbuf)) 655 FATAL("pushed back too much: %.20s...", yysbuf); 656 *yysptr++ = c; 657 if (--ep < ebuf) 658 ep = ebuf + sizeof(ebuf) - 1; 659 } 660 661 void unputstr(const char *s) /* put a string back on input */ 662 { 663 int i; 664 665 for (i = strlen(s)-1; i >= 0; i--) 666 unput(s[i]); 667 } 668