1 /* $OpenBSD: lex.c,v 1.7 2003/07/02 21:04:09 deraadt Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "ytab.h" 32 33 extern YYSTYPE yylval; 34 extern int infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 Keyword keywords[] ={ /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "continue", CONTINUE, CONTINUE }, 55 { "cos", FCOS, BLTIN }, 56 { "delete", DELETE, DELETE }, 57 { "do", DO, DO }, 58 { "else", ELSE, ELSE }, 59 { "exit", EXIT, EXIT }, 60 { "exp", FEXP, BLTIN }, 61 { "fflush", FFLUSH, BLTIN }, 62 { "for", FOR, FOR }, 63 { "func", FUNC, FUNC }, 64 { "function", FUNC, FUNC }, 65 { "getline", GETLINE, GETLINE }, 66 { "gsub", GSUB, GSUB }, 67 { "if", IF, IF }, 68 { "in", IN, IN }, 69 { "index", INDEX, INDEX }, 70 { "int", FINT, BLTIN }, 71 { "length", FLENGTH, BLTIN }, 72 { "log", FLOG, BLTIN }, 73 { "match", MATCHFCN, MATCHFCN }, 74 { "next", NEXT, NEXT }, 75 { "nextfile", NEXTFILE, NEXTFILE }, 76 { "print", PRINT, PRINT }, 77 { "printf", PRINTF, PRINTF }, 78 { "rand", FRAND, BLTIN }, 79 { "return", RETURN, RETURN }, 80 { "sin", FSIN, BLTIN }, 81 { "split", SPLIT, SPLIT }, 82 { "sprintf", SPRINTF, SPRINTF }, 83 { "sqrt", FSQRT, BLTIN }, 84 { "srand", FSRAND, BLTIN }, 85 { "sub", SUB, SUB }, 86 { "substr", SUBSTR, SUBSTR }, 87 { "system", FSYSTEM, BLTIN }, 88 { "tolower", FTOLOWER, BLTIN }, 89 { "toupper", FTOUPPER, BLTIN }, 90 { "while", WHILE, WHILE }, 91 }; 92 93 #define DEBUG 94 #ifdef DEBUG 95 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 96 #else 97 #define RET(x) return(x) 98 #endif 99 100 int peek(void); 101 int gettok(char **, int *); 102 int binsearch(char *, Keyword *, int); 103 104 int peek(void) 105 { 106 int c = input(); 107 unput(c); 108 return c; 109 } 110 111 int gettok(char **pbuf, int *psz) /* get next input token */ 112 { 113 int c, retc; 114 char *buf = *pbuf; 115 int sz = *psz; 116 char *bp = buf; 117 118 c = input(); 119 if (c == 0) 120 return 0; 121 buf[0] = c; 122 buf[1] = 0; 123 if (!isalnum(c) && c != '.' && c != '_') 124 return c; 125 126 *bp++ = c; 127 if (isalpha(c) || c == '_') { /* it's a varname */ 128 for ( ; (c = input()) != 0; ) { 129 if (bp-buf >= sz) 130 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 131 FATAL( "out of space for name %.10s...", buf ); 132 if (isalnum(c) || c == '_') 133 *bp++ = c; 134 else { 135 *bp = 0; 136 unput(c); 137 break; 138 } 139 } 140 *bp = 0; 141 retc = 'a'; /* alphanumeric */ 142 } else { /* it's a number */ 143 char *rem; 144 /* read input until can't be a number */ 145 for ( ; (c = input()) != 0; ) { 146 if (bp-buf >= sz) 147 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 148 FATAL( "out of space for number %.10s...", buf ); 149 if (isdigit(c) || c == 'e' || c == 'E' 150 || c == '.' || c == '+' || c == '-') 151 *bp++ = c; 152 else { 153 unput(c); 154 break; 155 } 156 } 157 *bp = 0; 158 strtod(buf, &rem); /* parse the number */ 159 unputstr(rem); /* put rest back for later */ 160 if (rem == buf) { /* it wasn't a valid number at all */ 161 buf[1] = 0; /* so return one character as token */ 162 retc = buf[0]; /* character is its own type */ 163 } else { /* some prefix was a number */ 164 rem[0] = 0; /* so truncate where failure started */ 165 retc = '0'; /* number */ 166 } 167 } 168 *pbuf = buf; 169 *psz = sz; 170 return retc; 171 } 172 173 int word(char *); 174 int string(void); 175 int regexpr(void); 176 int sc = 0; /* 1 => return a } right now */ 177 int reg = 0; /* 1 => return a REGEXPR now */ 178 179 int yylex(void) 180 { 181 int c; 182 static char *buf = 0; 183 static int bufsize = 500; 184 185 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 186 FATAL( "out of space in yylex" ); 187 if (sc) { 188 sc = 0; 189 RET('}'); 190 } 191 if (reg) { 192 reg = 0; 193 return regexpr(); 194 } 195 for (;;) { 196 c = gettok(&buf, &bufsize); 197 if (c == 0) 198 return 0; 199 if (isalpha(c) || c == '_') 200 return word(buf); 201 if (isdigit(c)) { 202 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 203 /* should this also have STR set? */ 204 RET(NUMBER); 205 } 206 207 yylval.i = c; 208 switch (c) { 209 case '\n': /* {EOL} */ 210 RET(NL); 211 case '\r': /* assume \n is coming */ 212 case ' ': /* {WS}+ */ 213 case '\t': 214 break; 215 case '#': /* #.* strip comments */ 216 while ((c = input()) != '\n' && c != 0) 217 ; 218 unput(c); 219 break; 220 case ';': 221 RET(';'); 222 case '\\': 223 if (peek() == '\n') { 224 input(); 225 } else if (peek() == '\r') { 226 input(); input(); /* \n */ 227 lineno++; 228 } else { 229 RET(c); 230 } 231 break; 232 case '&': 233 if (peek() == '&') { 234 input(); RET(AND); 235 } else 236 RET('&'); 237 case '|': 238 if (peek() == '|') { 239 input(); RET(BOR); 240 } else 241 RET('|'); 242 case '!': 243 if (peek() == '=') { 244 input(); yylval.i = NE; RET(NE); 245 } else if (peek() == '~') { 246 input(); yylval.i = NOTMATCH; RET(MATCHOP); 247 } else 248 RET(NOT); 249 case '~': 250 yylval.i = MATCH; 251 RET(MATCHOP); 252 case '<': 253 if (peek() == '=') { 254 input(); yylval.i = LE; RET(LE); 255 } else { 256 yylval.i = LT; RET(LT); 257 } 258 case '=': 259 if (peek() == '=') { 260 input(); yylval.i = EQ; RET(EQ); 261 } else { 262 yylval.i = ASSIGN; RET(ASGNOP); 263 } 264 case '>': 265 if (peek() == '=') { 266 input(); yylval.i = GE; RET(GE); 267 } else if (peek() == '>') { 268 input(); yylval.i = APPEND; RET(APPEND); 269 } else { 270 yylval.i = GT; RET(GT); 271 } 272 case '+': 273 if (peek() == '+') { 274 input(); yylval.i = INCR; RET(INCR); 275 } else if (peek() == '=') { 276 input(); yylval.i = ADDEQ; RET(ASGNOP); 277 } else 278 RET('+'); 279 case '-': 280 if (peek() == '-') { 281 input(); yylval.i = DECR; RET(DECR); 282 } else if (peek() == '=') { 283 input(); yylval.i = SUBEQ; RET(ASGNOP); 284 } else 285 RET('-'); 286 case '*': 287 if (peek() == '=') { /* *= */ 288 input(); yylval.i = MULTEQ; RET(ASGNOP); 289 } else if (peek() == '*') { /* ** or **= */ 290 input(); /* eat 2nd * */ 291 if (peek() == '=') { 292 input(); yylval.i = POWEQ; RET(ASGNOP); 293 } else { 294 RET(POWER); 295 } 296 } else 297 RET('*'); 298 case '/': 299 RET('/'); 300 case '%': 301 if (peek() == '=') { 302 input(); yylval.i = MODEQ; RET(ASGNOP); 303 } else 304 RET('%'); 305 case '^': 306 if (peek() == '=') { 307 input(); yylval.i = POWEQ; RET(ASGNOP); 308 } else 309 RET(POWER); 310 311 case '$': 312 /* BUG: awkward, if not wrong */ 313 c = gettok(&buf, &bufsize); 314 if (isalpha(c)) { 315 if (strcmp(buf, "NF") == 0) { /* very special */ 316 unputstr("(NF)"); 317 RET(INDIRECT); 318 } 319 c = peek(); 320 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 321 unputstr(buf); 322 RET(INDIRECT); 323 } 324 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 325 RET(IVAR); 326 } else if (c == 0) { /* */ 327 SYNTAX( "unexpected end of input after $" ); 328 RET(';'); 329 } else { 330 unputstr(buf); 331 RET(INDIRECT); 332 } 333 334 case '}': 335 if (--bracecnt < 0) 336 SYNTAX( "extra }" ); 337 sc = 1; 338 RET(';'); 339 case ']': 340 if (--brackcnt < 0) 341 SYNTAX( "extra ]" ); 342 RET(']'); 343 case ')': 344 if (--parencnt < 0) 345 SYNTAX( "extra )" ); 346 RET(')'); 347 case '{': 348 bracecnt++; 349 RET('{'); 350 case '[': 351 brackcnt++; 352 RET('['); 353 case '(': 354 parencnt++; 355 RET('('); 356 357 case '"': 358 return string(); /* BUG: should be like tran.c ? */ 359 360 default: 361 RET(c); 362 } 363 } 364 } 365 366 int string(void) 367 { 368 int c, n; 369 char *s, *bp; 370 static char *buf = 0; 371 static int bufsz = 500; 372 373 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 374 FATAL("out of space for strings"); 375 for (bp = buf; (c = input()) != '"'; ) { 376 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 377 FATAL("out of space for string %.10s...", buf); 378 switch (c) { 379 case '\n': 380 case '\r': 381 case 0: 382 SYNTAX( "non-terminated string %.10s...", buf ); 383 lineno++; 384 if (c == 0) /* hopeless */ 385 FATAL( "giving up" ); 386 break; 387 case '\\': 388 c = input(); 389 switch (c) { 390 case '"': *bp++ = '"'; break; 391 case 'n': *bp++ = '\n'; break; 392 case 't': *bp++ = '\t'; break; 393 case 'f': *bp++ = '\f'; break; 394 case 'r': *bp++ = '\r'; break; 395 case 'b': *bp++ = '\b'; break; 396 case 'v': *bp++ = '\v'; break; 397 case 'a': *bp++ = '\007'; break; 398 case '\\': *bp++ = '\\'; break; 399 400 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 401 case '3': case '4': case '5': case '6': case '7': 402 n = c - '0'; 403 if ((c = peek()) >= '0' && c < '8') { 404 n = 8 * n + input() - '0'; 405 if ((c = peek()) >= '0' && c < '8') 406 n = 8 * n + input() - '0'; 407 } 408 *bp++ = n; 409 break; 410 411 case 'x': /* hex \x0-9a-fA-F + */ 412 { char xbuf[100], *px; 413 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 414 if (isdigit(c) 415 || (c >= 'a' && c <= 'f') 416 || (c >= 'A' && c <= 'F')) 417 *px++ = c; 418 else 419 break; 420 } 421 *px = 0; 422 unput(c); 423 sscanf(xbuf, "%x", &n); 424 *bp++ = n; 425 break; 426 } 427 428 default: 429 *bp++ = c; 430 break; 431 } 432 break; 433 default: 434 *bp++ = c; 435 break; 436 } 437 } 438 *bp = 0; 439 s = tostring(buf); 440 *bp++ = ' '; *bp++ = 0; 441 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 442 RET(STRING); 443 } 444 445 446 int binsearch(char *w, Keyword *kp, int n) 447 { 448 int cond, low, mid, high; 449 450 low = 0; 451 high = n - 1; 452 while (low <= high) { 453 mid = (low + high) / 2; 454 if ((cond = strcmp(w, kp[mid].word)) < 0) 455 high = mid - 1; 456 else if (cond > 0) 457 low = mid + 1; 458 else 459 return mid; 460 } 461 return -1; 462 } 463 464 int word(char *w) 465 { 466 Keyword *kp; 467 int c, n; 468 469 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 470 kp = keywords + n; 471 if (n != -1) { /* found in table */ 472 yylval.i = kp->sub; 473 switch (kp->type) { /* special handling */ 474 case FSYSTEM: 475 if (safe) 476 SYNTAX( "system is unsafe" ); 477 RET(kp->type); 478 case FUNC: 479 if (infunc) 480 SYNTAX( "illegal nested function" ); 481 RET(kp->type); 482 case RETURN: 483 if (!infunc) 484 SYNTAX( "return not in function" ); 485 RET(kp->type); 486 case VARNF: 487 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 488 RET(VARNF); 489 default: 490 RET(kp->type); 491 } 492 } 493 c = peek(); /* look for '(' */ 494 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 495 yylval.i = n; 496 RET(ARG); 497 } else { 498 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 499 if (c == '(') { 500 RET(CALL); 501 } else { 502 RET(VAR); 503 } 504 } 505 } 506 507 void startreg(void) /* next call to yylex will return a regular expression */ 508 { 509 reg = 1; 510 } 511 512 int regexpr(void) 513 { 514 int c; 515 static char *buf = 0; 516 static int bufsz = 500; 517 char *bp; 518 519 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 520 FATAL("out of space for rex expr"); 521 bp = buf; 522 for ( ; (c = input()) != '/' && c != 0; ) { 523 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 524 FATAL("out of space for reg expr %.10s...", buf); 525 if (c == '\n') { 526 SYNTAX( "newline in regular expression %.10s...", buf ); 527 unput('\n'); 528 break; 529 } else if (c == '\\') { 530 *bp++ = '\\'; 531 *bp++ = input(); 532 } else { 533 *bp++ = c; 534 } 535 } 536 *bp = 0; 537 yylval.s = tostring(buf); 538 unput('/'); 539 RET(REGEXPR); 540 } 541 542 /* low-level lexical stuff, sort of inherited from lex */ 543 544 char ebuf[300]; 545 char *ep = ebuf; 546 char yysbuf[100]; /* pushback buffer */ 547 char *yysptr = yysbuf; 548 FILE *yyin = 0; 549 550 int input(void) /* get next lexical input character */ 551 { 552 int c; 553 extern char *lexprog; 554 555 if (yysptr > yysbuf) 556 c = *--yysptr; 557 else if (lexprog != NULL) { /* awk '...' */ 558 if ((c = *lexprog) != 0) 559 lexprog++; 560 } else /* awk -f ... */ 561 c = pgetc(); 562 if (c == '\n') 563 lineno++; 564 else if (c == EOF) 565 c = 0; 566 if (ep >= ebuf + sizeof ebuf) 567 ep = ebuf; 568 return *ep++ = c; 569 } 570 571 void unput(int c) /* put lexical character back on input */ 572 { 573 if (c == '\n') 574 lineno--; 575 if (yysptr >= yysbuf + sizeof(yysbuf)) 576 FATAL("pushed back too much: %.20s...", yysbuf); 577 *yysptr++ = c; 578 if (--ep < ebuf) 579 ep = ebuf + sizeof(ebuf) - 1; 580 } 581 582 void unputstr(const char *s) /* put a string back on input */ 583 { 584 int i; 585 586 for (i = strlen(s)-1; i >= 0; i--) 587 unput(s[i]); 588 } 589