1 /* $OpenBSD: lex.c,v 1.4 1999/12/08 23:09:45 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "ytab.h" 32 33 extern YYSTYPE yylval; 34 extern int infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 Keyword keywords[] ={ /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "continue", CONTINUE, CONTINUE }, 55 { "cos", FCOS, BLTIN }, 56 { "delete", DELETE, DELETE }, 57 { "do", DO, DO }, 58 { "else", ELSE, ELSE }, 59 { "exit", EXIT, EXIT }, 60 { "exp", FEXP, BLTIN }, 61 { "fflush", FFLUSH, BLTIN }, 62 { "for", FOR, FOR }, 63 { "func", FUNC, FUNC }, 64 { "function", FUNC, FUNC }, 65 { "getline", GETLINE, GETLINE }, 66 { "gsub", GSUB, GSUB }, 67 { "if", IF, IF }, 68 { "in", IN, IN }, 69 { "index", INDEX, INDEX }, 70 { "int", FINT, BLTIN }, 71 { "length", FLENGTH, BLTIN }, 72 { "log", FLOG, BLTIN }, 73 { "match", MATCHFCN, MATCHFCN }, 74 { "next", NEXT, NEXT }, 75 { "nextfile", NEXTFILE, NEXTFILE }, 76 { "print", PRINT, PRINT }, 77 { "printf", PRINTF, PRINTF }, 78 { "rand", FRAND, BLTIN }, 79 { "return", RETURN, RETURN }, 80 { "sin", FSIN, BLTIN }, 81 { "split", SPLIT, SPLIT }, 82 { "sprintf", SPRINTF, SPRINTF }, 83 { "sqrt", FSQRT, BLTIN }, 84 { "srand", FSRAND, BLTIN }, 85 { "sub", SUB, SUB }, 86 { "substr", SUBSTR, SUBSTR }, 87 { "system", FSYSTEM, BLTIN }, 88 { "tolower", FTOLOWER, BLTIN }, 89 { "toupper", FTOUPPER, BLTIN }, 90 { "while", WHILE, WHILE }, 91 }; 92 93 #define DEBUG 94 #ifdef DEBUG 95 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 96 #else 97 #define RET(x) return(x) 98 #endif 99 100 int peek(void) 101 { 102 int c = input(); 103 unput(c); 104 return c; 105 } 106 107 int gettok(char **pbuf, int *psz) /* get next input token */ 108 { 109 int c; 110 char *buf = *pbuf; 111 int sz = *psz; 112 char *bp = buf; 113 114 c = input(); 115 if (c == 0) 116 return 0; 117 buf[0] = c; 118 buf[1] = 0; 119 if (!isalnum(c) && c != '.' && c != '_') 120 return c; 121 122 *bp++ = c; 123 if (isalpha(c) || c == '_') { /* it's a varname */ 124 for ( ; (c = input()) != 0; ) { 125 if (bp-buf >= sz) 126 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 127 FATAL( "out of space for name %.10s...", buf ); 128 if (isalnum(c) || c == '_') 129 *bp++ = c; 130 else { 131 *bp = 0; 132 unput(c); 133 break; 134 } 135 } 136 *bp = 0; 137 } else { /* it's a number */ 138 char *rem; 139 /* read input until can't be a number */ 140 for ( ; (c = input()) != 0; ) { 141 if (bp-buf >= sz) 142 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 143 FATAL( "out of space for number %.10s...", buf ); 144 if (isdigit(c) || c == 'e' || c == 'E' 145 || c == '.' || c == '+' || c == '-') 146 *bp++ = c; 147 else { 148 unput(c); 149 break; 150 } 151 } 152 *bp = 0; 153 strtod(buf, &rem); /* parse the number */ 154 unputstr(rem); /* put rest back for later */ 155 rem[0] = 0; 156 } 157 *pbuf = buf; 158 *psz = sz; 159 return buf[0]; 160 } 161 162 int word(char *); 163 int string(void); 164 int regexpr(void); 165 int sc = 0; /* 1 => return a } right now */ 166 int reg = 0; /* 1 => return a REGEXPR now */ 167 168 int yylex(void) 169 { 170 int c; 171 static char *buf = 0; 172 static int bufsize = 500; 173 174 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 175 FATAL( "out of space in yylex" ); 176 if (sc) { 177 sc = 0; 178 RET('}'); 179 } 180 if (reg) { 181 reg = 0; 182 return regexpr(); 183 } 184 for (;;) { 185 c = gettok(&buf, &bufsize); 186 if (c == 0) 187 return 0; 188 if (isalpha(c) || c == '_') 189 return word(buf); 190 if (isdigit(c) || c == '.') { 191 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 192 /* should this also have STR set? */ 193 RET(NUMBER); 194 } 195 196 yylval.i = c; 197 switch (c) { 198 case '\n': /* {EOL} */ 199 RET(NL); 200 case '\r': /* assume \n is coming */ 201 case ' ': /* {WS}+ */ 202 case '\t': 203 break; 204 case '#': /* #.* strip comments */ 205 while ((c = input()) != '\n' && c != 0) 206 ; 207 unput(c); 208 break; 209 case ';': 210 RET(';'); 211 case '\\': 212 if (peek() == '\n') { 213 input(); 214 } else if (peek() == '\r') { 215 input(); input(); /* \n */ 216 lineno++; 217 } else { 218 RET(c); 219 } 220 break; 221 case '&': 222 if (peek() == '&') { 223 input(); RET(AND); 224 } else 225 RET('&'); 226 case '|': 227 if (peek() == '|') { 228 input(); RET(BOR); 229 } else 230 RET('|'); 231 case '!': 232 if (peek() == '=') { 233 input(); yylval.i = NE; RET(NE); 234 } else if (peek() == '~') { 235 input(); yylval.i = NOTMATCH; RET(MATCHOP); 236 } else 237 RET(NOT); 238 case '~': 239 yylval.i = MATCH; 240 RET(MATCHOP); 241 case '<': 242 if (peek() == '=') { 243 input(); yylval.i = LE; RET(LE); 244 } else { 245 yylval.i = LT; RET(LT); 246 } 247 case '=': 248 if (peek() == '=') { 249 input(); yylval.i = EQ; RET(EQ); 250 } else { 251 yylval.i = ASSIGN; RET(ASGNOP); 252 } 253 case '>': 254 if (peek() == '=') { 255 input(); yylval.i = GE; RET(GE); 256 } else if (peek() == '>') { 257 input(); yylval.i = APPEND; RET(APPEND); 258 } else { 259 yylval.i = GT; RET(GT); 260 } 261 case '+': 262 if (peek() == '+') { 263 input(); yylval.i = INCR; RET(INCR); 264 } else if (peek() == '=') { 265 input(); yylval.i = ADDEQ; RET(ASGNOP); 266 } else 267 RET('+'); 268 case '-': 269 if (peek() == '-') { 270 input(); yylval.i = DECR; RET(DECR); 271 } else if (peek() == '=') { 272 input(); yylval.i = SUBEQ; RET(ASGNOP); 273 } else 274 RET('-'); 275 case '*': 276 if (peek() == '=') { /* *= */ 277 input(); yylval.i = MULTEQ; RET(ASGNOP); 278 } else if (peek() == '*') { /* ** or **= */ 279 input(); /* eat 2nd * */ 280 if (peek() == '=') { 281 input(); yylval.i = POWEQ; RET(ASGNOP); 282 } else { 283 RET(POWER); 284 } 285 } else 286 RET('*'); 287 case '/': 288 RET('/'); 289 case '%': 290 if (peek() == '=') { 291 input(); yylval.i = MODEQ; RET(ASGNOP); 292 } else 293 RET('%'); 294 case '^': 295 if (peek() == '=') { 296 input(); yylval.i = POWEQ; RET(ASGNOP); 297 } else 298 RET(POWER); 299 300 case '$': 301 /* BUG: awkward, if not wrong */ 302 c = gettok(&buf, &bufsize); 303 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 304 unputstr(buf); 305 RET(INDIRECT); 306 } else if (isalpha(c)) { 307 if (strcmp(buf, "NF") == 0) { /* very special */ 308 unputstr("(NF)"); 309 RET(INDIRECT); 310 } 311 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 312 RET(IVAR); 313 } else { 314 unputstr(buf); 315 RET(INDIRECT); 316 } 317 318 case '}': 319 if (--bracecnt < 0) 320 SYNTAX( "extra }" ); 321 sc = 1; 322 RET(';'); 323 case ']': 324 if (--brackcnt < 0) 325 SYNTAX( "extra ]" ); 326 RET(']'); 327 case ')': 328 if (--parencnt < 0) 329 SYNTAX( "extra )" ); 330 RET(')'); 331 case '{': 332 bracecnt++; 333 RET('{'); 334 case '[': 335 brackcnt++; 336 RET('['); 337 case '(': 338 parencnt++; 339 RET('('); 340 341 case '"': 342 return string(); /* BUG: should be like tran.c ? */ 343 344 default: 345 RET(c); 346 } 347 } 348 } 349 350 int string(void) 351 { 352 int c, n; 353 char *s, *bp; 354 static char *buf = 0; 355 static int bufsz = 500; 356 357 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 358 FATAL("out of space for strings"); 359 for (bp = buf; (c = input()) != '"'; ) { 360 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 361 FATAL("out of space for string %.10s...", buf); 362 switch (c) { 363 case '\n': 364 case '\r': 365 case 0: 366 SYNTAX( "non-terminated string %.10s...", buf ); 367 lineno++; 368 break; 369 case '\\': 370 c = input(); 371 switch (c) { 372 case '"': *bp++ = '"'; break; 373 case 'n': *bp++ = '\n'; break; 374 case 't': *bp++ = '\t'; break; 375 case 'f': *bp++ = '\f'; break; 376 case 'r': *bp++ = '\r'; break; 377 case 'b': *bp++ = '\b'; break; 378 case 'v': *bp++ = '\v'; break; 379 case 'a': *bp++ = '\007'; break; 380 case '\\': *bp++ = '\\'; break; 381 382 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 383 case '3': case '4': case '5': case '6': case '7': 384 n = c - '0'; 385 if ((c = peek()) >= '0' && c < '8') { 386 n = 8 * n + input() - '0'; 387 if ((c = peek()) >= '0' && c < '8') 388 n = 8 * n + input() - '0'; 389 } 390 *bp++ = n; 391 break; 392 393 case 'x': /* hex \x0-9a-fA-F + */ 394 { char xbuf[100], *px; 395 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 396 if (isdigit(c) 397 || (c >= 'a' && c <= 'f') 398 || (c >= 'A' && c <= 'F')) 399 *px++ = c; 400 else 401 break; 402 } 403 *px = 0; 404 unput(c); 405 sscanf(xbuf, "%x", &n); 406 *bp++ = n; 407 break; 408 } 409 410 default: 411 *bp++ = c; 412 break; 413 } 414 break; 415 default: 416 *bp++ = c; 417 break; 418 } 419 } 420 *bp = 0; 421 s = tostring(buf); 422 *bp++ = ' '; *bp++ = 0; 423 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 424 RET(STRING); 425 } 426 427 428 int binsearch(char *w, Keyword *kp, int n) 429 { 430 int cond, low, mid, high; 431 432 low = 0; 433 high = n - 1; 434 while (low <= high) { 435 mid = (low + high) / 2; 436 if ((cond = strcmp(w, kp[mid].word)) < 0) 437 high = mid - 1; 438 else if (cond > 0) 439 low = mid + 1; 440 else 441 return mid; 442 } 443 return -1; 444 } 445 446 int word(char *w) 447 { 448 Keyword *kp; 449 int c, n; 450 451 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 452 kp = keywords + n; 453 if (n != -1) { /* found in table */ 454 yylval.i = kp->sub; 455 switch (kp->type) { /* special handling */ 456 case FSYSTEM: 457 if (safe) 458 SYNTAX( "system is unsafe" ); 459 RET(kp->type); 460 case FUNC: 461 if (infunc) 462 SYNTAX( "illegal nested function" ); 463 RET(kp->type); 464 case RETURN: 465 if (!infunc) 466 SYNTAX( "return not in function" ); 467 RET(kp->type); 468 case VARNF: 469 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 470 RET(VARNF); 471 default: 472 RET(kp->type); 473 } 474 } 475 c = peek(); /* look for '(' */ 476 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 477 yylval.i = n; 478 RET(ARG); 479 } else { 480 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 481 if (c == '(') { 482 RET(CALL); 483 } else { 484 RET(VAR); 485 } 486 } 487 } 488 489 void startreg(void) /* next call to yyles will return a regular expression */ 490 { 491 reg = 1; 492 } 493 494 int regexpr(void) 495 { 496 int c; 497 static char *buf = 0; 498 static int bufsz = 500; 499 char *bp; 500 501 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 502 FATAL("out of space for rex expr"); 503 bp = buf; 504 for ( ; (c = input()) != '/' && c != 0; ) { 505 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 506 FATAL("out of space for reg expr %.10s...", buf); 507 if (c == '\n') { 508 SYNTAX( "newline in regular expression %.10s...", buf ); 509 unput('\n'); 510 break; 511 } else if (c == '\\') { 512 *bp++ = '\\'; 513 *bp++ = input(); 514 } else { 515 *bp++ = c; 516 } 517 } 518 *bp = 0; 519 yylval.s = tostring(buf); 520 unput('/'); 521 RET(REGEXPR); 522 } 523 524 /* low-level lexical stuff, sort of inherited from lex */ 525 526 char ebuf[300]; 527 char *ep = ebuf; 528 char yysbuf[100]; /* pushback buffer */ 529 char *yysptr = yysbuf; 530 FILE *yyin = 0; 531 532 int input(void) /* get next lexical input character */ 533 { 534 int c; 535 extern char *lexprog; 536 537 if (yysptr > yysbuf) 538 c = *--yysptr; 539 else if (lexprog != NULL) { /* awk '...' */ 540 if ((c = *lexprog) != 0) 541 lexprog++; 542 } else /* awk -f ... */ 543 c = pgetc(); 544 if (c == '\n') 545 lineno++; 546 else if (c == EOF) 547 c = 0; 548 if (ep >= ebuf + sizeof ebuf) 549 ep = ebuf; 550 return *ep++ = c; 551 } 552 553 void unput(int c) /* put lexical character back on input */ 554 { 555 if (c == '\n') 556 lineno--; 557 if (yysptr >= yysbuf + sizeof(yysbuf)) 558 FATAL("pushed back too much: %.20s...", yysbuf); 559 *yysptr++ = c; 560 if (--ep < ebuf) 561 ep = ebuf + sizeof(ebuf) - 1; 562 } 563 564 void unputstr(char *s) /* put a string back on input */ 565 { 566 int i; 567 568 for (i = strlen(s)-1; i >= 0; i--) 569 unput(s[i]); 570 } 571