1 /*- 2 * Copyright (c) 1992 Diomidis Spinellis. 3 * Copyright (c) 1992 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Diomidis Spinellis of Imperial College, University of London. 8 * 9 * %sccs.include.redist.c% 10 */ 11 12 #ifndef lint 13 static char sccsid[] = "@(#)compile.c 5.1 (Berkeley) 08/23/92"; 14 #endif /* not lint */ 15 16 #include <sys/types.h> 17 #include <sys/stat.h> 18 19 #include <ctype.h> 20 #include <errno.h> 21 #include <fcntl.h> 22 #include <limits.h> 23 #include <regex.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 28 #include "defs.h" 29 #include "extern.h" 30 31 static char *compile_addr __P((char *, struct s_addr *)); 32 static char *compile_delimited __P((char *, char *)); 33 static char *compile_flags __P((char *, struct s_subst *)); 34 static char *compile_re __P((char *, regex_t *, int)); 35 static char *compile_subst __P((char *, char **, size_t)); 36 static char *compile_text __P((void)); 37 static char *compile_tr __P((char *, char **)); 38 static struct s_command 39 **compile_stream __P((char *, struct s_command **, char *)); 40 static char *duptoeol __P((char *)); 41 static struct s_command 42 *findlabel __P((struct s_command *, struct s_command *)); 43 static void fixuplabel __P((struct s_command *, struct s_command *)); 44 45 /* 46 * Command specification. This is used to drive the command parser. 47 */ 48 struct s_format { 49 char code; /* Command code */ 50 int naddr; /* Number of address args */ 51 enum e_args args; /* Argument type */ 52 }; 53 54 static struct s_format cmd_fmts[] = { 55 {'{', 2, GROUP}, 56 {'a', 1, TEXT}, 57 {'b', 2, BRANCH}, 58 {'c', 2, TEXT}, 59 {'d', 2, EMPTY}, 60 {'D', 2, EMPTY}, 61 {'g', 2, EMPTY}, 62 {'G', 2, EMPTY}, 63 {'h', 2, EMPTY}, 64 {'H', 2, EMPTY}, 65 {'i', 1, TEXT}, 66 {'l', 2, EMPTY}, 67 {'n', 2, EMPTY}, 68 {'N', 2, EMPTY}, 69 {'p', 2, EMPTY}, 70 {'P', 2, EMPTY}, 71 {'q', 1, EMPTY}, 72 {'r', 1, RFILE}, 73 {'s', 2, SUBST}, 74 {'t', 2, BRANCH}, 75 {'w', 2, WFILE}, 76 {'x', 2, EMPTY}, 77 {'y', 2, TR}, 78 {'!', 2, NONSEL}, 79 {':', 0, LABEL}, 80 {'#', 0, COMMENT}, 81 {'=', 1, EMPTY}, 82 {'\0', 0, COMMENT}, 83 }; 84 85 /* The compiled program */ 86 struct s_command *prog; 87 88 /* 89 * Compile the program into prog. 90 * Initialise appends 91 */ 92 void 93 compile() 94 { 95 *compile_stream(NULL, &prog, NULL) = NULL; 96 fixuplabel(prog, prog); 97 appends = xmalloc(sizeof(struct s_appends) * appendnum); 98 } 99 100 #define EATSPACE() do { \ 101 if (p) \ 102 while (*p && isascii(*p) && isspace(*p)) \ 103 p++; \ 104 } while (0) 105 106 static struct s_command ** 107 compile_stream(terminator, link, p) 108 char *terminator; 109 struct s_command **link; 110 register char *p; 111 { 112 static char lbuf[_POSIX2_LINE_MAX + 1]; /* To save stack */ 113 struct s_command *cmd, *cmd2; 114 struct s_format *fp; 115 int naddr; /* Number of addresses */ 116 117 if (p != NULL) 118 goto semicolon; 119 for (;;) { 120 if ((p = cu_fgets(lbuf, sizeof(lbuf))) == NULL) { 121 if (terminator != NULL) 122 err(COMPILE, "unexpected EOF (pending }'s)"); 123 return (link); 124 } 125 126 semicolon: EATSPACE(); 127 if (p && (*p == '#' || *p == '\0')) 128 continue; 129 if (*p == '}') { 130 if (terminator == NULL) 131 err(COMPILE, "unexpected }"); 132 return (link); 133 } 134 *link = cmd = xmalloc(sizeof(struct s_command)); 135 link = &cmd->next; 136 cmd->nonsel = cmd->inrange = 0; 137 /* First parse the addresses */ 138 naddr = 0; 139 cmd->a1 = cmd->a2 = NULL; 140 141 /* Valid characters to start an address */ 142 #define addrchar(c) (strchr("0123456789/\\$", (c))) 143 if (addrchar(*p)) { 144 naddr++; 145 cmd->a1 = xmalloc(sizeof(struct s_addr)); 146 p = compile_addr(p, cmd->a1); 147 EATSPACE(); /* EXTENSION */ 148 if (*p == ',') { 149 naddr++; 150 p++; 151 EATSPACE(); /* EXTENSION */ 152 cmd->a2 = xmalloc(sizeof(struct s_addr)); 153 p = compile_addr(p, cmd->a2); 154 } 155 } 156 157 nonsel: /* Now parse the command */ 158 EATSPACE(); 159 if (!*p) 160 err(COMPILE, "command expected"); 161 cmd->code = *p; 162 for (fp = cmd_fmts; fp->code; fp++) 163 if (fp->code == *p) 164 break; 165 if (!fp->code) 166 err(COMPILE, "invalid command code %c", *p); 167 if (naddr > fp->naddr) 168 err(COMPILE, 169 "command %c expects up to %d address(es), found %d", *p, fp->naddr, naddr); 170 switch (fp->args) { 171 case NONSEL: /* ! */ 172 cmd->nonsel = ! cmd->nonsel; 173 p++; 174 goto nonsel; 175 case GROUP: /* { */ 176 p++; 177 EATSPACE(); 178 if (!*p) 179 p = NULL; 180 cmd2 = xmalloc(sizeof(struct s_command)); 181 cmd2->code = '}'; 182 *compile_stream("}", &cmd->u.c, p) = cmd2; 183 cmd->next = cmd2; 184 link = &cmd2->next; 185 break; 186 case EMPTY: /* d D g G h H l n N p P q x = \0 */ 187 p++; 188 EATSPACE(); 189 if (*p == ';') { 190 p++; 191 link = &cmd->next; 192 goto semicolon; 193 } 194 if (*p) 195 err(COMPILE, 196 "extra characters at the end of %c command", cmd->code); 197 break; 198 case TEXT: /* a c i */ 199 p++; 200 EATSPACE(); 201 if (*p != '\\') 202 err(COMPILE, 203 "command %c expects \\ followed by text", cmd->code); 204 p++; 205 EATSPACE(); 206 if (*p) 207 err(COMPILE, 208 "extra characters after \\ at the end of %c command", cmd->code); 209 cmd->t = compile_text(); 210 break; 211 case COMMENT: /* \0 # */ 212 break; 213 case WFILE: /* w */ 214 p++; 215 EATSPACE(); 216 if (*p == '\0') 217 err(COMPILE, "filename expected"); 218 cmd->t = duptoeol(p); 219 if (aflag) 220 cmd->u.fd = -1; 221 else if ((cmd->u.fd = open(p, 222 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 223 DEFFILEMODE)) == -1) 224 err(FATAL, "%s: %s\n", p, strerror(errno)); 225 break; 226 case RFILE: /* r */ 227 p++; 228 EATSPACE(); 229 if (*p == '\0') 230 err(COMPILE, "filename expected"); 231 else 232 cmd->t = duptoeol(p); 233 break; 234 case BRANCH: /* b t */ 235 p++; 236 EATSPACE(); 237 if (*p == '\0') 238 cmd->t = NULL; 239 else 240 cmd->t = duptoeol(p); 241 break; 242 case LABEL: /* : */ 243 p++; 244 EATSPACE(); 245 cmd->t = duptoeol(p); 246 if (strlen(p) == 0) 247 err(COMPILE, "empty label"); 248 break; 249 case SUBST: /* s */ 250 p++; 251 if (*p == '\0' || *p == '\\') 252 err(COMPILE, 253 "substitute pattern can not be delimited by newline or backslash"); 254 cmd->u.s = xmalloc(sizeof(struct s_subst)); 255 p = compile_re(p, &cmd->u.s->re, 0); 256 if (p == NULL) 257 err(COMPILE, "newline in substitution pattern"); 258 cmd->u.s->pmatch = xmalloc((cmd->u.s->re.re_nsub + 1) * 259 sizeof(regmatch_t)); 260 p--; 261 p = compile_subst(p, 262 &cmd->u.s->new, cmd->u.s->re.re_nsub); 263 if (p == NULL) 264 err(COMPILE, 265 "unterminated substitute replace in regular expression"); 266 p = compile_flags(p, cmd->u.s); 267 EATSPACE(); 268 if (*p == ';') { 269 p++; 270 link = &cmd->next; 271 goto semicolon; 272 } 273 break; 274 case TR: /* y */ 275 p++; 276 p = compile_tr(p, (char **)&cmd->u.y); 277 EATSPACE(); 278 if (*p == ';') { 279 p++; 280 link = &cmd->next; 281 goto semicolon; 282 } 283 if (*p) 284 err(COMPILE, 285 "extra text at the end of a transform command"); 286 break; 287 } 288 } 289 } 290 291 /* 292 * Get a delimited string. P points to the delimeter of the string; d points 293 * to a buffer area. Newline and delimiter escapes are processed; other 294 * escapes are ignored. 295 * 296 * Returns a pointer to the first character after the final delimiter or NULL 297 * in the case of a non-terminated string. The character array d is filled 298 * with the processed string. 299 */ 300 static char * 301 compile_delimited(p, d) 302 char *p, *d; 303 { 304 char c; 305 306 c = *p++; 307 if (c == '\0') 308 return (NULL); 309 else if (c == '\\') 310 err(COMPILE, "\\ can not be used as a string delimiter"); 311 else if (c == '\n') 312 err(COMPILE, "newline can not be used as a string delimiter"); 313 while (*p) { 314 if (*p == '\\' && p[1] == c) 315 p++; 316 else if (*p == '\\' && p[1] == 'n') { 317 *d++ = '\n'; 318 p += 2; 319 continue; 320 } else if (*p == c) { 321 *d = '\0'; 322 return (p + 1); 323 } 324 *d++ = *p++; 325 } 326 return (NULL); 327 } 328 329 /* 330 * Get a regular expression. P points to the delimeter of the regular 331 * expression; d points a regexp pointer. Newline and delimiter escapes 332 * are processed; other escapes are ignored. 333 * Returns a pointer to the first character after the final delimiter 334 * or NULL in the case of a non terminated regular expression. 335 * The regexp pointer is set to the compiled regular expression. 336 * Cflags are passed to regcomp. 337 */ 338 static char * 339 compile_re(p, rep, cflags) 340 char *p; 341 regex_t *rep; 342 int cflags; 343 { 344 int eval; 345 char re[_POSIX2_LINE_MAX + 1]; 346 347 p = compile_delimited(p, re); 348 if (p && (eval = regcomp(rep, re, cflags)) != 0) 349 err(COMPILE, "RE error: %s", strregerror(eval, rep)); 350 return (p); 351 } 352 353 /* 354 * Compile the substitution string of a regular expression and set res to 355 * point to a saved copy of it. Nsub is the number of parenthesized regular 356 * expressions. 357 */ 358 static char * 359 compile_subst(p, res, nsub) 360 char *p, **res; 361 size_t nsub; 362 { 363 static char lbuf[_POSIX2_LINE_MAX + 1]; 364 int asize, size; 365 char c, *text, *op, *s; 366 367 c = *p++; /* Terminator character */ 368 if (c == '\0') 369 return (NULL); 370 371 asize = 2 * _POSIX2_LINE_MAX + 1; 372 text = xmalloc(asize); 373 size = 0; 374 do { 375 op = s = text + size; 376 for (; *p; p++) { 377 if (*p == '\\') { 378 p++; 379 if (strchr("123456789", *p) != NULL) { 380 *s++ = '\\'; 381 if (*p - '1' > nsub) 382 err(COMPILE, 383 "\\%c not defined in regular expression (use \\1-\\%d)", *p, nsub + 1); 384 } else if (*p == '&') 385 *s++ = '\\'; 386 } else if (*p == c) { 387 p++; 388 *s++ = '\0'; 389 size += s - op; 390 *res = xrealloc(text, size); 391 return (p); 392 } else if (*p == '\n') { 393 err(COMPILE, 394 "unescaped newline inside substitute pattern"); 395 return (NULL); 396 } 397 *s++ = *p; 398 } 399 size += s - op; 400 if (asize - size < _POSIX2_LINE_MAX + 1) { 401 asize *= 2; 402 text = xmalloc(asize); 403 } 404 } while (cu_fgets(p = lbuf, sizeof(lbuf))); 405 err(COMPILE, "EOF in substitute pattern"); 406 return (NULL); 407 } 408 409 /* 410 * Compile the flags of the s command 411 */ 412 static char * 413 compile_flags(p, s) 414 char *p; 415 struct s_subst *s; 416 { 417 int gn; /* True if we have seen g or n */ 418 char wfile[_POSIX2_LINE_MAX + 1], *q; 419 420 s->n = 1; /* Default */ 421 s->p = 0; 422 s->wfile = NULL; 423 s->wfd = -1; 424 for (gn = 0;;) { 425 EATSPACE(); /* EXTENSION */ 426 switch (*p) { 427 case 'g': 428 if (gn) 429 err(WARNING, 430 "both g and number in substitute flags"); 431 gn = 1; 432 s->n = 0; 433 break; 434 case '\0': 435 case '\n': 436 case ';': 437 return (p); 438 case 'p': 439 s->p = 1; 440 break; 441 case '1': case '2': case '3': 442 case '4': case '5': case '6': 443 case '7': case '8': case '9': 444 if (gn) 445 err(WARNING, 446 "both g and number in substitute flags"); 447 gn = 1; 448 /* XXX Check for overflow */ 449 s->n = (int)strtol(p, &p, 10); 450 break; 451 case 'w': 452 p++; 453 #ifdef HISTORIC_PRACTICE 454 if (*p != ' ') { 455 err(WARNING, "space missing before w wfile"); 456 return (p); 457 } 458 #endif 459 EATSPACE(); 460 q = wfile; 461 while (*p) { 462 if (*p == '\n') 463 break; 464 *q++ = *p++; 465 } 466 *q = '\0'; 467 if (q == wfile) 468 err(COMPILE, "empty wfile specified"); 469 s->wfile = strdup(wfile); 470 if (!aflag && (s->wfd = open(wfile, 471 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 472 DEFFILEMODE)) == -1) 473 err(FATAL, "%s: %s\n", wfile, strerror(errno)); 474 return (p); 475 default: 476 err(COMPILE, 477 "bad flag in substitute command: '%c'", p[-1]); 478 break; 479 } 480 p++; 481 } 482 } 483 484 /* 485 * Compile a translation set of strings into a lookup table. 486 */ 487 static char * 488 compile_tr(p, transtab) 489 char *p; 490 char **transtab; 491 { 492 int i; 493 char *lt, *op, *np; 494 char old[_POSIX2_LINE_MAX + 1]; 495 char new[_POSIX2_LINE_MAX + 1]; 496 497 if (*p == '\0' || *p == '\\') 498 err(COMPILE, 499 "transform pattern can not be delimited by newline or backslash"); 500 p = compile_delimited(p, old); 501 if (p == NULL) { 502 err(COMPILE, "unterminated transform source string"); 503 return (NULL); 504 } 505 p = compile_delimited(--p, new); 506 if (p == NULL) { 507 err(COMPILE, "unterminated transform target string"); 508 return (NULL); 509 } 510 EATSPACE(); 511 if (strlen(new) != strlen(old)) { 512 err(COMPILE, "transform strings are not the same length"); 513 return (NULL); 514 } 515 /* We assume characters are 8 bits */ 516 lt = xmalloc(UCHAR_MAX); 517 for (i = 0; i <= UCHAR_MAX; i++) 518 lt[i] = (char)i; 519 for (op = old, np = new; *op; op++, np++) 520 lt[(u_char)*op] = *np; 521 *transtab = lt; 522 return (p); 523 } 524 525 /* 526 * Compile the text following an a or i command. 527 */ 528 static char * 529 compile_text() 530 { 531 int asize, size; 532 char *text, *p, *op, *s; 533 char lbuf[_POSIX2_LINE_MAX + 1]; 534 535 asize = 2 * _POSIX2_LINE_MAX + 1; 536 text = xmalloc(asize); 537 size = 0; 538 while (cu_fgets(lbuf, sizeof(lbuf))) { 539 op = s = text + size; 540 p = lbuf; 541 EATSPACE(); 542 for (; *p; p++) { 543 if (*p == '\\') 544 p++; 545 *s++ = *p; 546 } 547 size += s - op; 548 if (p[-2] != '\\') { 549 *s = '\0'; 550 break; 551 } 552 if (asize - size < _POSIX2_LINE_MAX + 1) { 553 asize *= 2; 554 text = xmalloc(asize); 555 } 556 } 557 return (xrealloc(text, size + 1)); 558 } 559 560 /* 561 * Get an address and return a pointer to the first character after 562 * it. Fill the structure pointed to according to the address. 563 */ 564 static char * 565 compile_addr(p, a) 566 char *p; 567 struct s_addr *a; 568 { 569 regex_t *re; 570 char *end; 571 572 switch (*p) { 573 case '\\': /* Context address */ 574 re = xmalloc(sizeof(regex_t)); 575 a->u.r = re; 576 p = compile_re(p + 1, re, REG_NOSUB); 577 if (p == NULL) 578 err(COMPILE, "unterminated regular expression"); 579 a->type = AT_RE; 580 return (p); 581 case '/': /* Context address */ 582 re = xmalloc(sizeof(regex_t)); 583 a->u.r = re; 584 p = compile_re(p, a->u.r, REG_NOSUB); 585 if (p == NULL) 586 err(COMPILE, "unterminated regular expression"); 587 a->type = AT_RE; 588 return (p); 589 case '$': /* Last line */ 590 a->type = AT_LAST; 591 return (p + 1); 592 /* Line number */ 593 case '0': case '1': case '2': case '3': case '4': 594 case '5': case '6': case '7': case '8': case '9': 595 a->type = AT_LINE; 596 a->u.l = strtol(p, &end, 10); 597 return (end); 598 default: 599 err(COMPILE, "expected context address"); 600 return (NULL); 601 } 602 } 603 604 /* 605 * Return a copy of all the characters up to \n or \0 606 */ 607 static char * 608 duptoeol(s) 609 register char *s; 610 { 611 size_t len; 612 char *start; 613 614 for (start = s; *s != '\0' && *s != '\n'; ++s); 615 *s = '\0'; 616 len = s - start + 1; 617 return (memmove(xmalloc(len), start, len)); 618 } 619 620 /* 621 * Find the label contained in the command l in the command linked list cp. 622 * L is excluded from the search. Return NULL if not found. 623 */ 624 static struct s_command * 625 findlabel(l, cp) 626 struct s_command *l, *cp; 627 { 628 struct s_command *r; 629 630 for (; cp; cp = cp->next) 631 if (cp->code == ':' && cp != l && strcmp(l->t, cp->t) == 0) 632 return (cp); 633 else if (cp->code == '{' && (r = findlabel(l, cp->u.c))) 634 return (r); 635 return (NULL); 636 } 637 638 /* 639 * Convert goto label names to addresses. 640 * Detect duplicate labels. 641 * Set appendnum to the number of a and r commands in the script. 642 * Free the memory used by labels in b and t commands (but not by :) 643 * Root is a pointer to the script linked list; cp points to the 644 * search start. 645 * TODO: Remove } nodes 646 */ 647 static void 648 fixuplabel(root, cp) 649 struct s_command *root, *cp; 650 { 651 struct s_command *cp2; 652 653 for (; cp; cp = cp->next) 654 switch (cp->code) { 655 case 'a': 656 case 'r': 657 appendnum++; 658 break; 659 case 'b': 660 case 't': 661 if (cp->t == NULL) { 662 cp->u.c = NULL; 663 break; 664 } 665 if ((cp2 = findlabel(cp, root)) == NULL) 666 err(COMPILE2, "unspecified label %s", cp->t); 667 free(cp->t); 668 cp->u.c = cp2; 669 break; 670 case '{': 671 fixuplabel(root, cp->u.c); 672 break; 673 case ':': 674 if (findlabel(cp, root)) 675 err(COMPILE2, "duplicate label %s", cp->t); 676 break; 677 } 678 } 679