1 /*- 2 * Copyright (c) 1992 Henry Spencer. 3 * Copyright (c) 1992 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer of the University of Toronto. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)regcomp.c 5.2 (Berkeley) 09/30/92 12 */ 13 14 #if defined(LIBC_SCCS) && !defined(lint) 15 static char sccsid[] = "@(#)regcomp.c 5.2 (Berkeley) 09/30/92"; 16 #endif /* LIBC_SCCS and not lint */ 17 18 #include <sys/types.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include <ctype.h> 22 #include <limits.h> 23 #include <stdlib.h> 24 #include <regex.h> 25 26 #include "utils.h" 27 #include "regex2.h" 28 29 #include "cclass.h" 30 #include "cname.h" 31 32 /* 33 * parse structure, passed up and down to avoid global variables and 34 * other clumsinesses 35 */ 36 struct parse { 37 uchar *next; /* next character in RE */ 38 int error; /* has an error been seen? */ 39 sop *strip; /* malloced strip */ 40 sopno ssize; /* malloced strip size (allocated) */ 41 sopno slen; /* malloced strip length (used) */ 42 int ncsalloc; /* number of csets allocated */ 43 struct re_guts *g; 44 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 45 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 46 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 47 }; 48 49 #include "regcomp.ih" 50 51 static uchar nuls[10]; /* place to point scanner in event of error */ 52 53 /* 54 * macros for use with parse structure 55 * BEWARE: these know that the parse structure is named `p' !!! 56 */ 57 #define PEEK() ((uchar)*p->next) 58 #define PEEK2() ((uchar)*(p->next+1)) 59 #define SEE(c) (PEEK() == (c)) 60 #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 61 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 62 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 63 #define NEXT() (p->next++) 64 #define NEXT2() (p->next += 2) 65 #define NEXTn(n) (p->next += (n)) 66 #define GETNEXT() ((uchar)*p->next++) 67 #define SETERROR(e) seterr(p, (e)) 68 #define REQUIRE(co, e) ((co) || SETERROR(e)) 69 #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 70 #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 71 #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 72 #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 73 #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 74 #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 75 #define BACK(sop, pos) EMIT(sop, HERE()-pos) 76 #define HERE() (p->slen) 77 #define THERE() (p->slen - 1) 78 #define DROP(n) (p->slen -= (n)) 79 80 /* 81 - regcomp - interface for parser and compilation 82 = extern int regcomp(regex_t *preg, const char *pattern, int cflags); 83 = #define REG_EXTENDED 001 84 = #define REG_ICASE 002 85 = #define REG_NOSUB 004 86 = #define REG_NEWLINE 010 87 */ 88 int /* 0 success, otherwise REG_something */ 89 regcomp(preg, pattern, cflags) 90 regex_t *preg; 91 const char *pattern; 92 int cflags; 93 { 94 struct parse pa; 95 register struct re_guts *g; 96 register struct parse *p = &pa; 97 register int i; 98 99 /* do the mallocs early so failure handling is easy */ 100 /* the +NUC here is for the category table */ 101 g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 102 if (g == NULL) 103 return(REG_ESPACE); 104 p->ssize = strlen(pattern)/2*3 + 1; 105 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 106 p->slen = 0; 107 if (p->strip == NULL) { 108 free((char *)g); 109 return(REG_ESPACE); 110 } 111 112 /* set things up */ 113 p->g = g; 114 p->next = (uchar *)pattern; 115 p->error = 0; 116 p->ncsalloc = 0; 117 for (i = 0; i < NPAREN; i++) { 118 p->pbegin[i] = 0; 119 p->pend[i] = 0; 120 } 121 g->csetsize = NUC; 122 g->sets = NULL; 123 g->setbits = NULL; 124 g->ncsets = 0; 125 g->cflags = cflags; 126 g->iflags = 0; 127 g->must = NULL; 128 g->mlen = 0; 129 g->nsub = 0; 130 g->ncategories = 1; /* category 0 is "everything else" */ 131 g->categories = (uchar *)g + sizeof(struct re_guts); 132 (void) memset((char *)g->categories, 0, NUC); 133 g->backrefs = 0; 134 g->nplus = 0; 135 136 /* do it */ 137 EMIT(OEND, 0); 138 g->firststate = THERE(); 139 if (cflags®_EXTENDED) 140 p_ere(p, '\0'); 141 else 142 p_bre(p, '\0', '\0'); 143 EMIT(OEND, 0); 144 g->laststate = THERE(); 145 146 /* tidy up loose ends and fill things in */ 147 categorize(p, g); 148 stripsnug(p, g); 149 findmust(p, g); 150 g->nplus = pluscount(p, g); 151 g->magic = MAGIC2; 152 preg->re_nsub = g->nsub; 153 preg->re_g = g; 154 preg->re_magic = MAGIC1; 155 #ifndef REDEBUG 156 /* not debugging, so can't rely on the assert() in regexec() */ 157 if (g->iflags&BAD) 158 SETERROR(REG_ASSERT); 159 #endif 160 161 /* win or lose, we're done */ 162 if (p->error != 0) /* lose */ 163 regfree(preg); 164 return(p->error); 165 } 166 167 /* 168 - p_ere - ERE parser top level, concatenation and alternation 169 == static void p_ere(register struct parse *p, uchar stop); 170 */ 171 static void 172 p_ere(p, stop) 173 register struct parse *p; 174 uchar stop; /* character this ERE should end at */ 175 { 176 register uchar c; 177 register sopno prevback; 178 register sopno prevfwd; 179 register sopno conc; 180 register int first = 1; /* is this the first alternative? */ 181 182 for (;;) { 183 /* do a bunch of concatenated expressions */ 184 conc = HERE(); 185 while ((c = PEEK()) != '|' && c != stop && c != '\0') 186 p_ere_exp(p); 187 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 188 189 if (!EAT('|')) 190 break; /* NOTE BREAK OUT */ 191 192 if (first) { 193 INSERT(OCH_, conc); /* offset is wrong */ 194 prevfwd = conc; 195 prevback = conc; 196 first = 0; 197 } 198 BACK(OOR1, prevback); 199 prevback = THERE(); 200 FWD(prevfwd); /* fix previous offset */ 201 prevfwd = HERE(); 202 EMIT(OOR2, 0); /* offset is very wrong */ 203 } 204 205 if (!first) { /* tail-end fixups */ 206 FWD(prevfwd); 207 BACK(O_CH, prevback); 208 } 209 210 assert(SEE(stop) || SEE('\0')); 211 } 212 213 /* 214 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 215 == static void p_ere_exp(register struct parse *p); 216 */ 217 static void 218 p_ere_exp(p) 219 register struct parse *p; 220 { 221 register uchar c; 222 register sopno pos; 223 register int count; 224 register int count2; 225 register sopno subno; 226 int wascaret = 0; 227 /* we call { a repetition if followed by a digit */ 228 # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 229 (c1 == '{' && isdigit(c2))) 230 231 c = GETNEXT(); 232 assert(c != '\0'); /* caller should have ensured this */ 233 234 pos = HERE(); 235 switch (c) { 236 case '(': 237 MUSTNOTSEE('\0', REG_EPAREN); 238 p->g->nsub++; 239 subno = p->g->nsub; 240 if (subno < NPAREN) 241 p->pbegin[subno] = HERE(); 242 EMIT(OLPAREN, subno); 243 if (!SEE(')')) 244 p_ere(p, ')'); 245 if (subno < NPAREN) { 246 p->pend[subno] = HERE(); 247 assert(p->pend[subno] != 0); 248 } 249 EMIT(ORPAREN, subno); 250 MUSTEAT(')', REG_EPAREN); 251 break; 252 #ifndef POSIX_MISTAKE 253 case ')': /* happens only if no current unmatched ( */ 254 /* 255 * You may ask, why the ifndef? Because I didn't notice 256 * this until slightly too late for 1003.2, and none of the 257 * other 1003.2 regular-expression reviewers noticed it at 258 * all. So an unmatched ) is legal POSIX, at least until 259 * we can get it fixed. 260 */ 261 SETERROR(REG_EPAREN); 262 break; 263 #endif 264 case '^': 265 EMIT(OBOL, 0); 266 p->g->iflags |= USEBOL; 267 wascaret = 1; 268 break; 269 case '$': 270 EMIT(OEOL, 0); 271 p->g->iflags |= USEEOL; 272 break; 273 case '|': 274 SETERROR(REG_EMPTY); 275 break; 276 case '*': 277 case '+': 278 case '?': 279 SETERROR(REG_BADRPT); 280 break; 281 case '.': 282 if (p->g->cflags®_NEWLINE) 283 nonnewline(p); 284 else 285 EMIT(OANY, 0); 286 break; 287 case '[': 288 p_bracket(p); 289 break; 290 case '\\': 291 c = GETNEXT(); 292 #ifdef xxx 293 if (c == '^' || c == '.' || c == '[' || c == '$' || 294 c == '(' || c == ')' || c == '|' || 295 c == '*' || c == '+' || c == '?' || 296 c == '{' || c == '\\') 297 #else 298 if (c != '\0') 299 #endif 300 ordinary(p, c); 301 else 302 SETERROR(REG_EESCAPE); 303 break; 304 case '{': /* okay as ordinary except if digit follows */ 305 REQUIRE(!isdigit(PEEK()), REG_BADRPT); 306 /* FALLTHROUGH */ 307 default: 308 ordinary(p, c); 309 break; 310 } 311 312 c = PEEK(); 313 if (!ISRPT(c, PEEK2())) 314 return; /* no repetition, we're done */ 315 NEXT(); 316 317 REQUIRE(!wascaret, REG_BADRPT); 318 switch (c) { 319 case '*': /* implemented as +? */ 320 INSERT(OPLUS_, pos); 321 BACK(O_PLUS, pos); 322 INSERT(OQUEST_, pos); 323 BACK(O_QUEST, pos); 324 break; 325 case '+': 326 INSERT(OPLUS_, pos); 327 BACK(O_PLUS, pos); 328 break; 329 case '?': 330 INSERT(OQUEST_, pos); 331 BACK(O_QUEST, pos); 332 break; 333 case '{': 334 count = p_count(p); 335 if (EAT(',')) { 336 if (isdigit(PEEK())) { 337 count2 = p_count(p); 338 REQUIRE(count <= count2, REG_BADBR); 339 } else /* single number with comma */ 340 count2 = INFINITY; 341 } else /* just a single number */ 342 count2 = count; 343 repeat(p, pos, count, count2); 344 if (!EAT('}')) { /* error heuristics */ 345 while ((c = PEEK()) != '\0' && c != '}') 346 NEXT(); 347 if (c == '\0') 348 SETERROR(REG_EBRACE); 349 else 350 SETERROR(REG_BADBR); 351 } 352 break; 353 } 354 355 c = PEEK(); 356 REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 357 } 358 359 /* 360 - p_bre - BRE parser top level, anchoring and concatenation 361 == static void p_bre(register struct parse *p, register uchar end1, \ 362 == register uchar end2); 363 * Giving end1 as '\0' essentially eliminates the end1/end2 check. 364 * 365 * This implementation is a bit of a kludge, in that a trailing $ is first 366 * taken as an ordinary character and then revised to be an anchor. The 367 * only undesirable side effect is that '$' gets included as a character 368 * category in such cases. This is fairly harmless; not worth fixing. 369 * The amount of lookahead needed to avoid this kludge is excessive, 370 * especially since things like "$*" appear to be legal. xxx 371 */ 372 static void 373 p_bre(p, end1, end2) 374 register struct parse *p; 375 register uchar end1; /* first terminating character */ 376 register uchar end2; /* second terminating character */ 377 { 378 register sopno start = HERE(); 379 register int first = 1; /* first subexpression? */ 380 register int wasdollar = 0; 381 382 if (EAT('^')) { 383 EMIT(OBOL, 0); 384 p->g->iflags |= USEBOL; 385 } 386 while (!SEE('\0') && !SEETWO(end1, end2)) { 387 wasdollar = p_simp_re(p, first); 388 first = 0; 389 } 390 if (wasdollar) { /* oops, that was a trailing anchor */ 391 DROP(1); 392 EMIT(OEOL, 0); 393 p->g->iflags |= USEEOL; 394 } 395 396 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 397 } 398 399 /* 400 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 401 == static int p_simp_re(register struct parse *p, int starordinary); 402 */ 403 static int /* was the simple RE an unbackslashed $? */ 404 p_simp_re(p, starordinary) 405 register struct parse *p; 406 int starordinary; /* is a leading * an ordinary character? */ 407 { 408 register int c; 409 register int count; 410 register int count2; 411 register sopno pos; 412 register int i; 413 register sopno subno; 414 # define BACKSL (1<<CHAR_BIT) 415 416 pos = HERE(); /* repetion op, if any, covers from here */ 417 418 c = GETNEXT(); 419 assert(c != '\0'); /* caller should have ensured this */ 420 if (c == '\\') 421 c = BACKSL | PEEK(); 422 switch (c) { 423 case '.': 424 if (p->g->cflags®_NEWLINE) 425 nonnewline(p); 426 else 427 EMIT(OANY, 0); 428 break; 429 case '[': 430 p_bracket(p); 431 break; 432 case BACKSL|'{': 433 SETERROR(REG_BADRPT); 434 break; 435 case BACKSL|'(': 436 NEXT(); 437 p->g->nsub++; 438 subno = p->g->nsub; 439 if (subno < NPAREN) 440 p->pbegin[subno] = HERE(); 441 EMIT(OLPAREN, subno); 442 /* the SEE here is an error heuristic */ 443 if (!SEE('\0') && !SEETWO('\\', ')')) 444 p_bre(p, '\\', ')'); 445 if (subno < NPAREN) { 446 p->pend[subno] = HERE(); 447 assert(p->pend[subno] != 0); 448 } 449 EMIT(ORPAREN, subno); 450 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 451 break; 452 case BACKSL|')': /* should not get here -- must be user */ 453 case BACKSL|'}': 454 SETERROR(REG_EPAREN); 455 break; 456 case BACKSL|'1': 457 case BACKSL|'2': 458 case BACKSL|'3': 459 case BACKSL|'4': 460 case BACKSL|'5': 461 case BACKSL|'6': 462 case BACKSL|'7': 463 case BACKSL|'8': 464 case BACKSL|'9': 465 i = (c&~BACKSL) - '0'; 466 assert(i < NPAREN); 467 if (p->pend[i] != 0) { 468 assert(i <= p->g->nsub); 469 EMIT(OBACK_, i); 470 assert(p->pbegin[i] != 0); 471 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 472 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 473 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 474 EMIT(O_BACK, i); 475 } else 476 SETERROR(REG_ESUBREG); 477 p->g->backrefs = 1; 478 NEXT(); 479 break; 480 case BACKSL|'\0': 481 SETERROR(REG_EESCAPE); 482 break; 483 case '*': 484 REQUIRE(starordinary, REG_BADRPT); 485 /* FALLTHROUGH */ 486 default: 487 if (c & BACKSL) 488 c = GETNEXT(); 489 ordinary(p, (uchar)c); 490 break; 491 } 492 493 if (EAT('*')) { /* implemented as +? */ 494 INSERT(OPLUS_, pos); 495 BACK(O_PLUS, pos); 496 INSERT(OQUEST_, pos); 497 BACK(O_QUEST, pos); 498 } else if (EATTWO('\\', '{')) { 499 count = p_count(p); 500 if (EAT(',')) { 501 if (isdigit(PEEK())) { 502 count2 = p_count(p); 503 REQUIRE(count <= count2, REG_BADBR); 504 } else /* single number with comma */ 505 count2 = INFINITY; 506 } else /* just a single number */ 507 count2 = count; 508 repeat(p, pos, count, count2); 509 if (!EATTWO('\\', '}')) { /* error heuristics */ 510 while (!SEE('\0') && !SEETWO('\\', '}')) 511 NEXT(); 512 if (SEE('\0')) 513 SETERROR(REG_EBRACE); 514 else 515 SETERROR(REG_BADBR); 516 } 517 } else if (c == '$') /* unbackslashed $ not followed by reptn */ 518 return(1); 519 520 return(0); 521 } 522 523 /* 524 - p_count - parse a repetition count 525 == static int p_count(register struct parse *p); 526 */ 527 static int /* the value */ 528 p_count(p) 529 register struct parse *p; 530 { 531 register int count = 0; 532 register int ndigits = 0; 533 534 while (isdigit(PEEK()) && count <= DUPMAX) { 535 count = count*10 + (GETNEXT() - '0'); 536 ndigits++; 537 } 538 539 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 540 return(count); 541 } 542 543 /* 544 - p_bracket - parse a bracketed character list 545 == static void p_bracket(register struct parse *p); 546 * 547 * Note a significant property of this code: if the allocset() did SETERROR, 548 * no set operations are done. 549 */ 550 static void 551 p_bracket(p) 552 register struct parse *p; 553 { 554 register uchar c; 555 register cset *cs = allocset(p); 556 register int invert = 0; 557 558 if (EAT('^')) 559 invert++; /* make note to invert set at end */ 560 if (EAT(']')) 561 CHadd(cs, ']'); 562 while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 563 p_b_term(p, cs); 564 if (EAT('-')) 565 CHadd(cs, '-'); 566 MUSTEAT(']', REG_EBRACK); 567 568 if (invert && p->error == 0) { 569 register int i; 570 571 for (i = p->g->csetsize - 1; i >= 0; i--) 572 if (CHIN(cs, i)) 573 CHsub(cs, i); 574 else 575 CHadd(cs, i); 576 if (p->g->cflags®_NEWLINE) 577 CHsub(cs, '\n'); 578 if (cs->multis != NULL) 579 mcinvert(p, cs); 580 } 581 assert(cs->multis == NULL); /* xxx */ 582 EMIT(OANYOF, freezeset(p, cs)); 583 } 584 585 /* 586 - p_b_term - parse one term of a bracketed character list 587 == static void p_b_term(register struct parse *p, register cset *cs); 588 */ 589 static void 590 p_b_term(p, cs) 591 register struct parse *p; 592 register cset *cs; 593 { 594 register uchar c; 595 register uchar start, finish; 596 register int i; 597 598 /* classify what we've got */ 599 switch (PEEK()) { 600 case '[': 601 c = PEEK2(); 602 break; 603 case '-': 604 SETERROR(REG_ERANGE); 605 return; /* NOTE RETURN */ 606 break; 607 default: 608 c = '\0'; 609 break; 610 } 611 612 switch (c) { 613 case ':': /* character class */ 614 NEXT2(); 615 c = PEEK(); 616 REQUIRE(c != '\0', REG_EBRACK); 617 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 618 p_b_cclass(p, cs); 619 MUSTNOTSEE('\0', REG_EBRACK); 620 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 621 break; 622 case '=': /* equivalence class */ 623 NEXT2(); 624 c = PEEK(); 625 REQUIRE(c != '\0', REG_EBRACK); 626 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 627 p_b_eclass(p, cs); 628 MUSTNOTSEE('\0', REG_EBRACK); 629 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 630 break; 631 default: /* symbol, ordinary character, or range */ 632 /* xxx revision needed for multichar stuff */ 633 start = p_b_symbol(p); 634 if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 635 /* range */ 636 NEXT(); 637 if (EAT('-')) 638 finish = '-'; 639 else 640 finish = p_b_symbol(p); 641 } else 642 finish = start; 643 REQUIRE(start <= finish, REG_ERANGE); 644 for (i = start; i <= finish; i++) { 645 CHadd(cs, i); 646 if ((p->g->cflags®_ICASE) && isalpha(i)) { 647 c = othercase((uchar)i); 648 CHadd(cs, c); 649 } 650 } 651 break; 652 } 653 } 654 655 /* 656 - p_b_cclass - parse a character-class name and deal with it 657 == static void p_b_cclass(register struct parse *p, register cset *cs); 658 */ 659 static void 660 p_b_cclass(p, cs) 661 register struct parse *p; 662 register cset *cs; 663 { 664 register uchar *sb = p->next; 665 register uchar *se = sb; 666 register struct cclass *cp; 667 register int len; 668 register uchar *u; 669 register uchar c; 670 671 while (isalpha(*se)) 672 se++; 673 len = se - sb; 674 NEXTn(len); 675 for (cp = cclasses; cp->name != NULL; cp++) 676 if (strncmp(cp->name, (char *)sb, len) == 0 && 677 cp->name[len] == '\0') 678 break; 679 if (cp->name == NULL) { 680 /* oops, didn't find it */ 681 SETERROR(REG_ECTYPE); 682 return; 683 } 684 685 u = (uchar *)cp->chars; 686 while ((c = *u++) != '\0') 687 CHadd(cs, c); 688 for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 689 MCadd(cs, u); 690 } 691 692 /* 693 - p_b_eclass - parse an equivalence-class name and deal with it 694 == static void p_b_eclass(register struct parse *p, register cset *cs); 695 * 696 * This implementation is incomplete. xxx 697 */ 698 static void 699 p_b_eclass(p, cs) 700 register struct parse *p; 701 register cset *cs; 702 { 703 register uchar c; 704 705 c = p_b_coll_elem(p, '='); 706 CHadd(cs, c); 707 } 708 709 /* 710 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 711 == static uchar p_b_symbol(register struct parse *p); 712 */ 713 static uchar /* value of symbol */ 714 p_b_symbol(p) 715 register struct parse *p; 716 { 717 register uchar value; 718 719 if (!EATTWO('[', '.')) { 720 MUSTNOTSEE('\0', REG_EBRACK); 721 return(GETNEXT()); 722 } 723 724 /* collating symbol */ 725 MUSTNOTSEE('\0', REG_EBRACK); 726 value = p_b_coll_elem(p, '.'); 727 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 728 return(value); 729 } 730 731 /* 732 - p_b_coll_elem - parse a collating-element name and look it up 733 == static uchar p_b_coll_elem(register struct parse *p, uchar endc); 734 */ 735 static uchar /* value of collating element */ 736 p_b_coll_elem(p, endc) 737 register struct parse *p; 738 uchar endc; /* name ended by endc,']' */ 739 { 740 register uchar *sp = p->next; 741 register struct cname *cp; 742 register int len; 743 register uchar c; 744 745 while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 746 NEXT(); 747 if (c == '\0') { 748 SETERROR(REG_EBRACK); 749 return(0); 750 } 751 len = p->next - sp; 752 for (cp = cnames; cp->name != NULL; cp++) 753 if (strncmp(cp->name, (char *)sp, len) == 0 && 754 cp->name[len] == '\0') 755 return(cp->code); /* known name */ 756 if (len == 1) 757 return(*sp); /* single character */ 758 SETERROR(REG_ECOLLATE); /* neither */ 759 return(0); 760 } 761 762 /* 763 - othercase - return the case counterpart of an alphabetic 764 == static uchar othercase(uchar ch); 765 */ 766 static uchar 767 othercase(ch) 768 uchar ch; 769 { 770 assert(isalpha(ch)); 771 if (isupper(ch)) 772 return(tolower(ch)); 773 else if (islower(ch)) 774 return(toupper(ch)); 775 else /* peculiar, but could happen */ 776 return(ch); 777 } 778 779 /* 780 - bothcases - emit a dualcase version of a character 781 == static void bothcases(register struct parse *p, uchar ch); 782 * 783 * Boy, is this implementation ever a kludge... 784 */ 785 static void 786 bothcases(p, ch) 787 register struct parse *p; 788 uchar ch; 789 { 790 register uchar *oldnext; 791 uchar bracket[3]; 792 793 oldnext = p->next; 794 p->next = bracket; 795 bracket[0] = ch; 796 bracket[1] = ']'; 797 bracket[2] = '\0'; 798 p_bracket(p); 799 assert(p->next == bracket+2); 800 p->next = oldnext; 801 } 802 803 /* 804 - ordinary - emit an ordinary character 805 == static void ordinary(register struct parse *p, register uchar ch); 806 */ 807 static void 808 ordinary(p, ch) 809 register struct parse *p; 810 register uchar ch; 811 { 812 register uchar *cap = p->g->categories; 813 814 if ((p->g->cflags®_ICASE) && isalpha(ch)) { 815 bothcases(p, ch); 816 return; 817 } 818 819 EMIT(OCHAR, ch); 820 if (cap[ch] == 0) 821 cap[ch] = p->g->ncategories++; 822 } 823 824 /* 825 - nonnewline - emit REG_NEWLINE version of OANY 826 == static void nonnewline(register struct parse *p); 827 * 828 * Boy, is this implementation ever a kludge... 829 */ 830 static void 831 nonnewline(p) 832 register struct parse *p; 833 { 834 register uchar *oldnext; 835 uchar bracket[4]; 836 837 oldnext = p->next; 838 p->next = bracket; 839 bracket[0] = '^'; 840 bracket[1] = '\n'; 841 bracket[2] = ']'; 842 bracket[3] = '\0'; 843 p_bracket(p); 844 assert(p->next == bracket+3); 845 p->next = oldnext; 846 } 847 848 /* 849 - repeat - generate code for a bounded repetition, recursively if needed 850 == static void repeat(register struct parse *p, sopno start, int from, int to); 851 */ 852 static void 853 repeat(p, start, from, to) 854 register struct parse *p; 855 sopno start; /* operand from here to end of strip */ 856 int from; /* repeated from this number */ 857 int to; /* to this number of times (maybe INFINITY) */ 858 { 859 register sopno finish = HERE(); 860 # define N 2 861 # define INF 3 862 # define REP(f, t) ((f)*8 + (t)) 863 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 864 register sopno copy; 865 866 if (p->error != 0) /* head off possible runaway recursion */ 867 return; 868 869 assert(from <= to); 870 871 switch (REP(MAP(from), MAP(to))) { 872 case REP(0, 0): /* must be user doing this */ 873 DROP(finish-start); /* drop the operand */ 874 break; 875 case REP(0, 1): /* as x{1,1}? */ 876 case REP(0, N): /* as x{1,n}? */ 877 case REP(0, INF): /* as x{1,}? */ 878 INSERT(OQUEST_, start); /* offset is wrong... */ 879 repeat(p, start+1, 1, to); 880 FWD(start); /* ... fix it */ 881 BACK(O_QUEST, start); 882 break; 883 case REP(1, 1): /* trivial case */ 884 /* done */ 885 break; 886 case REP(1, N): /* as x?x{1,n-1} */ 887 INSERT(OQUEST_, start); 888 BACK(O_QUEST, start); 889 copy = dupl(p, start+1, finish+1); 890 assert(copy == finish+2); 891 repeat(p, copy, 1, to-1); 892 break; 893 case REP(1, INF): /* as x+ */ 894 INSERT(OPLUS_, start); 895 BACK(O_PLUS, start); 896 break; 897 case REP(N, N): /* as xx{m-1,n-1} */ 898 copy = dupl(p, start, finish); 899 repeat(p, copy, from-1, to-1); 900 break; 901 case REP(N, INF): /* as xx{n-1,INF} */ 902 copy = dupl(p, start, finish); 903 repeat(p, copy, from-1, to); 904 break; 905 default: /* "can't happen" */ 906 SETERROR(REG_ASSERT); /* just in case */ 907 break; 908 } 909 } 910 911 /* 912 - seterr - set an error condition 913 == static int seterr(register struct parse *p, int e); 914 */ 915 static int /* useless but makes type checking happy */ 916 seterr(p, e) 917 register struct parse *p; 918 int e; 919 { 920 if (p->error == 0) /* keep earliest error condition */ 921 p->error = e; 922 p->next = nuls; /* try to bring things to a halt */ 923 return(0); /* make the return value well-defined */ 924 } 925 926 /* 927 - allocset - allocate a set of characters for [] 928 == static cset *allocset(register struct parse *p); 929 */ 930 static cset * 931 allocset(p) 932 register struct parse *p; 933 { 934 register int no = p->g->ncsets++; 935 register size_t nc; 936 register size_t nbytes; 937 register cset *cs; 938 register size_t css = (size_t)p->g->csetsize; 939 940 if (no >= p->ncsalloc) { /* need another column of space */ 941 p->ncsalloc += CHAR_BIT; 942 nc = p->ncsalloc; 943 assert(nc % CHAR_BIT == 0); 944 nbytes = nc / CHAR_BIT * css; 945 if (p->g->sets == NULL) 946 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 947 else 948 p->g->sets = (cset *)realloc((char *)p->g->sets, 949 nc * sizeof(cset)); 950 if (p->g->setbits == NULL) 951 p->g->setbits = (uchar *)malloc(nbytes); 952 else 953 p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 954 nbytes); 955 if (p->g->sets != NULL && p->g->setbits != NULL) 956 (void) memset((char *)p->g->setbits + (nbytes - css), 957 0, css); 958 else { 959 no = 0; 960 SETERROR(REG_ESPACE); 961 /* caller's responsibility not to do set ops */ 962 } 963 } 964 965 assert(p->g->sets != NULL); /* xxx */ 966 cs = &p->g->sets[no]; 967 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 968 cs->mask = 1 << ((no) % CHAR_BIT); 969 cs->hash = 0; 970 cs->smultis = 0; 971 cs->multis = NULL; 972 973 return(cs); 974 } 975 976 /* 977 - freezeset - final processing on a set of characters 978 == static int freezeset(register struct parse *p, register cset *cs); 979 * 980 * The main task here is merging identical sets. This is usually a waste 981 * of time (although the hash code minimizes the overhead), but can win 982 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 983 * is done using addition rather than xor -- all ASCII [aA] sets xor to 984 * the same value! 985 */ 986 static int /* set number */ 987 freezeset(p, cs) 988 register struct parse *p; 989 register cset *cs; 990 { 991 register uchar h = cs->hash; 992 register int i; 993 register cset *top = &p->g->sets[p->g->ncsets]; 994 register cset *cs2; 995 register size_t css = (size_t)p->g->csetsize; 996 997 /* look for an earlier one which is the same */ 998 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 999 if (cs2->hash == h && cs2 != cs) { 1000 /* maybe */ 1001 for (i = 0; i < css; i++) 1002 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1003 break; /* no */ 1004 if (i == css) 1005 break; /* yes */ 1006 } 1007 1008 if (cs2 < top) { /* found one */ 1009 assert(cs == top-1); 1010 p->g->ncsets--; 1011 for (i = 0; i < css; i++) 1012 CHsub(cs, i); 1013 cs = cs2; 1014 } 1015 1016 return((int)(cs - p->g->sets)); 1017 } 1018 1019 /* 1020 - mcadd - add a collating element to a cset 1021 == static void mcadd(register struct parse *p, register cset *cs, \ 1022 == register uchar *cp); 1023 */ 1024 static void 1025 mcadd(p, cs, cp) 1026 register struct parse *p; 1027 register cset *cs; 1028 register uchar *cp; 1029 { 1030 register size_t oldend = cs->smultis; 1031 1032 cs->smultis += strlen((char *)cp) + 1; 1033 if (cs->multis == NULL) 1034 cs->multis = (uchar *)malloc(cs->smultis); 1035 else 1036 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1037 if (cs->multis == NULL) { 1038 SETERROR(REG_ESPACE); 1039 return; 1040 } 1041 1042 (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 1043 cs->multis[cs->smultis - 1] = '\0'; 1044 } 1045 1046 /* 1047 - mcsub - subtract a collating element from a cset 1048 == static void mcsub(register struct parse *p, register cset *cs, \ 1049 == register uchar *cp); 1050 */ 1051 static void 1052 mcsub(p, cs, cp) 1053 register struct parse *p; 1054 register cset *cs; 1055 register uchar *cp; 1056 { 1057 register uchar *fp = mcfind(cs, cp); 1058 register size_t len = strlen((char *)fp); 1059 1060 assert(p != NULL); 1061 (void) memmove((char *)fp, (char *)(fp + len + 1), 1062 cs->smultis - (fp + len + 1 - cs->multis)); 1063 cs->smultis -= len; 1064 1065 if (cs->smultis == 0) { 1066 free((char *)cs->multis); 1067 cs->multis = NULL; 1068 return; 1069 } 1070 1071 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1072 assert(cs->multis != NULL); 1073 } 1074 1075 /* 1076 - mcin - is a collating element in a cset? 1077 == static int mcin(register struct parse *p, register cset *cs, \ 1078 == register uchar *cp); 1079 */ 1080 static int 1081 mcin(p, cs, cp) 1082 register struct parse *p; 1083 register cset *cs; 1084 register uchar *cp; 1085 { 1086 return(mcfind(cs, cp) != NULL); 1087 } 1088 1089 /* 1090 - mcfind - find a collating element in a cset 1091 == static uchar *mcfind(register cset *cs, register uchar *cp); 1092 */ 1093 static uchar * 1094 mcfind(cs, cp) 1095 register cset *cs; 1096 register uchar *cp; 1097 { 1098 register uchar *p; 1099 1100 if (cs->multis == NULL) 1101 return(NULL); 1102 for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 1103 if (strcmp((char *)cp, (char *)p) == 0) 1104 return(p); 1105 return(NULL); 1106 } 1107 1108 /* 1109 - mcinvert - invert the list of collating elements in a cset 1110 == static void mcinvert(register struct parse *p, register cset *cs); 1111 * 1112 * This would have to know the set of possibilities. Implementation 1113 * is deferred. 1114 */ 1115 static void 1116 mcinvert(p, cs) 1117 register struct parse *p; 1118 register cset *cs; 1119 { 1120 assert(cs->multis == NULL); /* xxx */ 1121 } 1122 1123 /* 1124 - isinsets - is this character in any sets? 1125 == static int isinsets(register struct re_guts *g, uchar c); 1126 */ 1127 static int /* predicate */ 1128 isinsets(g, c) 1129 register struct re_guts *g; 1130 uchar c; 1131 { 1132 register uchar *col; 1133 register int i; 1134 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1135 1136 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1137 if (col[c] != 0) 1138 return(1); 1139 return(0); 1140 } 1141 1142 /* 1143 - samesets - are these two characters in exactly the same sets? 1144 == static int samesets(register struct re_guts *g, uchar c1, uchar c2); 1145 */ 1146 static int /* predicate */ 1147 samesets(g, c1, c2) 1148 register struct re_guts *g; 1149 register uchar c1; 1150 register uchar c2; 1151 { 1152 register uchar *col; 1153 register int i; 1154 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1155 1156 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1157 if (col[c1] != col[c2]) 1158 return(0); 1159 return(1); 1160 } 1161 1162 /* 1163 - categorize - sort out character categories 1164 == static void categorize(struct parse *p, register struct re_guts *g); 1165 */ 1166 static void 1167 categorize(p, g) 1168 struct parse *p; 1169 register struct re_guts *g; 1170 { 1171 register uchar *cats = g->categories; 1172 register unsigned c; 1173 register unsigned c2; 1174 register uchar cat; 1175 1176 /* avoid making error situations worse */ 1177 if (p->error != 0) 1178 return; 1179 1180 for (c = 0; c < g->csetsize; c++) 1181 if (cats[c] == 0 && isinsets(g, c)) { 1182 cat = g->ncategories++; 1183 cats[c] = cat; 1184 for (c2 = c+1; c2 < g->csetsize; c2++) 1185 if (cats[c2] == 0 && samesets(g, c, c2)) 1186 cats[c2] = cat; 1187 } 1188 } 1189 1190 /* 1191 - dupl - emit a duplicate of a bunch of sops 1192 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1193 */ 1194 static sopno /* start of duplicate */ 1195 dupl(p, start, finish) 1196 register struct parse *p; 1197 sopno start; /* from here */ 1198 sopno finish; /* to this less one */ 1199 { 1200 register sopno ret = HERE(); 1201 register sopno len = finish - start; 1202 1203 assert(finish >= start); 1204 if (len == 0) 1205 return(ret); 1206 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1207 assert(p->ssize >= p->slen + len); 1208 (void) memcpy((char *)(p->strip + p->slen), 1209 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1210 p->slen += len; 1211 return(ret); 1212 } 1213 1214 /* 1215 - doemit - emit a strip operator 1216 == static void doemit(register struct parse *p, sop op, size_t opnd); 1217 * 1218 * It might seem better to implement this as a macro with a function as 1219 * hard-case backup, but it's just too big and messy unless there are 1220 * some changes to the data structures. Maybe later. 1221 */ 1222 static void 1223 doemit(p, op, opnd) 1224 register struct parse *p; 1225 sop op; 1226 size_t opnd; 1227 { 1228 /* avoid making error situations worse */ 1229 if (p->error != 0) 1230 return; 1231 1232 /* deal with oversize operands ("can't happen", more or less) */ 1233 assert(opnd < 1<<OPSHIFT); 1234 1235 /* deal with undersized strip */ 1236 if (p->slen >= p->ssize) 1237 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1238 assert(p->slen < p->ssize); 1239 1240 /* finally, it's all reduced to the easy case */ 1241 p->strip[p->slen++] = SOP(op, opnd); 1242 } 1243 1244 /* 1245 - doinsert - insert a sop into the strip 1246 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1247 */ 1248 static void 1249 doinsert(p, op, opnd, pos) 1250 register struct parse *p; 1251 sop op; 1252 size_t opnd; 1253 sopno pos; 1254 { 1255 register sopno sn; 1256 register sop s; 1257 register int i; 1258 1259 /* avoid making error situations worse */ 1260 if (p->error != 0) 1261 return; 1262 1263 sn = HERE(); 1264 EMIT(op, opnd); /* do checks, ensure space */ 1265 assert(HERE() == sn+1); 1266 s = p->strip[sn]; 1267 1268 /* adjust paren pointers */ 1269 assert(pos > 0); 1270 for (i = 1; i < NPAREN; i++) { 1271 if (p->pbegin[i] >= pos) { 1272 p->pbegin[i]++; 1273 } 1274 if (p->pend[i] >= pos) { 1275 p->pend[i]++; 1276 } 1277 } 1278 1279 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1280 (HERE()-pos-1)*sizeof(sop)); 1281 p->strip[pos] = s; 1282 } 1283 1284 /* 1285 - dofwd - complete a forward reference 1286 == static void dofwd(register struct parse *p, sopno pos, sop value); 1287 */ 1288 static void 1289 dofwd(p, pos, value) 1290 register struct parse *p; 1291 register sopno pos; 1292 sop value; 1293 { 1294 /* avoid making error situations worse */ 1295 if (p->error != 0) 1296 return; 1297 1298 assert(value < 1<<OPSHIFT); 1299 p->strip[pos] = OP(p->strip[pos]) | value; 1300 } 1301 1302 /* 1303 - enlarge - enlarge the strip 1304 == static void enlarge(register struct parse *p, sopno size); 1305 */ 1306 static void 1307 enlarge(p, size) 1308 register struct parse *p; 1309 register sopno size; 1310 { 1311 register sop *sp; 1312 1313 if (p->ssize >= size) 1314 return; 1315 1316 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1317 if (sp == NULL) { 1318 SETERROR(REG_ESPACE); 1319 return; 1320 } 1321 p->strip = sp; 1322 p->ssize = size; 1323 } 1324 1325 /* 1326 - stripsnug - compact the strip 1327 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1328 */ 1329 static void 1330 stripsnug(p, g) 1331 register struct parse *p; 1332 register struct re_guts *g; 1333 { 1334 g->nstates = p->slen; 1335 g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 1336 if (g->strip == NULL) { 1337 SETERROR(REG_ESPACE); 1338 g->strip = p->strip; 1339 } 1340 } 1341 1342 /* 1343 - findmust - fill in must and mlen with longest mandatory literal string 1344 == static void findmust(register struct parse *p, register struct re_guts *g); 1345 * 1346 * This algorithm could do fancy things like analyzing the operands of | 1347 * for common subsequences. Someday. This code is simple and finds most 1348 * of the interesting cases. 1349 * 1350 * Note that must and mlen got initialized during setup. 1351 */ 1352 static void 1353 findmust(p, g) 1354 struct parse *p; 1355 register struct re_guts *g; 1356 { 1357 register sop *scan; 1358 sop *start; 1359 register sop *newstart; 1360 register sopno newlen; 1361 register sop s; 1362 register char *cp; 1363 register sopno i; 1364 1365 /* avoid making error situations worse */ 1366 if (p->error != 0) 1367 return; 1368 1369 /* find the longest OCHAR sequence in strip */ 1370 newlen = 0; 1371 scan = g->strip + 1; 1372 do { 1373 s = *scan++; 1374 switch (OP(s)) { 1375 case OCHAR: /* sequence member */ 1376 if (newlen == 0) /* new sequence */ 1377 newstart = scan - 1; 1378 newlen++; 1379 break; 1380 case OPLUS_: /* things that don't break one */ 1381 case OLPAREN: 1382 case ORPAREN: 1383 break; 1384 case OQUEST_: /* things that must be skipped */ 1385 case OCH_: 1386 scan--; 1387 do { 1388 scan += OPND(s); 1389 s = *scan; 1390 /* assert() interferes w debug printouts */ 1391 if (OP(s) != O_QUEST && OP(s) != O_CH && 1392 OP(s) != OOR2) { 1393 g->iflags |= BAD; 1394 return; 1395 } 1396 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1397 /* fallthrough */ 1398 default: /* things that break a sequence */ 1399 if (newlen > g->mlen) { /* ends one */ 1400 start = newstart; 1401 g->mlen = newlen; 1402 } 1403 newlen = 0; 1404 break; 1405 } 1406 } while (OP(s) != OEND); 1407 1408 if (g->mlen == 0) /* there isn't one */ 1409 return; 1410 1411 /* turn it into a character string */ 1412 g->must = malloc((size_t)g->mlen + 1); 1413 if (g->must == NULL) { /* argh; just forget it */ 1414 g->mlen = 0; 1415 return; 1416 } 1417 cp = g->must; 1418 scan = start; 1419 for (i = g->mlen; i > 0; i--) { 1420 while (OP(s = *scan++) != OCHAR) 1421 continue; 1422 *cp++ = OPND(s); 1423 } 1424 *cp++ = '\0'; /* just on general principles */ 1425 } 1426 1427 /* 1428 - pluscount - count + nesting 1429 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1430 */ 1431 static sopno /* nesting depth */ 1432 pluscount(p, g) 1433 struct parse *p; 1434 register struct re_guts *g; 1435 { 1436 register sop *scan; 1437 register sop s; 1438 register sopno plusnest = 0; 1439 register sopno maxnest = 0; 1440 1441 if (p->error != 0) 1442 return(0); /* there may not be an OEND */ 1443 1444 scan = g->strip + 1; 1445 do { 1446 s = *scan++; 1447 switch (OP(s)) { 1448 case OPLUS_: 1449 plusnest++; 1450 break; 1451 case O_PLUS: 1452 if (plusnest > maxnest) 1453 maxnest = plusnest; 1454 plusnest--; 1455 break; 1456 } 1457 } while (OP(s) != OEND); 1458 if (plusnest != 0) 1459 g->iflags |= BAD; 1460 return(maxnest); 1461 } 1462