1 /* $OpenBSD: regcomp.c,v 1.41 2020/12/31 17:24:05 millert Exp $ */ 2 /*- 3 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 35 */ 36 37 #include <sys/types.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <ctype.h> 41 #include <limits.h> 42 #include <stdlib.h> 43 #include <regex.h> 44 45 #include "utils.h" 46 #include "regex2.h" 47 48 #include "cclass.h" 49 #include "cname.h" 50 51 /* 52 * parse structure, passed up and down to avoid global variables and 53 * other clumsinesses 54 */ 55 struct parse { 56 const char *next; /* next character in RE */ 57 const char *end; /* end of string (-> NUL normally) */ 58 int error; /* has an error been seen? */ 59 sop *strip; /* malloced strip */ 60 sopno ssize; /* malloced strip size (allocated) */ 61 sopno slen; /* malloced strip length (used) */ 62 int ncsalloc; /* number of csets allocated */ 63 struct re_guts *g; 64 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 65 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 66 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 67 }; 68 69 static void p_ere(struct parse *, int); 70 static void p_ere_exp(struct parse *); 71 static void p_str(struct parse *); 72 static void p_bre(struct parse *, int, int); 73 static int p_simp_re(struct parse *, int); 74 static int p_count(struct parse *); 75 static void p_bracket(struct parse *); 76 static void p_b_term(struct parse *, cset *); 77 static void p_b_cclass(struct parse *, cset *); 78 static void p_b_eclass(struct parse *, cset *); 79 static char p_b_symbol(struct parse *); 80 static char p_b_coll_elem(struct parse *, int); 81 static char othercase(int); 82 static void bothcases(struct parse *, int); 83 static void ordinary(struct parse *, int); 84 static void backslash(struct parse *, int); 85 static void nonnewline(struct parse *); 86 static void repeat(struct parse *, sopno, int, int); 87 static void seterr(struct parse *, int); 88 static cset *allocset(struct parse *); 89 static void freeset(struct parse *, cset *); 90 static int freezeset(struct parse *, cset *); 91 static int firstch(struct parse *, cset *); 92 static int nch(struct parse *, cset *); 93 static int isinsets(struct re_guts *, int); 94 static int samesets(struct re_guts *, int, int); 95 static sopno dupl(struct parse *, sopno, sopno); 96 static void doemit(struct parse *, sop, size_t); 97 static void doinsert(struct parse *, sop, size_t, sopno); 98 static void dofwd(struct parse *, sopno, sop); 99 static int enlarge(struct parse *, sopno); 100 static void stripsnug(struct parse *, struct re_guts *); 101 static void findmust(struct parse *, struct re_guts *); 102 static sopno pluscount(struct parse *, struct re_guts *); 103 104 static char nuls[10]; /* place to point scanner in event of error */ 105 106 /* 107 * macros for use with parse structure 108 * BEWARE: these know that the parse structure is named `p' !!! 109 */ 110 #define PEEK() (*p->next) 111 #define PEEK2() (*(p->next+1)) 112 #define MORE() (p->end - p->next > 0) 113 #define MORE2() (p->end - p->next > 1) 114 #define SEE(c) (MORE() && PEEK() == (c)) 115 #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b)) 116 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 117 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 118 #define NEXT() (p->next++) 119 #define NEXT2() (p->next += 2) 120 #define NEXTn(n) (p->next += (n)) 121 #define GETNEXT() (*p->next++) 122 #define SETERROR(e) seterr(p, (e)) 123 #define REQUIRE(co, e) do { if (!(co)) SETERROR(e); } while (0) 124 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 125 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 126 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 127 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 128 #define HERE() (p->slen) 129 #define THERE() (p->slen - 1) 130 #define THERETHERE() (p->slen - 2) 131 #define DROP(n) (p->slen -= (n)) 132 133 #ifndef NDEBUG 134 static int never = 0; /* for use in asserts; shuts lint up */ 135 #else 136 #define never 0 /* some <assert.h>s have bugs too */ 137 #endif 138 139 /* 140 - regcomp - interface for parser and compilation 141 */ 142 int /* 0 success, otherwise REG_something */ 143 regcomp(regex_t *preg, const char *pattern, int cflags) 144 { 145 struct parse pa; 146 struct re_guts *g; 147 struct parse *p = &pa; 148 int i; 149 size_t len; 150 #ifdef REDEBUG 151 # define GOODFLAGS(f) (f) 152 #else 153 # define GOODFLAGS(f) ((f)&~REG_DUMP) 154 #endif 155 156 cflags = GOODFLAGS(cflags); 157 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 158 return(REG_INVARG); 159 160 if (cflags®_PEND) { 161 if (preg->re_endp < pattern) 162 return(REG_INVARG); 163 len = preg->re_endp - pattern; 164 } else 165 len = strlen((char *)pattern); 166 167 /* do the mallocs early so failure handling is easy */ 168 g = malloc(sizeof(struct re_guts)); 169 if (g == NULL) 170 return(REG_ESPACE); 171 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 172 p->strip = reallocarray(NULL, p->ssize, sizeof(sop)); 173 p->slen = 0; 174 if (p->strip == NULL) { 175 free(g); 176 return(REG_ESPACE); 177 } 178 179 /* set things up */ 180 p->g = g; 181 p->next = pattern; 182 p->end = p->next + len; 183 p->error = 0; 184 p->ncsalloc = 0; 185 for (i = 0; i < NPAREN; i++) { 186 p->pbegin[i] = 0; 187 p->pend[i] = 0; 188 } 189 g->csetsize = NC; 190 g->sets = NULL; 191 g->setbits = NULL; 192 g->ncsets = 0; 193 g->cflags = cflags; 194 g->iflags = 0; 195 g->nbol = 0; 196 g->neol = 0; 197 g->must = NULL; 198 g->mlen = 0; 199 g->nsub = 0; 200 g->backrefs = 0; 201 202 /* do it */ 203 EMIT(OEND, 0); 204 g->firststate = THERE(); 205 if (cflags®_EXTENDED) 206 p_ere(p, OUT); 207 else if (cflags®_NOSPEC) 208 p_str(p); 209 else 210 p_bre(p, OUT, OUT); 211 EMIT(OEND, 0); 212 g->laststate = THERE(); 213 214 /* tidy up loose ends and fill things in */ 215 stripsnug(p, g); 216 findmust(p, g); 217 g->nplus = pluscount(p, g); 218 g->magic = MAGIC2; 219 preg->re_nsub = g->nsub; 220 preg->re_g = g; 221 preg->re_magic = MAGIC1; 222 #ifndef REDEBUG 223 /* not debugging, so can't rely on the assert() in regexec() */ 224 if (g->iflags&BAD) 225 SETERROR(REG_ASSERT); 226 #endif 227 228 /* win or lose, we're done */ 229 if (p->error != 0) /* lose */ 230 regfree(preg); 231 return(p->error); 232 } 233 234 /* 235 - p_ere - ERE parser top level, concatenation and alternation 236 */ 237 static void 238 p_ere(struct parse *p, int stop) /* character this ERE should end at */ 239 { 240 char c; 241 sopno prevback; 242 sopno prevfwd; 243 sopno conc; 244 int first = 1; /* is this the first alternative? */ 245 246 for (;;) { 247 /* do a bunch of concatenated expressions */ 248 conc = HERE(); 249 while (MORE() && (c = PEEK()) != '|' && c != stop) 250 p_ere_exp(p); 251 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 252 253 if (!EAT('|')) 254 break; /* NOTE BREAK OUT */ 255 256 if (first) { 257 INSERT(OCH_, conc); /* offset is wrong */ 258 prevfwd = conc; 259 prevback = conc; 260 first = 0; 261 } 262 ASTERN(OOR1, prevback); 263 prevback = THERE(); 264 AHEAD(prevfwd); /* fix previous offset */ 265 prevfwd = HERE(); 266 EMIT(OOR2, 0); /* offset is very wrong */ 267 } 268 269 if (!first) { /* tail-end fixups */ 270 AHEAD(prevfwd); 271 ASTERN(O_CH, prevback); 272 } 273 274 assert(!MORE() || SEE(stop)); 275 } 276 277 /* 278 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 279 */ 280 static void 281 p_ere_exp(struct parse *p) 282 { 283 char c; 284 sopno pos; 285 int count; 286 int count2; 287 sopno subno; 288 int wascaret = 0; 289 290 assert(MORE()); /* caller should have ensured this */ 291 c = GETNEXT(); 292 293 pos = HERE(); 294 switch (c) { 295 case '(': 296 REQUIRE(MORE(), REG_EPAREN); 297 p->g->nsub++; 298 subno = p->g->nsub; 299 if (subno < NPAREN) 300 p->pbegin[subno] = HERE(); 301 EMIT(OLPAREN, subno); 302 if (!SEE(')')) 303 p_ere(p, ')'); 304 if (subno < NPAREN) { 305 p->pend[subno] = HERE(); 306 assert(p->pend[subno] != 0); 307 } 308 EMIT(ORPAREN, subno); 309 REQUIRE(MORE() && GETNEXT() == ')', REG_EPAREN); 310 break; 311 case '^': 312 EMIT(OBOL, 0); 313 p->g->iflags |= USEBOL; 314 p->g->nbol++; 315 wascaret = 1; 316 break; 317 case '$': 318 EMIT(OEOL, 0); 319 p->g->iflags |= USEEOL; 320 p->g->neol++; 321 break; 322 case '|': 323 SETERROR(REG_EMPTY); 324 break; 325 case '*': 326 case '+': 327 case '?': 328 SETERROR(REG_BADRPT); 329 break; 330 case '.': 331 if (p->g->cflags®_NEWLINE) 332 nonnewline(p); 333 else 334 EMIT(OANY, 0); 335 break; 336 case '[': 337 p_bracket(p); 338 break; 339 case '\\': 340 REQUIRE(MORE(), REG_EESCAPE); 341 c = GETNEXT(); 342 backslash(p, c); 343 break; 344 case '{': /* okay as ordinary except if digit follows */ 345 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 346 /* FALLTHROUGH */ 347 default: 348 if (p->error != 0) 349 return; 350 ordinary(p, c); 351 break; 352 } 353 354 if (!MORE()) 355 return; 356 c = PEEK(); 357 /* we call { a repetition if followed by a digit */ 358 if (!( c == '*' || c == '+' || c == '?' || 359 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 360 return; /* no repetition, we're done */ 361 NEXT(); 362 363 REQUIRE(!wascaret, REG_BADRPT); 364 switch (c) { 365 case '*': /* implemented as +? */ 366 /* this case does not require the (y|) trick, noKLUDGE */ 367 INSERT(OPLUS_, pos); 368 ASTERN(O_PLUS, pos); 369 INSERT(OQUEST_, pos); 370 ASTERN(O_QUEST, pos); 371 break; 372 case '+': 373 INSERT(OPLUS_, pos); 374 ASTERN(O_PLUS, pos); 375 break; 376 case '?': 377 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 378 INSERT(OCH_, pos); /* offset slightly wrong */ 379 ASTERN(OOR1, pos); /* this one's right */ 380 AHEAD(pos); /* fix the OCH_ */ 381 EMIT(OOR2, 0); /* offset very wrong... */ 382 AHEAD(THERE()); /* ...so fix it */ 383 ASTERN(O_CH, THERETHERE()); 384 break; 385 case '{': 386 count = p_count(p); 387 if (EAT(',')) { 388 if (isdigit((uch)PEEK())) { 389 count2 = p_count(p); 390 REQUIRE(count <= count2, REG_BADBR); 391 } else /* single number with comma */ 392 count2 = INFINITY; 393 } else /* just a single number */ 394 count2 = count; 395 repeat(p, pos, count, count2); 396 if (!EAT('}')) { /* error heuristics */ 397 while (MORE() && PEEK() != '}') 398 NEXT(); 399 REQUIRE(MORE(), REG_EBRACE); 400 SETERROR(REG_BADBR); 401 } 402 break; 403 } 404 405 if (!MORE()) 406 return; 407 c = PEEK(); 408 if (!( c == '*' || c == '+' || c == '?' || 409 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 410 return; 411 SETERROR(REG_BADRPT); 412 } 413 414 /* 415 - p_str - string (no metacharacters) "parser" 416 */ 417 static void 418 p_str(struct parse *p) 419 { 420 REQUIRE(MORE(), REG_EMPTY); 421 while (MORE()) 422 ordinary(p, GETNEXT()); 423 } 424 425 /* 426 - p_bre - BRE parser top level, anchoring and concatenation 427 * Giving end1 as OUT essentially eliminates the end1/end2 check. 428 * 429 * This implementation is a bit of a kludge, in that a trailing $ is first 430 * taken as an ordinary character and then revised to be an anchor. The 431 * only undesirable side effect is that '$' gets included as a character 432 * category in such cases. This is fairly harmless; not worth fixing. 433 * The amount of lookahead needed to avoid this kludge is excessive. 434 */ 435 static void 436 p_bre(struct parse *p, 437 int end1, /* first terminating character */ 438 int end2) /* second terminating character */ 439 { 440 sopno start = HERE(); 441 int first = 1; /* first subexpression? */ 442 int wasdollar = 0; 443 444 if (EAT('^')) { 445 EMIT(OBOL, 0); 446 p->g->iflags |= USEBOL; 447 p->g->nbol++; 448 } 449 while (MORE() && !SEETWO(end1, end2)) { 450 wasdollar = p_simp_re(p, first); 451 first = 0; 452 } 453 if (wasdollar) { /* oops, that was a trailing anchor */ 454 DROP(1); 455 EMIT(OEOL, 0); 456 p->g->iflags |= USEEOL; 457 p->g->neol++; 458 } 459 460 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 461 } 462 463 /* 464 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 465 */ 466 static int /* was the simple RE an unbackslashed $? */ 467 p_simp_re(struct parse *p, 468 int starordinary) /* is a leading * an ordinary character? */ 469 { 470 int c; 471 int count; 472 int count2; 473 sopno pos; 474 int i; 475 sopno subno; 476 # define BACKSL (1<<CHAR_BIT) 477 478 pos = HERE(); /* repetion op, if any, covers from here */ 479 480 assert(MORE()); /* caller should have ensured this */ 481 c = GETNEXT(); 482 if (c == '\\') { 483 REQUIRE(MORE(), REG_EESCAPE); 484 c = BACKSL | GETNEXT(); 485 } 486 switch (c) { 487 case '.': 488 if (p->g->cflags®_NEWLINE) 489 nonnewline(p); 490 else 491 EMIT(OANY, 0); 492 break; 493 case '[': 494 p_bracket(p); 495 break; 496 case BACKSL|'<': 497 EMIT(OBOW, 0); 498 break; 499 case BACKSL|'>': 500 EMIT(OEOW, 0); 501 break; 502 case BACKSL|'{': 503 SETERROR(REG_BADRPT); 504 break; 505 case BACKSL|'(': 506 p->g->nsub++; 507 subno = p->g->nsub; 508 if (subno < NPAREN) 509 p->pbegin[subno] = HERE(); 510 EMIT(OLPAREN, subno); 511 /* the MORE here is an error heuristic */ 512 if (MORE() && !SEETWO('\\', ')')) 513 p_bre(p, '\\', ')'); 514 if (subno < NPAREN) { 515 p->pend[subno] = HERE(); 516 assert(p->pend[subno] != 0); 517 } 518 EMIT(ORPAREN, subno); 519 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 520 break; 521 case BACKSL|')': /* should not get here -- must be user */ 522 case BACKSL|'}': 523 SETERROR(REG_EPAREN); 524 break; 525 case BACKSL|'1': 526 case BACKSL|'2': 527 case BACKSL|'3': 528 case BACKSL|'4': 529 case BACKSL|'5': 530 case BACKSL|'6': 531 case BACKSL|'7': 532 case BACKSL|'8': 533 case BACKSL|'9': 534 i = (c&~BACKSL) - '0'; 535 assert(i < NPAREN); 536 if (p->pend[i] != 0) { 537 assert(i <= p->g->nsub); 538 EMIT(OBACK_, i); 539 assert(p->pbegin[i] != 0); 540 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 541 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 542 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 543 EMIT(O_BACK, i); 544 } else 545 SETERROR(REG_ESUBREG); 546 p->g->backrefs = 1; 547 break; 548 case '*': 549 REQUIRE(starordinary, REG_BADRPT); 550 /* FALLTHROUGH */ 551 default: 552 if (p->error != 0) 553 return(0); /* Definitely not $... */ 554 ordinary(p, (char)c); 555 break; 556 } 557 558 if (EAT('*')) { /* implemented as +? */ 559 /* this case does not require the (y|) trick, noKLUDGE */ 560 INSERT(OPLUS_, pos); 561 ASTERN(O_PLUS, pos); 562 INSERT(OQUEST_, pos); 563 ASTERN(O_QUEST, pos); 564 } else if (EATTWO('\\', '{')) { 565 count = p_count(p); 566 if (EAT(',')) { 567 if (MORE() && isdigit((uch)PEEK())) { 568 count2 = p_count(p); 569 REQUIRE(count <= count2, REG_BADBR); 570 } else /* single number with comma */ 571 count2 = INFINITY; 572 } else /* just a single number */ 573 count2 = count; 574 repeat(p, pos, count, count2); 575 if (!EATTWO('\\', '}')) { /* error heuristics */ 576 while (MORE() && !SEETWO('\\', '}')) 577 NEXT(); 578 REQUIRE(MORE(), REG_EBRACE); 579 SETERROR(REG_BADBR); 580 } 581 } else if (c == '$') /* $ (but not \$) ends it */ 582 return(1); 583 584 return(0); 585 } 586 587 /* 588 - p_count - parse a repetition count 589 */ 590 static int /* the value */ 591 p_count(struct parse *p) 592 { 593 int count = 0; 594 int ndigits = 0; 595 596 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 597 count = count*10 + (GETNEXT() - '0'); 598 ndigits++; 599 } 600 601 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 602 return(count); 603 } 604 605 /* 606 - p_bracket - parse a bracketed character list 607 * 608 * Note a significant property of this code: if the allocset() did SETERROR, 609 * no set operations are done. 610 */ 611 static void 612 p_bracket(struct parse *p) 613 { 614 cset *cs; 615 int invert = 0; 616 617 /* Dept of Truly Sickening Special-Case Kludges */ 618 if (p->end - p->next > 5) { 619 if (strncmp(p->next, "[:<:]]", 6) == 0) { 620 EMIT(OBOW, 0); 621 NEXTn(6); 622 return; 623 } 624 if (strncmp(p->next, "[:>:]]", 6) == 0) { 625 EMIT(OEOW, 0); 626 NEXTn(6); 627 return; 628 } 629 } 630 631 if ((cs = allocset(p)) == NULL) { 632 /* allocset did set error status in p */ 633 return; 634 } 635 636 if (EAT('^')) 637 invert++; /* make note to invert set at end */ 638 if (EAT(']')) 639 CHadd(cs, ']'); 640 else if (EAT('-')) 641 CHadd(cs, '-'); 642 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 643 p_b_term(p, cs); 644 if (EAT('-')) 645 CHadd(cs, '-'); 646 REQUIRE(MORE() && GETNEXT() == ']', REG_EBRACK); 647 648 if (p->error != 0) { /* don't mess things up further */ 649 freeset(p, cs); 650 return; 651 } 652 653 if (p->g->cflags®_ICASE) { 654 int i; 655 int ci; 656 657 for (i = p->g->csetsize - 1; i >= 0; i--) 658 if (CHIN(cs, i) && isalpha(i)) { 659 ci = othercase(i); 660 if (ci != i) 661 CHadd(cs, ci); 662 } 663 } 664 if (invert) { 665 int i; 666 667 for (i = p->g->csetsize - 1; i >= 0; i--) 668 if (CHIN(cs, i)) 669 CHsub(cs, i); 670 else 671 CHadd(cs, i); 672 if (p->g->cflags®_NEWLINE) 673 CHsub(cs, '\n'); 674 } 675 676 if (nch(p, cs) == 1) { /* optimize singleton sets */ 677 ordinary(p, firstch(p, cs)); 678 freeset(p, cs); 679 } else 680 EMIT(OANYOF, freezeset(p, cs)); 681 } 682 683 /* 684 - p_b_term - parse one term of a bracketed character list 685 */ 686 static void 687 p_b_term(struct parse *p, cset *cs) 688 { 689 char c; 690 char start, finish; 691 int i; 692 693 /* classify what we've got */ 694 switch ((MORE()) ? PEEK() : '\0') { 695 case '[': 696 c = (MORE2()) ? PEEK2() : '\0'; 697 break; 698 case '-': 699 SETERROR(REG_ERANGE); 700 return; /* NOTE RETURN */ 701 break; 702 default: 703 c = '\0'; 704 break; 705 } 706 707 switch (c) { 708 case ':': /* character class */ 709 NEXT2(); 710 REQUIRE(MORE(), REG_EBRACK); 711 c = PEEK(); 712 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 713 p_b_cclass(p, cs); 714 REQUIRE(MORE(), REG_EBRACK); 715 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 716 break; 717 case '=': /* equivalence class */ 718 NEXT2(); 719 REQUIRE(MORE(), REG_EBRACK); 720 c = PEEK(); 721 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 722 p_b_eclass(p, cs); 723 REQUIRE(MORE(), REG_EBRACK); 724 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 725 break; 726 default: /* symbol, ordinary character, or range */ 727 /* xxx revision needed for multichar stuff */ 728 start = p_b_symbol(p); 729 if (SEE('-') && MORE2() && PEEK2() != ']') { 730 /* range */ 731 NEXT(); 732 if (EAT('-')) 733 finish = '-'; 734 else 735 finish = p_b_symbol(p); 736 } else 737 finish = start; 738 /* xxx what about signed chars here... */ 739 REQUIRE(start <= finish, REG_ERANGE); 740 for (i = start; i <= finish; i++) 741 CHadd(cs, i); 742 break; 743 } 744 } 745 746 /* 747 - p_b_cclass - parse a character-class name and deal with it 748 */ 749 static void 750 p_b_cclass(struct parse *p, cset *cs) 751 { 752 const char *sp = p->next; 753 const struct cclass *cp; 754 size_t len; 755 const char *u; 756 char c; 757 758 while (MORE() && isalpha((uch)PEEK())) 759 NEXT(); 760 len = p->next - sp; 761 for (cp = cclasses; cp->name != NULL; cp++) 762 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 763 break; 764 if (cp->name == NULL) { 765 /* oops, didn't find it */ 766 SETERROR(REG_ECTYPE); 767 return; 768 } 769 770 u = cp->chars; 771 while ((c = *u++) != '\0') 772 CHadd(cs, c); 773 } 774 775 /* 776 - p_b_eclass - parse an equivalence-class name and deal with it 777 * 778 * This implementation is incomplete. xxx 779 */ 780 static void 781 p_b_eclass(struct parse *p, cset *cs) 782 { 783 char c; 784 785 c = p_b_coll_elem(p, '='); 786 CHadd(cs, c); 787 } 788 789 /* 790 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 791 */ 792 static char /* value of symbol */ 793 p_b_symbol(struct parse *p) 794 { 795 char value; 796 797 REQUIRE(MORE(), REG_EBRACK); 798 if (!EATTWO('[', '.')) 799 return(GETNEXT()); 800 801 /* collating symbol */ 802 value = p_b_coll_elem(p, '.'); 803 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 804 return(value); 805 } 806 807 /* 808 - p_b_coll_elem - parse a collating-element name and look it up 809 */ 810 static char /* value of collating element */ 811 p_b_coll_elem(struct parse *p, 812 int endc) /* name ended by endc,']' */ 813 { 814 const char *sp = p->next; 815 const struct cname *cp; 816 size_t len; 817 818 while (MORE() && !SEETWO(endc, ']')) 819 NEXT(); 820 if (!MORE()) { 821 SETERROR(REG_EBRACK); 822 return(0); 823 } 824 len = p->next - sp; 825 for (cp = cnames; cp->name != NULL; cp++) 826 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 827 return(cp->code); /* known name */ 828 if (len == 1) 829 return(*sp); /* single character */ 830 SETERROR(REG_ECOLLATE); /* neither */ 831 return(0); 832 } 833 834 /* 835 - othercase - return the case counterpart of an alphabetic 836 */ 837 static char /* if no counterpart, return ch */ 838 othercase(int ch) 839 { 840 ch = (uch)ch; 841 assert(isalpha(ch)); 842 if (isupper(ch)) 843 return ((uch)tolower(ch)); 844 else if (islower(ch)) 845 return ((uch)toupper(ch)); 846 else /* peculiar, but could happen */ 847 return(ch); 848 } 849 850 /* 851 - bothcases - emit a dualcase version of a two-case character 852 * 853 * Boy, is this implementation ever a kludge... 854 */ 855 static void 856 bothcases(struct parse *p, int ch) 857 { 858 const char *oldnext = p->next; 859 const char *oldend = p->end; 860 char bracket[3]; 861 862 ch = (uch)ch; 863 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 864 p->next = bracket; 865 p->end = bracket+2; 866 bracket[0] = ch; 867 bracket[1] = ']'; 868 bracket[2] = '\0'; 869 p_bracket(p); 870 assert(p->next == bracket+2); 871 p->next = oldnext; 872 p->end = oldend; 873 } 874 875 /* 876 - ordinary - emit an ordinary character 877 */ 878 static void 879 ordinary(struct parse *p, int ch) 880 { 881 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 882 bothcases(p, ch); 883 else 884 EMIT(OCHAR, (uch)ch); 885 } 886 887 /* 888 * do something magic with this character, but only if it's extra magic 889 */ 890 static void 891 backslash(struct parse *p, int ch) 892 { 893 switch (ch) { 894 case '<': 895 EMIT(OBOW, 0); 896 break; 897 case '>': 898 EMIT(OEOW, 0); 899 break; 900 default: 901 ordinary(p, ch); 902 break; 903 } 904 } 905 906 /* 907 - nonnewline - emit REG_NEWLINE version of OANY 908 * 909 * Boy, is this implementation ever a kludge... 910 */ 911 static void 912 nonnewline(struct parse *p) 913 { 914 const char *oldnext = p->next; 915 const char *oldend = p->end; 916 static const char bracket[4] = { '^', '\n', ']', '\0' }; 917 918 p->next = bracket; 919 p->end = bracket+3; 920 p_bracket(p); 921 assert(p->next == bracket+3); 922 p->next = oldnext; 923 p->end = oldend; 924 } 925 926 /* 927 - repeat - generate code for a bounded repetition, recursively if needed 928 */ 929 static void 930 repeat(struct parse *p, 931 sopno start, /* operand from here to end of strip */ 932 int from, /* repeated from this number */ 933 int to) /* to this number of times (maybe INFINITY) */ 934 { 935 sopno finish = HERE(); 936 # define N 2 937 # define INF 3 938 # define REP(f, t) ((f)*8 + (t)) 939 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 940 sopno copy; 941 942 if (p->error != 0) /* head off possible runaway recursion */ 943 return; 944 945 assert(from <= to); 946 947 switch (REP(MAP(from), MAP(to))) { 948 case REP(0, 0): /* must be user doing this */ 949 DROP(finish-start); /* drop the operand */ 950 break; 951 case REP(0, 1): /* as x{1,1}? */ 952 case REP(0, N): /* as x{1,n}? */ 953 case REP(0, INF): /* as x{1,}? */ 954 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 955 INSERT(OCH_, start); /* offset is wrong... */ 956 repeat(p, start+1, 1, to); 957 ASTERN(OOR1, start); 958 AHEAD(start); /* ... fix it */ 959 EMIT(OOR2, 0); 960 AHEAD(THERE()); 961 ASTERN(O_CH, THERETHERE()); 962 break; 963 case REP(1, 1): /* trivial case */ 964 /* done */ 965 break; 966 case REP(1, N): /* as x?x{1,n-1} */ 967 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 968 INSERT(OCH_, start); 969 ASTERN(OOR1, start); 970 AHEAD(start); 971 EMIT(OOR2, 0); /* offset very wrong... */ 972 AHEAD(THERE()); /* ...so fix it */ 973 ASTERN(O_CH, THERETHERE()); 974 copy = dupl(p, start+1, finish+1); 975 assert(copy == finish+4); 976 repeat(p, copy, 1, to-1); 977 break; 978 case REP(1, INF): /* as x+ */ 979 INSERT(OPLUS_, start); 980 ASTERN(O_PLUS, start); 981 break; 982 case REP(N, N): /* as xx{m-1,n-1} */ 983 copy = dupl(p, start, finish); 984 repeat(p, copy, from-1, to-1); 985 break; 986 case REP(N, INF): /* as xx{n-1,INF} */ 987 copy = dupl(p, start, finish); 988 repeat(p, copy, from-1, to); 989 break; 990 default: /* "can't happen" */ 991 SETERROR(REG_ASSERT); /* just in case */ 992 break; 993 } 994 } 995 996 /* 997 - seterr - set an error condition 998 */ 999 static void 1000 seterr(struct parse *p, int e) 1001 { 1002 if (p->error == 0) /* keep earliest error condition */ 1003 p->error = e; 1004 p->next = nuls; /* try to bring things to a halt */ 1005 p->end = nuls; 1006 } 1007 1008 /* 1009 - allocset - allocate a set of characters for [] 1010 */ 1011 static cset * 1012 allocset(struct parse *p) 1013 { 1014 int no = p->g->ncsets++; 1015 size_t nc; 1016 size_t nbytes; 1017 cset *cs; 1018 size_t css = (size_t)p->g->csetsize; 1019 int i; 1020 1021 if (no >= p->ncsalloc) { /* need another column of space */ 1022 void *ptr; 1023 1024 p->ncsalloc += CHAR_BIT; 1025 nc = p->ncsalloc; 1026 assert(nc % CHAR_BIT == 0); 1027 1028 ptr = reallocarray(p->g->sets, nc, sizeof(cset)); 1029 if (ptr == NULL) 1030 goto nomem; 1031 p->g->sets = ptr; 1032 1033 ptr = reallocarray(p->g->setbits, nc / CHAR_BIT, css); 1034 if (ptr == NULL) 1035 goto nomem; 1036 nbytes = (nc / CHAR_BIT) * css; 1037 p->g->setbits = ptr; 1038 1039 for (i = 0; i < no; i++) 1040 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1041 1042 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); 1043 } 1044 /* XXX should not happen */ 1045 if (p->g->sets == NULL || p->g->setbits == NULL) 1046 goto nomem; 1047 1048 cs = &p->g->sets[no]; 1049 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1050 cs->mask = 1 << ((no) % CHAR_BIT); 1051 cs->hash = 0; 1052 1053 return(cs); 1054 nomem: 1055 free(p->g->sets); 1056 p->g->sets = NULL; 1057 free(p->g->setbits); 1058 p->g->setbits = NULL; 1059 1060 SETERROR(REG_ESPACE); 1061 /* caller's responsibility not to do set ops */ 1062 return(NULL); 1063 } 1064 1065 /* 1066 - freeset - free a now-unused set 1067 */ 1068 static void 1069 freeset(struct parse *p, cset *cs) 1070 { 1071 int i; 1072 cset *top = &p->g->sets[p->g->ncsets]; 1073 size_t css = (size_t)p->g->csetsize; 1074 1075 for (i = 0; i < css; i++) 1076 CHsub(cs, i); 1077 if (cs == top-1) /* recover only the easy case */ 1078 p->g->ncsets--; 1079 } 1080 1081 /* 1082 - freezeset - final processing on a set of characters 1083 * 1084 * The main task here is merging identical sets. This is usually a waste 1085 * of time (although the hash code minimizes the overhead), but can win 1086 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1087 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1088 * the same value! 1089 */ 1090 static int /* set number */ 1091 freezeset(struct parse *p, cset *cs) 1092 { 1093 uch h = cs->hash; 1094 int i; 1095 cset *top = &p->g->sets[p->g->ncsets]; 1096 cset *cs2; 1097 size_t css = (size_t)p->g->csetsize; 1098 1099 /* look for an earlier one which is the same */ 1100 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1101 if (cs2->hash == h && cs2 != cs) { 1102 /* maybe */ 1103 for (i = 0; i < css; i++) 1104 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1105 break; /* no */ 1106 if (i == css) 1107 break; /* yes */ 1108 } 1109 1110 if (cs2 < top) { /* found one */ 1111 freeset(p, cs); 1112 cs = cs2; 1113 } 1114 1115 return((int)(cs - p->g->sets)); 1116 } 1117 1118 /* 1119 - firstch - return first character in a set (which must have at least one) 1120 */ 1121 static int /* character; there is no "none" value */ 1122 firstch(struct parse *p, cset *cs) 1123 { 1124 int i; 1125 size_t css = (size_t)p->g->csetsize; 1126 1127 for (i = 0; i < css; i++) 1128 if (CHIN(cs, i)) 1129 return((char)i); 1130 assert(never); 1131 return(0); /* arbitrary */ 1132 } 1133 1134 /* 1135 - nch - number of characters in a set 1136 */ 1137 static int 1138 nch(struct parse *p, cset *cs) 1139 { 1140 int i; 1141 size_t css = (size_t)p->g->csetsize; 1142 int n = 0; 1143 1144 for (i = 0; i < css; i++) 1145 if (CHIN(cs, i)) 1146 n++; 1147 return(n); 1148 } 1149 1150 /* 1151 - isinsets - is this character in any sets? 1152 */ 1153 static int /* predicate */ 1154 isinsets(struct re_guts *g, int c) 1155 { 1156 uch *col; 1157 int i; 1158 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1159 unsigned uc = (uch)c; 1160 1161 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1162 if (col[uc] != 0) 1163 return(1); 1164 return(0); 1165 } 1166 1167 /* 1168 - samesets - are these two characters in exactly the same sets? 1169 */ 1170 static int /* predicate */ 1171 samesets(struct re_guts *g, int c1, int c2) 1172 { 1173 uch *col; 1174 int i; 1175 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1176 unsigned uc1 = (uch)c1; 1177 unsigned uc2 = (uch)c2; 1178 1179 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1180 if (col[uc1] != col[uc2]) 1181 return(0); 1182 return(1); 1183 } 1184 1185 /* 1186 - dupl - emit a duplicate of a bunch of sops 1187 */ 1188 static sopno /* start of duplicate */ 1189 dupl(struct parse *p, 1190 sopno start, /* from here */ 1191 sopno finish) /* to this less one */ 1192 { 1193 sopno ret = HERE(); 1194 sopno len = finish - start; 1195 1196 assert(finish >= start); 1197 if (len == 0) 1198 return(ret); 1199 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1200 return(ret); 1201 (void) memcpy(p->strip + p->slen, p->strip + start, len * sizeof(sop)); 1202 p->slen += len; 1203 return(ret); 1204 } 1205 1206 /* 1207 - doemit - emit a strip operator 1208 * 1209 * It might seem better to implement this as a macro with a function as 1210 * hard-case backup, but it's just too big and messy unless there are 1211 * some changes to the data structures. Maybe later. 1212 */ 1213 static void 1214 doemit(struct parse *p, sop op, size_t opnd) 1215 { 1216 /* avoid making error situations worse */ 1217 if (p->error != 0) 1218 return; 1219 1220 /* deal with oversize operands ("can't happen", more or less) */ 1221 assert(opnd < 1<<OPSHIFT); 1222 1223 /* deal with undersized strip */ 1224 if (p->slen >= p->ssize) 1225 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1226 return; 1227 1228 /* finally, it's all reduced to the easy case */ 1229 p->strip[p->slen++] = SOP(op, opnd); 1230 } 1231 1232 /* 1233 - doinsert - insert a sop into the strip 1234 */ 1235 static void 1236 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1237 { 1238 sopno sn; 1239 sop s; 1240 int i; 1241 1242 /* avoid making error situations worse */ 1243 if (p->error != 0) 1244 return; 1245 1246 sn = HERE(); 1247 EMIT(op, opnd); /* do checks, ensure space */ 1248 assert(HERE() == sn+1); 1249 s = p->strip[sn]; 1250 1251 /* adjust paren pointers */ 1252 assert(pos > 0); 1253 for (i = 1; i < NPAREN; i++) { 1254 if (p->pbegin[i] >= pos) { 1255 p->pbegin[i]++; 1256 } 1257 if (p->pend[i] >= pos) { 1258 p->pend[i]++; 1259 } 1260 } 1261 1262 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1263 (HERE()-pos-1)*sizeof(sop)); 1264 p->strip[pos] = s; 1265 } 1266 1267 /* 1268 - dofwd - complete a forward reference 1269 */ 1270 static void 1271 dofwd(struct parse *p, sopno pos, sop value) 1272 { 1273 /* avoid making error situations worse */ 1274 if (p->error != 0) 1275 return; 1276 1277 assert(value < 1<<OPSHIFT); 1278 p->strip[pos] = OP(p->strip[pos]) | value; 1279 } 1280 1281 /* 1282 - enlarge - enlarge the strip 1283 */ 1284 static int 1285 enlarge(struct parse *p, sopno size) 1286 { 1287 sop *sp; 1288 1289 if (p->ssize >= size) 1290 return 1; 1291 1292 sp = reallocarray(p->strip, size, sizeof(sop)); 1293 if (sp == NULL) { 1294 SETERROR(REG_ESPACE); 1295 return 0; 1296 } 1297 p->strip = sp; 1298 p->ssize = size; 1299 return 1; 1300 } 1301 1302 /* 1303 - stripsnug - compact the strip 1304 */ 1305 static void 1306 stripsnug(struct parse *p, struct re_guts *g) 1307 { 1308 g->nstates = p->slen; 1309 g->strip = reallocarray(p->strip, p->slen, sizeof(sop)); 1310 if (g->strip == NULL) { 1311 SETERROR(REG_ESPACE); 1312 g->strip = p->strip; 1313 } 1314 } 1315 1316 /* 1317 - findmust - fill in must and mlen with longest mandatory literal string 1318 * 1319 * This algorithm could do fancy things like analyzing the operands of | 1320 * for common subsequences. Someday. This code is simple and finds most 1321 * of the interesting cases. 1322 * 1323 * Note that must and mlen got initialized during setup. 1324 */ 1325 static void 1326 findmust(struct parse *p, struct re_guts *g) 1327 { 1328 sop *scan; 1329 sop *start; /* start initialized in the default case, after that */ 1330 sop *newstart; /* newstart was initialized in the OCHAR case */ 1331 sopno newlen; 1332 sop s; 1333 char *cp; 1334 sopno i; 1335 1336 /* avoid making error situations worse */ 1337 if (p->error != 0) 1338 return; 1339 1340 /* find the longest OCHAR sequence in strip */ 1341 newlen = 0; 1342 scan = g->strip + 1; 1343 do { 1344 s = *scan++; 1345 switch (OP(s)) { 1346 case OCHAR: /* sequence member */ 1347 if (newlen == 0) /* new sequence */ 1348 newstart = scan - 1; 1349 newlen++; 1350 break; 1351 case OPLUS_: /* things that don't break one */ 1352 case OLPAREN: 1353 case ORPAREN: 1354 break; 1355 case OQUEST_: /* things that must be skipped */ 1356 case OCH_: 1357 scan--; 1358 do { 1359 scan += OPND(s); 1360 s = *scan; 1361 /* assert() interferes w debug printouts */ 1362 if (OP(s) != O_QUEST && OP(s) != O_CH && 1363 OP(s) != OOR2) { 1364 g->iflags |= BAD; 1365 return; 1366 } 1367 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1368 /* fallthrough */ 1369 default: /* things that break a sequence */ 1370 if (newlen > g->mlen) { /* ends one */ 1371 start = newstart; 1372 g->mlen = newlen; 1373 } 1374 newlen = 0; 1375 break; 1376 } 1377 } while (OP(s) != OEND); 1378 1379 if (g->mlen == 0) /* there isn't one */ 1380 return; 1381 1382 /* turn it into a character string */ 1383 g->must = malloc((size_t)g->mlen + 1); 1384 if (g->must == NULL) { /* argh; just forget it */ 1385 g->mlen = 0; 1386 return; 1387 } 1388 cp = g->must; 1389 scan = start; 1390 for (i = g->mlen; i > 0; i--) { 1391 while (OP(s = *scan++) != OCHAR) 1392 continue; 1393 assert(cp < g->must + g->mlen); 1394 *cp++ = (char)OPND(s); 1395 } 1396 assert(cp == g->must + g->mlen); 1397 *cp++ = '\0'; /* just on general principles */ 1398 } 1399 1400 /* 1401 - pluscount - count + nesting 1402 */ 1403 static sopno /* nesting depth */ 1404 pluscount(struct parse *p, struct re_guts *g) 1405 { 1406 sop *scan; 1407 sop s; 1408 sopno plusnest = 0; 1409 sopno maxnest = 0; 1410 1411 if (p->error != 0) 1412 return(0); /* there may not be an OEND */ 1413 1414 scan = g->strip + 1; 1415 do { 1416 s = *scan++; 1417 switch (OP(s)) { 1418 case OPLUS_: 1419 plusnest++; 1420 break; 1421 case O_PLUS: 1422 if (plusnest > maxnest) 1423 maxnest = plusnest; 1424 plusnest--; 1425 break; 1426 } 1427 } while (OP(s) != OEND); 1428 if (plusnest != 0) 1429 g->iflags |= BAD; 1430 return(maxnest); 1431 } 1432