1 /* $NetBSD: regcomp.c,v 1.32 2011/11/08 19:25:45 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 35 */ 36 37 /*- 38 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Henry Spencer. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 72 */ 73 74 #include <sys/cdefs.h> 75 #if defined(LIBC_SCCS) && !defined(lint) 76 #if 0 77 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 78 #else 79 __RCSID("$NetBSD: regcomp.c,v 1.32 2011/11/08 19:25:45 christos Exp $"); 80 #endif 81 #endif /* LIBC_SCCS and not lint */ 82 83 #include "namespace.h" 84 #include <sys/types.h> 85 86 #include <assert.h> 87 #include <ctype.h> 88 #include <limits.h> 89 #include <stdio.h> 90 #include <stdlib.h> 91 #include <string.h> 92 #include <regex.h> 93 94 #ifdef __weak_alias 95 __weak_alias(regcomp,_regcomp) 96 #endif 97 98 #include "utils.h" 99 #include "regex2.h" 100 101 #include "cclass.h" 102 #include "cname.h" 103 104 /* 105 * parse structure, passed up and down to avoid global variables and 106 * other clumsinesses 107 */ 108 struct parse { 109 const char *next; /* next character in RE */ 110 const char *end; /* end of string (-> NUL normally) */ 111 int error; /* has an error been seen? */ 112 sop *strip; /* malloced strip */ 113 sopno ssize; /* malloced strip size (allocated) */ 114 sopno slen; /* malloced strip length (used) */ 115 int ncsalloc; /* number of csets allocated */ 116 struct re_guts *g; 117 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 118 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 119 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 120 }; 121 122 /* ========= begin header generated by ./mkh ========= */ 123 #ifdef __cplusplus 124 extern "C" { 125 #endif 126 127 /* === regcomp.c === */ 128 static void p_ere(struct parse *p, int stop, size_t reclimit); 129 static void p_ere_exp(struct parse *p, size_t reclimit); 130 static void p_str(struct parse *p); 131 static void p_bre(struct parse *p, int end1, int end2, size_t reclimit); 132 static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 133 static int p_count(struct parse *p); 134 static void p_bracket(struct parse *p); 135 static void p_b_term(struct parse *p, cset *cs); 136 static void p_b_cclass(struct parse *p, cset *cs); 137 static void p_b_eclass(struct parse *p, cset *cs); 138 static char p_b_symbol(struct parse *p); 139 static char p_b_coll_elem(struct parse *p, int endc); 140 static int othercase(int ch); 141 static void bothcases(struct parse *p, int ch); 142 static void ordinary(struct parse *p, int ch); 143 static void nonnewline(struct parse *p); 144 static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 145 static int seterr(struct parse *p, int e); 146 static cset *allocset(struct parse *p); 147 static void freeset(struct parse *p, cset *cs); 148 static sopno freezeset(struct parse *p, cset *cs); 149 static int firstch(struct parse *p, cset *cs); 150 static int nch(struct parse *p, cset *cs); 151 static void mcadd(struct parse *p, cset *cs, const char *cp); 152 #if 0 153 static void mcsub(cset *cs, char *cp); 154 static int mcin(cset *cs, char *cp); 155 static char *mcfind(cset *cs, char *cp); 156 #endif 157 static void mcinvert(struct parse *p, cset *cs); 158 static void mccase(struct parse *p, cset *cs); 159 static int isinsets(struct re_guts *g, int c); 160 static int samesets(struct re_guts *g, int c1, int c2); 161 static void categorize(struct parse *p, struct re_guts *g); 162 static sopno dupl(struct parse *p, sopno start, sopno finish); 163 static void doemit(struct parse *p, sop op, sopno opnd); 164 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos); 165 static void dofwd(struct parse *p, sopno pos, sopno value); 166 static int enlarge(struct parse *p, sopno size); 167 static void stripsnug(struct parse *p, struct re_guts *g); 168 static void findmust(struct parse *p, struct re_guts *g); 169 static sopno pluscount(struct parse *p, struct re_guts *g); 170 171 #ifdef __cplusplus 172 } 173 #endif 174 /* ========= end header generated by ./mkh ========= */ 175 176 static char nuls[10]; /* place to point scanner in event of error */ 177 178 /* 179 * macros for use with parse structure 180 * BEWARE: these know that the parse structure is named `p' !!! 181 */ 182 #define PEEK() (*p->next) 183 #define PEEK2() (*(p->next+1)) 184 #define MORE() (p->next < p->end) 185 #define MORE2() (p->next+1 < p->end) 186 #define SEE(c) (MORE() && PEEK() == (c)) 187 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 188 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 189 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 190 #define NEXT() (p->next++) 191 #define NEXT2() (p->next += 2) 192 #define NEXTn(n) (p->next += (n)) 193 #define GETNEXT() (*p->next++) 194 #define SETERROR(e) seterr(p, (e)) 195 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 196 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 197 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 198 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 199 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 200 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 201 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 202 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 203 #define HERE() (p->slen) 204 #define THERE() (p->slen - 1) 205 #define THERETHERE() (p->slen - 2) 206 #define DROP(n) (p->slen -= (n)) 207 208 #ifndef NDEBUG 209 static int never = 0; /* for use in asserts; shuts lint up */ 210 #else 211 #define never 0 /* some <assert.h>s have bugs too */ 212 #endif 213 214 #define MEMLIMIT 0x8000000 215 #define MEMSIZE(p) \ 216 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 217 (p)->ncsalloc * sizeof(cset) + \ 218 (p)->ssize * sizeof(sop)) 219 #define RECLIMIT 256 220 221 /* 222 - regcomp - interface for parser and compilation 223 = extern int regcomp(regex_t *, const char *, int); 224 = #define REG_BASIC 0000 225 = #define REG_EXTENDED 0001 226 = #define REG_ICASE 0002 227 = #define REG_NOSUB 0004 228 = #define REG_NEWLINE 0010 229 = #define REG_NOSPEC 0020 230 = #define REG_PEND 0040 231 = #define REG_DUMP 0200 232 */ 233 int /* 0 success, otherwise REG_something */ 234 regcomp( 235 regex_t *preg, 236 const char *pattern, 237 int cflags) 238 { 239 struct parse pa; 240 struct re_guts *g; 241 struct parse *p = &pa; 242 int i; 243 size_t len; 244 #ifdef REDEBUG 245 # define GOODFLAGS(f) (f) 246 #else 247 # define GOODFLAGS(f) ((f)&~REG_DUMP) 248 #endif 249 250 _DIAGASSERT(preg != NULL); 251 _DIAGASSERT(pattern != NULL); 252 253 cflags = GOODFLAGS(cflags); 254 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 255 return(REG_INVARG); 256 257 if (cflags®_PEND) { 258 if (preg->re_endp < pattern) 259 return(REG_INVARG); 260 len = preg->re_endp - pattern; 261 } else 262 len = strlen(pattern); 263 264 /* do the mallocs early so failure handling is easy */ 265 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 266 (NC-1)*sizeof(cat_t)); 267 if (g == NULL) 268 return(REG_ESPACE); 269 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 270 p->strip = malloc(p->ssize * sizeof(sop)); 271 p->slen = 0; 272 if (p->strip == NULL) { 273 free(g); 274 return(REG_ESPACE); 275 } 276 277 /* set things up */ 278 p->g = g; 279 p->next = pattern; 280 p->end = p->next + len; 281 p->error = 0; 282 p->ncsalloc = 0; 283 for (i = 0; i < NPAREN; i++) { 284 p->pbegin[i] = 0; 285 p->pend[i] = 0; 286 } 287 g->csetsize = NC; 288 g->sets = NULL; 289 g->setbits = NULL; 290 g->ncsets = 0; 291 g->cflags = cflags; 292 g->iflags = 0; 293 g->nbol = 0; 294 g->neol = 0; 295 g->must = NULL; 296 g->mlen = 0; 297 g->nsub = 0; 298 g->ncategories = 1; /* category 0 is "everything else" */ 299 g->categories = &g->catspace[-(CHAR_MIN)]; 300 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 301 g->backrefs = 0; 302 303 /* do it */ 304 EMIT(OEND, 0); 305 g->firststate = THERE(); 306 if (cflags®_EXTENDED) 307 p_ere(p, OUT, 0); 308 else if (cflags®_NOSPEC) 309 p_str(p); 310 else 311 p_bre(p, OUT, OUT, 0); 312 EMIT(OEND, 0); 313 g->laststate = THERE(); 314 315 /* tidy up loose ends and fill things in */ 316 categorize(p, g); 317 stripsnug(p, g); 318 findmust(p, g); 319 g->nplus = pluscount(p, g); 320 g->magic = MAGIC2; 321 preg->re_nsub = g->nsub; 322 preg->re_g = g; 323 preg->re_magic = MAGIC1; 324 #ifndef REDEBUG 325 /* not debugging, so can't rely on the assert() in regexec() */ 326 if (g->iflags&BAD) 327 SETERROR(REG_ASSERT); 328 #endif 329 330 /* win or lose, we're done */ 331 if (p->error != 0) /* lose */ 332 regfree(preg); 333 return(p->error); 334 } 335 336 /* 337 - p_ere - ERE parser top level, concatenation and alternation 338 == static void p_ere(struct parse *p, int stop, size_t reclimit); 339 */ 340 static void 341 p_ere( 342 struct parse *p, 343 int stop, /* character this ERE should end at */ 344 size_t reclimit) 345 { 346 char c; 347 sopno prevback = 0; /* pacify gcc */ 348 sopno prevfwd = 0; /* pacify gcc */ 349 sopno conc; 350 int first = 1; /* is this the first alternative? */ 351 352 _DIAGASSERT(p != NULL); 353 354 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 355 p->error = REG_ESPACE; 356 return; 357 } 358 359 for (;;) { 360 /* do a bunch of concatenated expressions */ 361 conc = HERE(); 362 while (MORE() && (c = PEEK()) != '|' && c != stop) 363 p_ere_exp(p, reclimit); 364 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 365 366 if (!EAT('|')) 367 break; /* NOTE BREAK OUT */ 368 369 if (first) { 370 INSERT(OCH_, conc); /* offset is wrong */ 371 prevfwd = conc; 372 prevback = conc; 373 first = 0; 374 } 375 ASTERN(OOR1, prevback); 376 prevback = THERE(); 377 AHEAD(prevfwd); /* fix previous offset */ 378 prevfwd = HERE(); 379 EMIT(OOR2, 0); /* offset is very wrong */ 380 } 381 382 if (!first) { /* tail-end fixups */ 383 AHEAD(prevfwd); 384 ASTERN(O_CH, prevback); 385 } 386 387 assert(!MORE() || SEE(stop)); 388 } 389 390 /* 391 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 392 == static void p_ere_exp(struct parse *p, size_t reclimit); 393 */ 394 static void 395 p_ere_exp( 396 struct parse *p, 397 size_t reclimit) 398 { 399 char c; 400 sopno pos; 401 int count; 402 int count2; 403 sopno subno; 404 int wascaret = 0; 405 406 _DIAGASSERT(p != NULL); 407 408 assert(MORE()); /* caller should have ensured this */ 409 c = GETNEXT(); 410 411 pos = HERE(); 412 switch (c) { 413 case '(': 414 REQUIRE(MORE(), REG_EPAREN); 415 p->g->nsub++; 416 subno = p->g->nsub; 417 if (subno < NPAREN) 418 p->pbegin[subno] = HERE(); 419 EMIT(OLPAREN, subno); 420 if (!SEE(')')) 421 p_ere(p, ')', reclimit); 422 if (subno < NPAREN) { 423 p->pend[subno] = HERE(); 424 assert(p->pend[subno] != 0); 425 } 426 EMIT(ORPAREN, subno); 427 MUSTEAT(')', REG_EPAREN); 428 break; 429 #ifndef POSIX_MISTAKE 430 case ')': /* happens only if no current unmatched ( */ 431 /* 432 * You may ask, why the ifndef? Because I didn't notice 433 * this until slightly too late for 1003.2, and none of the 434 * other 1003.2 regular-expression reviewers noticed it at 435 * all. So an unmatched ) is legal POSIX, at least until 436 * we can get it fixed. 437 */ 438 SETERROR(REG_EPAREN); 439 break; 440 #endif 441 case '^': 442 EMIT(OBOL, 0); 443 p->g->iflags |= USEBOL; 444 p->g->nbol++; 445 wascaret = 1; 446 break; 447 case '$': 448 EMIT(OEOL, 0); 449 p->g->iflags |= USEEOL; 450 p->g->neol++; 451 break; 452 case '|': 453 SETERROR(REG_EMPTY); 454 break; 455 case '*': 456 case '+': 457 case '?': 458 SETERROR(REG_BADRPT); 459 break; 460 case '.': 461 if (p->g->cflags®_NEWLINE) 462 nonnewline(p); 463 else 464 EMIT(OANY, 0); 465 break; 466 case '[': 467 p_bracket(p); 468 break; 469 case '\\': 470 REQUIRE(MORE(), REG_EESCAPE); 471 c = GETNEXT(); 472 ordinary(p, c); 473 break; 474 case '{': /* okay as ordinary except if digit follows */ 475 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT); 476 /* FALLTHROUGH */ 477 default: 478 ordinary(p, c); 479 break; 480 } 481 482 if (!MORE()) 483 return; 484 c = PEEK(); 485 /* we call { a repetition if followed by a digit */ 486 if (!( c == '*' || c == '+' || c == '?' || 487 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) )) 488 return; /* no repetition, we're done */ 489 NEXT(); 490 491 REQUIRE(!wascaret, REG_BADRPT); 492 switch (c) { 493 case '*': /* implemented as +? */ 494 /* this case does not require the (y|) trick, noKLUDGE */ 495 INSERT(OPLUS_, pos); 496 ASTERN(O_PLUS, pos); 497 INSERT(OQUEST_, pos); 498 ASTERN(O_QUEST, pos); 499 break; 500 case '+': 501 INSERT(OPLUS_, pos); 502 ASTERN(O_PLUS, pos); 503 break; 504 case '?': 505 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 506 INSERT(OCH_, pos); /* offset slightly wrong */ 507 ASTERN(OOR1, pos); /* this one's right */ 508 AHEAD(pos); /* fix the OCH_ */ 509 EMIT(OOR2, 0); /* offset very wrong... */ 510 AHEAD(THERE()); /* ...so fix it */ 511 ASTERN(O_CH, THERETHERE()); 512 break; 513 case '{': 514 count = p_count(p); 515 if (EAT(',')) { 516 if (isdigit((unsigned char)PEEK())) { 517 count2 = p_count(p); 518 REQUIRE(count <= count2, REG_BADBR); 519 } else /* single number with comma */ 520 count2 = INFINITY; 521 } else /* just a single number */ 522 count2 = count; 523 repeat(p, pos, count, count2, 0); 524 if (!EAT('}')) { /* error heuristics */ 525 while (MORE() && PEEK() != '}') 526 NEXT(); 527 REQUIRE(MORE(), REG_EBRACE); 528 SETERROR(REG_BADBR); 529 } 530 break; 531 } 532 533 if (!MORE()) 534 return; 535 c = PEEK(); 536 if (!( c == '*' || c == '+' || c == '?' || 537 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) ) 538 return; 539 SETERROR(REG_BADRPT); 540 } 541 542 /* 543 - p_str - string (no metacharacters) "parser" 544 == static void p_str(struct parse *p); 545 */ 546 static void 547 p_str( 548 struct parse *p) 549 { 550 551 _DIAGASSERT(p != NULL); 552 553 REQUIRE(MORE(), REG_EMPTY); 554 while (MORE()) 555 ordinary(p, GETNEXT()); 556 } 557 558 /* 559 - p_bre - BRE parser top level, anchoring and concatenation 560 == static void p_bre(struct parse *p, int end1, \ 561 == int end2, size_t reclimit); 562 * Giving end1 as OUT essentially eliminates the end1/end2 check. 563 * 564 * This implementation is a bit of a kludge, in that a trailing $ is first 565 * taken as an ordinary character and then revised to be an anchor. The 566 * only undesirable side effect is that '$' gets included as a character 567 * category in such cases. This is fairly harmless; not worth fixing. 568 * The amount of lookahead needed to avoid this kludge is excessive. 569 */ 570 static void 571 p_bre( 572 struct parse *p, 573 int end1, /* first terminating character */ 574 int end2, /* second terminating character */ 575 size_t reclimit) 576 { 577 sopno start; 578 int first = 1; /* first subexpression? */ 579 int wasdollar = 0; 580 581 _DIAGASSERT(p != NULL); 582 583 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 584 p->error = REG_ESPACE; 585 return; 586 } 587 588 start = HERE(); 589 590 if (EAT('^')) { 591 EMIT(OBOL, 0); 592 p->g->iflags |= USEBOL; 593 p->g->nbol++; 594 } 595 while (MORE() && !SEETWO(end1, end2)) { 596 wasdollar = p_simp_re(p, first, reclimit); 597 first = 0; 598 } 599 if (wasdollar) { /* oops, that was a trailing anchor */ 600 DROP(1); 601 EMIT(OEOL, 0); 602 p->g->iflags |= USEEOL; 603 p->g->neol++; 604 } 605 606 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 607 } 608 609 /* 610 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 611 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 612 */ 613 static int /* was the simple RE an unbackslashed $? */ 614 p_simp_re( 615 struct parse *p, 616 int starordinary, /* is a leading * an ordinary character? */ 617 size_t reclimit) 618 { 619 int c; 620 int count; 621 int count2; 622 sopno pos, i; 623 sopno subno; 624 # define BACKSL (1<<CHAR_BIT) 625 626 _DIAGASSERT(p != NULL); 627 628 pos = HERE(); /* repetion op, if any, covers from here */ 629 630 assert(MORE()); /* caller should have ensured this */ 631 c = GETNEXT(); 632 if (c == '\\') { 633 REQUIRE(MORE(), REG_EESCAPE); 634 c = BACKSL | (unsigned char)GETNEXT(); 635 } 636 switch (c) { 637 case '.': 638 if (p->g->cflags®_NEWLINE) 639 nonnewline(p); 640 else 641 EMIT(OANY, 0); 642 break; 643 case '[': 644 p_bracket(p); 645 break; 646 case BACKSL|'{': 647 SETERROR(REG_BADRPT); 648 break; 649 case BACKSL|'(': 650 p->g->nsub++; 651 subno = p->g->nsub; 652 if (subno < NPAREN) 653 p->pbegin[subno] = HERE(); 654 EMIT(OLPAREN, subno); 655 /* the MORE here is an error heuristic */ 656 if (MORE() && !SEETWO('\\', ')')) 657 p_bre(p, '\\', ')', reclimit); 658 if (subno < NPAREN) { 659 p->pend[subno] = HERE(); 660 assert(p->pend[subno] != 0); 661 } 662 EMIT(ORPAREN, subno); 663 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 664 break; 665 case BACKSL|')': /* should not get here -- must be user */ 666 case BACKSL|'}': 667 SETERROR(REG_EPAREN); 668 break; 669 case BACKSL|'1': 670 case BACKSL|'2': 671 case BACKSL|'3': 672 case BACKSL|'4': 673 case BACKSL|'5': 674 case BACKSL|'6': 675 case BACKSL|'7': 676 case BACKSL|'8': 677 case BACKSL|'9': 678 i = (c&~BACKSL) - '0'; 679 assert(i < NPAREN); 680 if (p->pend[i] != 0) { 681 assert(i <= p->g->nsub); 682 EMIT(OBACK_, i); 683 assert(p->pbegin[i] != 0); 684 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 685 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 686 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 687 EMIT(O_BACK, i); 688 } else 689 SETERROR(REG_ESUBREG); 690 p->g->backrefs = 1; 691 break; 692 case '*': 693 REQUIRE(starordinary, REG_BADRPT); 694 /* FALLTHROUGH */ 695 default: 696 ordinary(p, c &~ BACKSL); 697 break; 698 } 699 700 if (EAT('*')) { /* implemented as +? */ 701 /* this case does not require the (y|) trick, noKLUDGE */ 702 INSERT(OPLUS_, pos); 703 ASTERN(O_PLUS, pos); 704 INSERT(OQUEST_, pos); 705 ASTERN(O_QUEST, pos); 706 } else if (EATTWO('\\', '{')) { 707 count = p_count(p); 708 if (EAT(',')) { 709 if (MORE() && isdigit((unsigned char)PEEK())) { 710 count2 = p_count(p); 711 REQUIRE(count <= count2, REG_BADBR); 712 } else /* single number with comma */ 713 count2 = INFINITY; 714 } else /* just a single number */ 715 count2 = count; 716 repeat(p, pos, count, count2, 0); 717 if (!EATTWO('\\', '}')) { /* error heuristics */ 718 while (MORE() && !SEETWO('\\', '}')) 719 NEXT(); 720 REQUIRE(MORE(), REG_EBRACE); 721 SETERROR(REG_BADBR); 722 } 723 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 724 return(1); 725 726 return(0); 727 } 728 729 /* 730 - p_count - parse a repetition count 731 == static int p_count(struct parse *p); 732 */ 733 static int /* the value */ 734 p_count( 735 struct parse *p) 736 { 737 int count = 0; 738 int ndigits = 0; 739 740 _DIAGASSERT(p != NULL); 741 742 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) { 743 count = count*10 + (GETNEXT() - '0'); 744 ndigits++; 745 } 746 747 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 748 return(count); 749 } 750 751 /* 752 - p_bracket - parse a bracketed character list 753 == static void p_bracket(struct parse *p); 754 * 755 * Note a significant property of this code: if the allocset() did SETERROR, 756 * no set operations are done. 757 */ 758 static void 759 p_bracket( 760 struct parse *p) 761 { 762 cset *cs; 763 int invert = 0; 764 _DIAGASSERT(p != NULL); 765 766 cs = allocset(p); 767 if (cs == NULL) 768 return; 769 770 /* Dept of Truly Sickening Special-Case Kludges */ 771 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 772 (size_t)6) == 0) { 773 EMIT(OBOW, 0); 774 NEXTn(6); 775 return; 776 } 777 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 778 (size_t)6) == 0) { 779 EMIT(OEOW, 0); 780 NEXTn(6); 781 return; 782 } 783 784 if (EAT('^')) 785 invert++; /* make note to invert set at end */ 786 if (EAT(']')) 787 CHadd(cs, ']'); 788 else if (EAT('-')) 789 CHadd(cs, '-'); 790 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 791 p_b_term(p, cs); 792 if (EAT('-')) 793 CHadd(cs, '-'); 794 MUSTEAT(']', REG_EBRACK); 795 796 if (p->error != 0) /* don't mess things up further */ 797 return; 798 799 if (p->g->cflags®_ICASE) { 800 int i; 801 int ci; 802 803 for (i = p->g->csetsize - 1; i >= 0; i--) 804 if (CHIN(cs, i) && isalpha(i)) { 805 ci = othercase(i); 806 if (ci != i) 807 CHadd(cs, ci); 808 } 809 if (cs->multis != NULL) 810 mccase(p, cs); 811 } 812 if (invert) { 813 int i; 814 815 for (i = p->g->csetsize - 1; i >= 0; i--) 816 if (CHIN(cs, i)) 817 CHsub(cs, i); 818 else 819 CHadd(cs, i); 820 if (p->g->cflags®_NEWLINE) 821 CHsub(cs, '\n'); 822 if (cs->multis != NULL) 823 mcinvert(p, cs); 824 } 825 826 assert(cs->multis == NULL); /* xxx */ 827 828 if (nch(p, cs) == 1) { /* optimize singleton sets */ 829 ordinary(p, firstch(p, cs)); 830 freeset(p, cs); 831 } else 832 EMIT(OANYOF, freezeset(p, cs)); 833 } 834 835 /* 836 - p_b_term - parse one term of a bracketed character list 837 == static void p_b_term(struct parse *p, cset *cs); 838 */ 839 static void 840 p_b_term( 841 struct parse *p, 842 cset *cs) 843 { 844 char c; 845 char start, finish; 846 int i; 847 848 _DIAGASSERT(p != NULL); 849 _DIAGASSERT(cs != NULL); 850 851 /* classify what we've got */ 852 switch ((MORE()) ? PEEK() : '\0') { 853 case '[': 854 c = (MORE2()) ? PEEK2() : '\0'; 855 break; 856 857 case '-': 858 SETERROR(REG_ERANGE); 859 return; /* NOTE RETURN */ 860 861 default: 862 c = '\0'; 863 break; 864 } 865 866 switch (c) { 867 case ':': /* character class */ 868 NEXT2(); 869 REQUIRE(MORE(), REG_EBRACK); 870 c = PEEK(); 871 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 872 p_b_cclass(p, cs); 873 REQUIRE(MORE(), REG_EBRACK); 874 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 875 break; 876 case '=': /* equivalence class */ 877 NEXT2(); 878 REQUIRE(MORE(), REG_EBRACK); 879 c = PEEK(); 880 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 881 p_b_eclass(p, cs); 882 REQUIRE(MORE(), REG_EBRACK); 883 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 884 break; 885 default: /* symbol, ordinary character, or range */ 886 /* xxx revision needed for multichar stuff */ 887 start = p_b_symbol(p); 888 if (SEE('-') && MORE2() && PEEK2() != ']') { 889 /* range */ 890 NEXT(); 891 if (EAT('-')) 892 finish = '-'; 893 else 894 finish = p_b_symbol(p); 895 } else 896 finish = start; 897 /* xxx what about signed chars here... */ 898 REQUIRE(start <= finish, REG_ERANGE); 899 for (i = start; i <= finish; i++) 900 CHadd(cs, i); 901 break; 902 } 903 } 904 905 /* 906 - p_b_cclass - parse a character-class name and deal with it 907 == static void p_b_cclass(struct parse *p, cset *cs); 908 */ 909 static void 910 p_b_cclass( 911 struct parse *p, 912 cset *cs) 913 { 914 const char *sp; 915 const struct cclass *cp; 916 size_t len; 917 const char *u; 918 char c; 919 920 _DIAGASSERT(p != NULL); 921 _DIAGASSERT(cs != NULL); 922 923 sp = p->next; 924 925 while (MORE() && isalpha((unsigned char)PEEK())) 926 NEXT(); 927 len = p->next - sp; 928 for (cp = cclasses; cp->name != NULL; cp++) 929 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 930 break; 931 if (cp->name == NULL) { 932 /* oops, didn't find it */ 933 SETERROR(REG_ECTYPE); 934 return; 935 } 936 937 u = cp->chars; 938 while ((c = *u++) != '\0') 939 CHadd(cs, c); 940 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 941 MCadd(p, cs, u); 942 } 943 944 /* 945 - p_b_eclass - parse an equivalence-class name and deal with it 946 == static void p_b_eclass(struct parse *p, cset *cs); 947 * 948 * This implementation is incomplete. xxx 949 */ 950 static void 951 p_b_eclass( 952 struct parse *p, 953 cset *cs) 954 { 955 char c; 956 957 _DIAGASSERT(p != NULL); 958 _DIAGASSERT(cs != NULL); 959 960 c = p_b_coll_elem(p, '='); 961 CHadd(cs, c); 962 } 963 964 /* 965 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 966 == static char p_b_symbol(struct parse *p); 967 */ 968 static char /* value of symbol */ 969 p_b_symbol( 970 struct parse *p) 971 { 972 char value; 973 974 _DIAGASSERT(p != NULL); 975 976 REQUIRE(MORE(), REG_EBRACK); 977 if (!EATTWO('[', '.')) 978 return(GETNEXT()); 979 980 /* collating symbol */ 981 value = p_b_coll_elem(p, '.'); 982 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 983 return(value); 984 } 985 986 /* 987 - p_b_coll_elem - parse a collating-element name and look it up 988 == static char p_b_coll_elem(struct parse *p, int endc); 989 */ 990 static char /* value of collating element */ 991 p_b_coll_elem( 992 struct parse *p, 993 int endc) /* name ended by endc,']' */ 994 { 995 const char *sp; 996 const struct cname *cp; 997 size_t len; 998 999 _DIAGASSERT(p != NULL); 1000 1001 sp = p->next; 1002 1003 while (MORE() && !SEETWO(endc, ']')) 1004 NEXT(); 1005 if (!MORE()) { 1006 SETERROR(REG_EBRACK); 1007 return(0); 1008 } 1009 len = p->next - sp; 1010 for (cp = cnames; cp->name != NULL; cp++) 1011 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 1012 return(cp->code); /* known name */ 1013 if (len == 1) 1014 return(*sp); /* single character */ 1015 SETERROR(REG_ECOLLATE); /* neither */ 1016 return(0); 1017 } 1018 1019 /* 1020 - othercase - return the case counterpart of an alphabetic 1021 == static int othercase(int ch); 1022 */ 1023 static int /* if no counterpart, return ch */ 1024 othercase( 1025 int ch) 1026 { 1027 assert(isalpha(ch)); 1028 if (isupper(ch)) 1029 return(tolower(ch)); 1030 else if (islower(ch)) 1031 return(toupper(ch)); 1032 else /* peculiar, but could happen */ 1033 return(ch); 1034 } 1035 1036 /* 1037 - bothcases - emit a dualcase version of a two-case character 1038 == static void bothcases(struct parse *p, int ch); 1039 * 1040 * Boy, is this implementation ever a kludge... 1041 */ 1042 static void 1043 bothcases( 1044 struct parse *p, 1045 int ch) 1046 { 1047 const char *oldnext; 1048 const char *oldend; 1049 char bracket[3]; 1050 1051 _DIAGASSERT(p != NULL); 1052 1053 oldnext = p->next; 1054 oldend = p->end; 1055 1056 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1057 p->next = bracket; 1058 p->end = bracket+2; 1059 bracket[0] = ch; 1060 bracket[1] = ']'; 1061 bracket[2] = '\0'; 1062 p_bracket(p); 1063 assert(p->next == bracket+2); 1064 p->next = oldnext; 1065 p->end = oldend; 1066 } 1067 1068 /* 1069 - ordinary - emit an ordinary character 1070 == static void ordinary(struct parse *p, int ch); 1071 */ 1072 static void 1073 ordinary( 1074 struct parse *p, 1075 int ch) 1076 { 1077 cat_t *cap; 1078 1079 _DIAGASSERT(p != NULL); 1080 1081 cap = p->g->categories; 1082 if ((p->g->cflags®_ICASE) && isalpha((unsigned char) ch) 1083 && othercase((unsigned char) ch) != (unsigned char) ch) 1084 bothcases(p, (unsigned char) ch); 1085 else { 1086 EMIT(OCHAR, (sopno)(unsigned char)ch); 1087 if (cap[ch] == 0) 1088 cap[ch] = p->g->ncategories++; 1089 } 1090 } 1091 1092 /* 1093 - nonnewline - emit REG_NEWLINE version of OANY 1094 == static void nonnewline(struct parse *p); 1095 * 1096 * Boy, is this implementation ever a kludge... 1097 */ 1098 static void 1099 nonnewline( 1100 struct parse *p) 1101 { 1102 const char *oldnext; 1103 const char *oldend; 1104 char bracket[4]; 1105 1106 _DIAGASSERT(p != NULL); 1107 1108 oldnext = p->next; 1109 oldend = p->end; 1110 1111 p->next = bracket; 1112 p->end = bracket+3; 1113 bracket[0] = '^'; 1114 bracket[1] = '\n'; 1115 bracket[2] = ']'; 1116 bracket[3] = '\0'; 1117 p_bracket(p); 1118 assert(p->next == bracket+3); 1119 p->next = oldnext; 1120 p->end = oldend; 1121 } 1122 1123 /* 1124 - repeat - generate code for a bounded repetition, recursively if needed 1125 == static void repeat(struct parse *p, sopno start, int from, int to, 1126 == size_t reclimit); 1127 */ 1128 static void 1129 repeat( 1130 struct parse *p, 1131 sopno start, /* operand from here to end of strip */ 1132 int from, /* repeated from this number */ 1133 int to, /* to this number of times (maybe INFINITY) */ 1134 size_t reclimit) 1135 { 1136 sopno finish; 1137 # define N 2 1138 # define INF 3 1139 # define REP(f, t) ((f)*8 + (t)) 1140 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1141 sopno copy; 1142 1143 _DIAGASSERT(p != NULL); 1144 1145 if (reclimit++ > RECLIMIT) 1146 p->error = REG_ESPACE; 1147 if (p->error) 1148 return; 1149 1150 finish = HERE(); 1151 1152 assert(from <= to); 1153 1154 switch (REP(MAP(from), MAP(to))) { 1155 case REP(0, 0): /* must be user doing this */ 1156 DROP(finish-start); /* drop the operand */ 1157 break; 1158 case REP(0, 1): /* as x{1,1}? */ 1159 case REP(0, N): /* as x{1,n}? */ 1160 case REP(0, INF): /* as x{1,}? */ 1161 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1162 INSERT(OCH_, start); /* offset is wrong... */ 1163 repeat(p, start+1, 1, to, reclimit); 1164 ASTERN(OOR1, start); 1165 AHEAD(start); /* ... fix it */ 1166 EMIT(OOR2, 0); 1167 AHEAD(THERE()); 1168 ASTERN(O_CH, THERETHERE()); 1169 break; 1170 case REP(1, 1): /* trivial case */ 1171 /* done */ 1172 break; 1173 case REP(1, N): /* as x?x{1,n-1} */ 1174 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1175 INSERT(OCH_, start); 1176 ASTERN(OOR1, start); 1177 AHEAD(start); 1178 EMIT(OOR2, 0); /* offset very wrong... */ 1179 AHEAD(THERE()); /* ...so fix it */ 1180 ASTERN(O_CH, THERETHERE()); 1181 copy = dupl(p, start+1, finish+1); 1182 assert(copy == finish+4); 1183 repeat(p, copy, 1, to-1, reclimit); 1184 break; 1185 case REP(1, INF): /* as x+ */ 1186 INSERT(OPLUS_, start); 1187 ASTERN(O_PLUS, start); 1188 break; 1189 case REP(N, N): /* as xx{m-1,n-1} */ 1190 copy = dupl(p, start, finish); 1191 repeat(p, copy, from-1, to-1, reclimit); 1192 break; 1193 case REP(N, INF): /* as xx{n-1,INF} */ 1194 copy = dupl(p, start, finish); 1195 repeat(p, copy, from-1, to, reclimit); 1196 break; 1197 default: /* "can't happen" */ 1198 SETERROR(REG_ASSERT); /* just in case */ 1199 break; 1200 } 1201 } 1202 1203 /* 1204 - seterr - set an error condition 1205 == static int seterr(struct parse *p, int e); 1206 */ 1207 static int /* useless but makes type checking happy */ 1208 seterr( 1209 struct parse *p, 1210 int e) 1211 { 1212 1213 _DIAGASSERT(p != NULL); 1214 1215 if (p->error == 0) /* keep earliest error condition */ 1216 p->error = e; 1217 p->next = nuls; /* try to bring things to a halt */ 1218 p->end = nuls; 1219 return(0); /* make the return value well-defined */ 1220 } 1221 1222 /* 1223 - allocset - allocate a set of characters for [] 1224 == static cset *allocset(struct parse *p); 1225 */ 1226 static cset * 1227 allocset( 1228 struct parse *p) 1229 { 1230 int no; 1231 size_t nc; 1232 size_t nbytes; 1233 cset *cs; 1234 size_t css; 1235 int i; 1236 1237 _DIAGASSERT(p != NULL); 1238 1239 no = p->g->ncsets++; 1240 css = (size_t)p->g->csetsize; 1241 if (no >= p->ncsalloc) { /* need another column of space */ 1242 p->ncsalloc += CHAR_BIT; 1243 nc = p->ncsalloc; 1244 assert(nc % CHAR_BIT == 0); 1245 nbytes = nc / CHAR_BIT * css; 1246 if (MEMSIZE(p) > MEMLIMIT) 1247 goto oomem; 1248 if (p->g->sets == NULL) 1249 p->g->sets = malloc(nc * sizeof(cset)); 1250 else 1251 p->g->sets = realloc(p->g->sets, nc * sizeof(cset)); 1252 if (p->g->setbits == NULL) 1253 p->g->setbits = malloc(nbytes); 1254 else { 1255 p->g->setbits = realloc(p->g->setbits, nbytes); 1256 /* xxx this isn't right if setbits is now NULL */ 1257 for (i = 0; i < no; i++) 1258 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1259 } 1260 if (p->g->sets != NULL && p->g->setbits != NULL) 1261 (void) memset((char *)p->g->setbits + (nbytes - css), 1262 0, css); 1263 else { 1264 oomem: 1265 no = 0; 1266 SETERROR(REG_ESPACE); 1267 /* caller's responsibility not to do set ops */ 1268 return NULL; 1269 } 1270 } 1271 1272 cs = &p->g->sets[no]; 1273 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1274 cs->mask = 1 << ((no) % CHAR_BIT); 1275 cs->hash = 0; 1276 cs->smultis = 0; 1277 cs->multis = NULL; 1278 1279 return(cs); 1280 } 1281 1282 /* 1283 - freeset - free a now-unused set 1284 == static void freeset(struct parse *p, cset *cs); 1285 */ 1286 static void 1287 freeset( 1288 struct parse *p, 1289 cset *cs) 1290 { 1291 size_t i; 1292 cset *top; 1293 size_t css; 1294 1295 _DIAGASSERT(p != NULL); 1296 _DIAGASSERT(cs != NULL); 1297 1298 top = &p->g->sets[p->g->ncsets]; 1299 css = (size_t)p->g->csetsize; 1300 1301 for (i = 0; i < css; i++) 1302 CHsub(cs, i); 1303 if (cs == top-1) /* recover only the easy case */ 1304 p->g->ncsets--; 1305 } 1306 1307 /* 1308 - freezeset - final processing on a set of characters 1309 == static int freezeset(struct parse *p, cset *cs); 1310 * 1311 * The main task here is merging identical sets. This is usually a waste 1312 * of time (although the hash code minimizes the overhead), but can win 1313 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1314 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1315 * the same value! 1316 */ 1317 static sopno /* set number */ 1318 freezeset( 1319 struct parse *p, 1320 cset *cs) 1321 { 1322 uch h; 1323 size_t i; 1324 cset *top; 1325 cset *cs2; 1326 size_t css; 1327 1328 _DIAGASSERT(p != NULL); 1329 _DIAGASSERT(cs != NULL); 1330 1331 h = cs->hash; 1332 top = &p->g->sets[p->g->ncsets]; 1333 css = (size_t)p->g->csetsize; 1334 1335 /* look for an earlier one which is the same */ 1336 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1337 if (cs2->hash == h && cs2 != cs) { 1338 /* maybe */ 1339 for (i = 0; i < css; i++) 1340 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1341 break; /* no */ 1342 if (i == css) 1343 break; /* yes */ 1344 } 1345 1346 if (cs2 < top) { /* found one */ 1347 freeset(p, cs); 1348 cs = cs2; 1349 } 1350 1351 return (sopno)(cs - p->g->sets); 1352 } 1353 1354 /* 1355 - firstch - return first character in a set (which must have at least one) 1356 == static int firstch(struct parse *p, cset *cs); 1357 */ 1358 static int /* character; there is no "none" value */ 1359 firstch( 1360 struct parse *p, 1361 cset *cs) 1362 { 1363 size_t i; 1364 size_t css; 1365 1366 _DIAGASSERT(p != NULL); 1367 _DIAGASSERT(cs != NULL); 1368 1369 css = (size_t)p->g->csetsize; 1370 1371 for (i = 0; i < css; i++) 1372 if (CHIN(cs, i)) 1373 return((char)i); 1374 assert(never); 1375 return(0); /* arbitrary */ 1376 } 1377 1378 /* 1379 - nch - number of characters in a set 1380 == static int nch(struct parse *p, cset *cs); 1381 */ 1382 static int 1383 nch( 1384 struct parse *p, 1385 cset *cs) 1386 { 1387 size_t i; 1388 size_t css; 1389 int n = 0; 1390 1391 _DIAGASSERT(p != NULL); 1392 _DIAGASSERT(cs != NULL); 1393 1394 css = (size_t)p->g->csetsize; 1395 1396 for (i = 0; i < css; i++) 1397 if (CHIN(cs, i)) 1398 n++; 1399 return(n); 1400 } 1401 1402 /* 1403 - mcadd - add a collating element to a cset 1404 == static void mcadd(struct parse *p, cset *cs, \ 1405 == char *cp); 1406 */ 1407 static void 1408 mcadd( 1409 struct parse *p, 1410 cset *cs, 1411 const char *cp) 1412 { 1413 size_t oldend; 1414 1415 _DIAGASSERT(p != NULL); 1416 _DIAGASSERT(cs != NULL); 1417 _DIAGASSERT(cp != NULL); 1418 1419 oldend = cs->smultis; 1420 1421 cs->smultis += strlen(cp) + 1; 1422 if (cs->multis == NULL) 1423 cs->multis = malloc(cs->smultis); 1424 else 1425 cs->multis = realloc(cs->multis, cs->smultis); 1426 if (cs->multis == NULL) { 1427 SETERROR(REG_ESPACE); 1428 return; 1429 } 1430 1431 (void) strcpy(cs->multis + oldend - 1, cp); 1432 cs->multis[cs->smultis - 1] = '\0'; 1433 } 1434 1435 #if 0 1436 /* 1437 - mcsub - subtract a collating element from a cset 1438 == static void mcsub(cset *cs, char *cp); 1439 */ 1440 static void 1441 mcsub( 1442 cset *cs, 1443 char *cp) 1444 { 1445 char *fp; 1446 size_t len; 1447 1448 _DIAGASSERT(cs != NULL); 1449 _DIAGASSERT(cp != NULL); 1450 1451 fp = mcfind(cs, cp); 1452 len = strlen(fp); 1453 1454 assert(fp != NULL); 1455 (void) memmove(fp, fp + len + 1, 1456 cs->smultis - (fp + len + 1 - cs->multis)); 1457 cs->smultis -= len; 1458 1459 if (cs->smultis == 0) { 1460 free(cs->multis); 1461 cs->multis = NULL; 1462 return; 1463 } 1464 1465 cs->multis = realloc(cs->multis, cs->smultis); 1466 assert(cs->multis != NULL); 1467 } 1468 1469 /* 1470 - mcin - is a collating element in a cset? 1471 == static int mcin(cset *cs, char *cp); 1472 */ 1473 static int 1474 mcin( 1475 cset *cs, 1476 char *cp) 1477 { 1478 1479 _DIAGASSERT(cs != NULL); 1480 _DIAGASSERT(cp != NULL); 1481 1482 return(mcfind(cs, cp) != NULL); 1483 } 1484 1485 /* 1486 - mcfind - find a collating element in a cset 1487 == static char *mcfind(cset *cs, char *cp); 1488 */ 1489 static char * 1490 mcfind( 1491 cset *cs, 1492 char *cp) 1493 { 1494 char *p; 1495 1496 _DIAGASSERT(cs != NULL); 1497 _DIAGASSERT(cp != NULL); 1498 1499 if (cs->multis == NULL) 1500 return(NULL); 1501 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1502 if (strcmp(cp, p) == 0) 1503 return(p); 1504 return(NULL); 1505 } 1506 #endif 1507 1508 /* 1509 - mcinvert - invert the list of collating elements in a cset 1510 == static void mcinvert(struct parse *p, cset *cs); 1511 * 1512 * This would have to know the set of possibilities. Implementation 1513 * is deferred. 1514 */ 1515 /* ARGSUSED */ 1516 static void 1517 mcinvert( 1518 struct parse *p, 1519 cset *cs) 1520 { 1521 1522 _DIAGASSERT(p != NULL); 1523 _DIAGASSERT(cs != NULL); 1524 1525 assert(cs->multis == NULL); /* xxx */ 1526 } 1527 1528 /* 1529 - mccase - add case counterparts of the list of collating elements in a cset 1530 == static void mccase(struct parse *p, cset *cs); 1531 * 1532 * This would have to know the set of possibilities. Implementation 1533 * is deferred. 1534 */ 1535 /* ARGSUSED */ 1536 static void 1537 mccase( 1538 struct parse *p, 1539 cset *cs) 1540 { 1541 1542 _DIAGASSERT(p != NULL); 1543 _DIAGASSERT(cs != NULL); 1544 1545 assert(cs->multis == NULL); /* xxx */ 1546 } 1547 1548 /* 1549 - isinsets - is this character in any sets? 1550 == static int isinsets(struct re_guts *g, int c); 1551 */ 1552 static int /* predicate */ 1553 isinsets( 1554 struct re_guts *g, 1555 int c) 1556 { 1557 uch *col; 1558 int i; 1559 int ncols; 1560 unsigned uc = (unsigned char)c; 1561 1562 _DIAGASSERT(g != NULL); 1563 1564 if (g->setbits == NULL) 1565 return 0; 1566 1567 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1568 1569 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1570 if (col[uc] != 0) 1571 return(1); 1572 return(0); 1573 } 1574 1575 /* 1576 - samesets - are these two characters in exactly the same sets? 1577 == static int samesets(struct re_guts *g, int c1, int c2); 1578 */ 1579 static int /* predicate */ 1580 samesets( 1581 struct re_guts *g, 1582 int c1, 1583 int c2) 1584 { 1585 uch *col; 1586 int i; 1587 int ncols; 1588 unsigned uc1 = (unsigned char)c1; 1589 unsigned uc2 = (unsigned char)c2; 1590 1591 _DIAGASSERT(g != NULL); 1592 1593 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1594 1595 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1596 if (col[uc1] != col[uc2]) 1597 return(0); 1598 return(1); 1599 } 1600 1601 /* 1602 - categorize - sort out character categories 1603 == static void categorize(struct parse *p, struct re_guts *g); 1604 */ 1605 static void 1606 categorize( 1607 struct parse *p, 1608 struct re_guts *g) 1609 { 1610 cat_t *cats; 1611 int c; 1612 int c2; 1613 cat_t cat; 1614 1615 _DIAGASSERT(p != NULL); 1616 _DIAGASSERT(g != NULL); 1617 1618 cats = g->categories; 1619 1620 /* avoid making error situations worse */ 1621 if (p->error != 0) 1622 return; 1623 1624 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1625 if (cats[c] == 0 && isinsets(g, c)) { 1626 cat = g->ncategories++; 1627 cats[c] = cat; 1628 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1629 if (cats[c2] == 0 && samesets(g, c, c2)) 1630 cats[c2] = cat; 1631 } 1632 } 1633 1634 /* 1635 - dupl - emit a duplicate of a bunch of sops 1636 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1637 */ 1638 static sopno /* start of duplicate */ 1639 dupl( 1640 struct parse *p, 1641 sopno start, /* from here */ 1642 sopno finish) /* to this less one */ 1643 { 1644 sopno ret; 1645 sopno len = finish - start; 1646 1647 _DIAGASSERT(p != NULL); 1648 1649 ret = HERE(); 1650 1651 assert(finish >= start); 1652 if (len == 0) 1653 return(ret); 1654 if (!enlarge(p, p->ssize + len))/* this many unexpected additions */ 1655 return ret; 1656 (void)memcpy(p->strip + p->slen, p->strip + start, 1657 (size_t)len * sizeof(sop)); 1658 p->slen += len; 1659 return(ret); 1660 } 1661 1662 /* 1663 - doemit - emit a strip operator 1664 == static void doemit(struct parse *p, sop op, size_t opnd); 1665 * 1666 * It might seem better to implement this as a macro with a function as 1667 * hard-case backup, but it's just too big and messy unless there are 1668 * some changes to the data structures. Maybe later. 1669 */ 1670 static void 1671 doemit( 1672 struct parse *p, 1673 sop op, 1674 sopno opnd) 1675 { 1676 1677 _DIAGASSERT(p != NULL); 1678 1679 /* avoid making error situations worse */ 1680 if (p->error != 0) 1681 return; 1682 1683 /* deal with oversize operands ("can't happen", more or less) */ 1684 assert(opnd < 1<<OPSHIFT); 1685 1686 /* deal with undersized strip */ 1687 if (p->slen >= p->ssize) 1688 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1689 return; 1690 1691 /* finally, it's all reduced to the easy case */ 1692 p->strip[p->slen++] = SOP(op, opnd); 1693 } 1694 1695 /* 1696 - doinsert - insert a sop into the strip 1697 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1698 */ 1699 static void 1700 doinsert( 1701 struct parse *p, 1702 sop op, 1703 sopno opnd, 1704 sopno pos) 1705 { 1706 sopno sn; 1707 sop s; 1708 int i; 1709 1710 _DIAGASSERT(p != NULL); 1711 1712 /* avoid making error situations worse */ 1713 if (p->error != 0) 1714 return; 1715 1716 sn = HERE(); 1717 EMIT(op, opnd); /* do checks, ensure space */ 1718 assert(HERE() == sn+1); 1719 s = p->strip[sn]; 1720 1721 /* adjust paren pointers */ 1722 assert(pos > 0); 1723 for (i = 1; i < NPAREN; i++) { 1724 if (p->pbegin[i] >= pos) { 1725 p->pbegin[i]++; 1726 } 1727 if (p->pend[i] >= pos) { 1728 p->pend[i]++; 1729 } 1730 } 1731 1732 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1733 p->strip[pos] = s; 1734 } 1735 1736 /* 1737 - dofwd - complete a forward reference 1738 == static void dofwd(struct parse *p, sopno pos, sop value); 1739 */ 1740 static void 1741 dofwd( 1742 struct parse *p, 1743 sopno pos, 1744 sopno value) 1745 { 1746 1747 _DIAGASSERT(p != NULL); 1748 1749 /* avoid making error situations worse */ 1750 if (p->error != 0) 1751 return; 1752 1753 assert(value < 1<<OPSHIFT); 1754 p->strip[pos] = OP(p->strip[pos]) | value; 1755 } 1756 1757 /* 1758 - enlarge - enlarge the strip 1759 == static void enlarge(struct parse *p, sopno size); 1760 */ 1761 static int 1762 enlarge( 1763 struct parse *p, 1764 sopno size) 1765 { 1766 sop *sp; 1767 sopno osize; 1768 1769 _DIAGASSERT(p != NULL); 1770 1771 if (p->ssize >= size) 1772 return 1; 1773 1774 osize = p->ssize; 1775 p->ssize = size; 1776 if (MEMSIZE(p) > MEMLIMIT) 1777 goto oomem; 1778 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1779 if (sp == NULL) { 1780 oomem: 1781 p->ssize = osize; 1782 SETERROR(REG_ESPACE); 1783 return 0; 1784 } 1785 p->strip = sp; 1786 return 1; 1787 } 1788 1789 /* 1790 - stripsnug - compact the strip 1791 == static void stripsnug(struct parse *p, struct re_guts *g); 1792 */ 1793 static void 1794 stripsnug( 1795 struct parse *p, 1796 struct re_guts *g) 1797 { 1798 1799 _DIAGASSERT(p != NULL); 1800 _DIAGASSERT(g != NULL); 1801 1802 g->nstates = p->slen; 1803 g->strip = realloc(p->strip, p->slen * sizeof(sop)); 1804 if (g->strip == NULL) { 1805 SETERROR(REG_ESPACE); 1806 g->strip = p->strip; 1807 } 1808 } 1809 1810 /* 1811 - findmust - fill in must and mlen with longest mandatory literal string 1812 == static void findmust(struct parse *p, struct re_guts *g); 1813 * 1814 * This algorithm could do fancy things like analyzing the operands of | 1815 * for common subsequences. Someday. This code is simple and finds most 1816 * of the interesting cases. 1817 * 1818 * Note that must and mlen got initialized during setup. 1819 */ 1820 static void 1821 findmust( 1822 struct parse *p, 1823 struct re_guts *g) 1824 { 1825 sop *scan; 1826 sop *start = NULL; 1827 sop *newstart = NULL; 1828 sopno newlen; 1829 sop s; 1830 char *cp; 1831 sopno i; 1832 1833 _DIAGASSERT(p != NULL); 1834 _DIAGASSERT(g != NULL); 1835 1836 /* avoid making error situations worse */ 1837 if (p->error != 0) 1838 return; 1839 1840 /* find the longest OCHAR sequence in strip */ 1841 newlen = 0; 1842 scan = g->strip + 1; 1843 do { 1844 s = *scan++; 1845 switch (OP(s)) { 1846 case OCHAR: /* sequence member */ 1847 if (newlen == 0) /* new sequence */ 1848 newstart = scan - 1; 1849 newlen++; 1850 break; 1851 case OPLUS_: /* things that don't break one */ 1852 case OLPAREN: 1853 case ORPAREN: 1854 break; 1855 case OQUEST_: /* things that must be skipped */ 1856 case OCH_: 1857 scan--; 1858 do { 1859 scan += OPND(s); 1860 s = *scan; 1861 /* assert() interferes w debug printouts */ 1862 if (OP(s) != O_QUEST && OP(s) != O_CH && 1863 OP(s) != OOR2) { 1864 g->iflags |= BAD; 1865 return; 1866 } 1867 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1868 /* FALLTHROUGH */ 1869 default: /* things that break a sequence */ 1870 if (newlen > g->mlen) { /* ends one */ 1871 start = newstart; 1872 g->mlen = newlen; 1873 } 1874 newlen = 0; 1875 break; 1876 } 1877 } while (OP(s) != OEND); 1878 1879 if (start == NULL) 1880 g->mlen = 0; 1881 1882 if (g->mlen == 0) /* there isn't one */ 1883 return; 1884 1885 /* turn it into a character string */ 1886 g->must = malloc((size_t)g->mlen + 1); 1887 if (g->must == NULL) { /* argh; just forget it */ 1888 g->mlen = 0; 1889 return; 1890 } 1891 cp = g->must; 1892 scan = start; 1893 for (i = g->mlen; i > 0; i--) { 1894 while (OP(s = *scan++) != OCHAR) 1895 continue; 1896 assert(cp < g->must + g->mlen); 1897 *cp++ = (char)OPND(s); 1898 } 1899 assert(cp == g->must + g->mlen); 1900 *cp++ = '\0'; /* just on general principles */ 1901 } 1902 1903 /* 1904 - pluscount - count + nesting 1905 == static sopno pluscount(struct parse *p, struct re_guts *g); 1906 */ 1907 static sopno /* nesting depth */ 1908 pluscount( 1909 struct parse *p, 1910 struct re_guts *g) 1911 { 1912 sop *scan; 1913 sop s; 1914 sopno plusnest = 0; 1915 sopno maxnest = 0; 1916 1917 _DIAGASSERT(p != NULL); 1918 _DIAGASSERT(g != NULL); 1919 1920 if (p->error != 0) 1921 return(0); /* there may not be an OEND */ 1922 1923 scan = g->strip + 1; 1924 do { 1925 s = *scan++; 1926 switch (OP(s)) { 1927 case OPLUS_: 1928 plusnest++; 1929 break; 1930 case O_PLUS: 1931 if (plusnest > maxnest) 1932 maxnest = plusnest; 1933 plusnest--; 1934 break; 1935 } 1936 } while (OP(s) != OEND); 1937 if (plusnest != 0) 1938 g->iflags |= BAD; 1939 return(maxnest); 1940 } 1941