1 /* $NetBSD: regcomp.c,v 1.31 2011/10/09 22:14:17 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 35 */ 36 37 /*- 38 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Henry Spencer. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 72 */ 73 74 #include <sys/cdefs.h> 75 #if defined(LIBC_SCCS) && !defined(lint) 76 #if 0 77 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 78 #else 79 __RCSID("$NetBSD: regcomp.c,v 1.31 2011/10/09 22:14:17 christos Exp $"); 80 #endif 81 #endif /* LIBC_SCCS and not lint */ 82 83 #include "namespace.h" 84 #include <sys/types.h> 85 86 #include <assert.h> 87 #include <ctype.h> 88 #include <limits.h> 89 #include <stdio.h> 90 #include <stdlib.h> 91 #include <string.h> 92 #include <regex.h> 93 94 #ifdef __weak_alias 95 __weak_alias(regcomp,_regcomp) 96 #endif 97 98 #include "utils.h" 99 #include "regex2.h" 100 101 #include "cclass.h" 102 #include "cname.h" 103 104 /* 105 * parse structure, passed up and down to avoid global variables and 106 * other clumsinesses 107 */ 108 struct parse { 109 const char *next; /* next character in RE */ 110 const char *end; /* end of string (-> NUL normally) */ 111 int error; /* has an error been seen? */ 112 sop *strip; /* malloced strip */ 113 sopno ssize; /* malloced strip size (allocated) */ 114 sopno slen; /* malloced strip length (used) */ 115 int ncsalloc; /* number of csets allocated */ 116 struct re_guts *g; 117 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 118 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 119 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 120 }; 121 122 /* ========= begin header generated by ./mkh ========= */ 123 #ifdef __cplusplus 124 extern "C" { 125 #endif 126 127 /* === regcomp.c === */ 128 static void p_ere(struct parse *p, int stop, size_t reclimit); 129 static void p_ere_exp(struct parse *p, size_t reclimit); 130 static void p_str(struct parse *p); 131 static void p_bre(struct parse *p, int end1, int end2, size_t reclimit); 132 static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 133 static int p_count(struct parse *p); 134 static void p_bracket(struct parse *p); 135 static void p_b_term(struct parse *p, cset *cs); 136 static void p_b_cclass(struct parse *p, cset *cs); 137 static void p_b_eclass(struct parse *p, cset *cs); 138 static char p_b_symbol(struct parse *p); 139 static char p_b_coll_elem(struct parse *p, int endc); 140 static int othercase(int ch); 141 static void bothcases(struct parse *p, int ch); 142 static void ordinary(struct parse *p, int ch); 143 static void nonnewline(struct parse *p); 144 static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 145 static int seterr(struct parse *p, int e); 146 static cset *allocset(struct parse *p); 147 static void freeset(struct parse *p, cset *cs); 148 static sopno freezeset(struct parse *p, cset *cs); 149 static int firstch(struct parse *p, cset *cs); 150 static int nch(struct parse *p, cset *cs); 151 static void mcadd(struct parse *p, cset *cs, const char *cp); 152 #if 0 153 static void mcsub(cset *cs, char *cp); 154 static int mcin(cset *cs, char *cp); 155 static char *mcfind(cset *cs, char *cp); 156 #endif 157 static void mcinvert(struct parse *p, cset *cs); 158 static void mccase(struct parse *p, cset *cs); 159 static int isinsets(struct re_guts *g, int c); 160 static int samesets(struct re_guts *g, int c1, int c2); 161 static void categorize(struct parse *p, struct re_guts *g); 162 static sopno dupl(struct parse *p, sopno start, sopno finish); 163 static void doemit(struct parse *p, sop op, sopno opnd); 164 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos); 165 static void dofwd(struct parse *p, sopno pos, sopno value); 166 static int enlarge(struct parse *p, sopno size); 167 static void stripsnug(struct parse *p, struct re_guts *g); 168 static void findmust(struct parse *p, struct re_guts *g); 169 static sopno pluscount(struct parse *p, struct re_guts *g); 170 171 #ifdef __cplusplus 172 } 173 #endif 174 /* ========= end header generated by ./mkh ========= */ 175 176 static char nuls[10]; /* place to point scanner in event of error */ 177 178 /* 179 * macros for use with parse structure 180 * BEWARE: these know that the parse structure is named `p' !!! 181 */ 182 #define PEEK() (*p->next) 183 #define PEEK2() (*(p->next+1)) 184 #define MORE() (p->next < p->end) 185 #define MORE2() (p->next+1 < p->end) 186 #define SEE(c) (MORE() && PEEK() == (c)) 187 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 188 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 189 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 190 #define NEXT() (p->next++) 191 #define NEXT2() (p->next += 2) 192 #define NEXTn(n) (p->next += (n)) 193 #define GETNEXT() (*p->next++) 194 #define SETERROR(e) seterr(p, (e)) 195 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 196 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 197 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 198 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 199 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 200 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 201 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 202 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 203 #define HERE() (p->slen) 204 #define THERE() (p->slen - 1) 205 #define THERETHERE() (p->slen - 2) 206 #define DROP(n) (p->slen -= (n)) 207 208 #ifndef NDEBUG 209 static int never = 0; /* for use in asserts; shuts lint up */ 210 #else 211 #define never 0 /* some <assert.h>s have bugs too */ 212 #endif 213 214 #define MEMLIMIT 0x8000000 215 #define MEMSIZE(p) \ 216 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 217 (p)->ncsalloc * sizeof(cset) + \ 218 (p)->ssize * sizeof(sop)) 219 #define RECLIMIT 256 220 221 /* 222 - regcomp - interface for parser and compilation 223 = extern int regcomp(regex_t *, const char *, int); 224 = #define REG_BASIC 0000 225 = #define REG_EXTENDED 0001 226 = #define REG_ICASE 0002 227 = #define REG_NOSUB 0004 228 = #define REG_NEWLINE 0010 229 = #define REG_NOSPEC 0020 230 = #define REG_PEND 0040 231 = #define REG_DUMP 0200 232 */ 233 int /* 0 success, otherwise REG_something */ 234 regcomp( 235 regex_t *preg, 236 const char *pattern, 237 int cflags) 238 { 239 struct parse pa; 240 struct re_guts *g; 241 struct parse *p = &pa; 242 int i; 243 size_t len; 244 #ifdef REDEBUG 245 # define GOODFLAGS(f) (f) 246 #else 247 # define GOODFLAGS(f) ((f)&~REG_DUMP) 248 #endif 249 250 _DIAGASSERT(preg != NULL); 251 _DIAGASSERT(pattern != NULL); 252 253 cflags = GOODFLAGS(cflags); 254 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 255 return(REG_INVARG); 256 257 if (cflags®_PEND) { 258 if (preg->re_endp < pattern) 259 return(REG_INVARG); 260 len = preg->re_endp - pattern; 261 } else 262 len = strlen(pattern); 263 264 /* do the mallocs early so failure handling is easy */ 265 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 266 (NC-1)*sizeof(cat_t)); 267 if (g == NULL) 268 return(REG_ESPACE); 269 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 270 p->strip = malloc(p->ssize * sizeof(sop)); 271 p->slen = 0; 272 if (p->strip == NULL) { 273 free(g); 274 return(REG_ESPACE); 275 } 276 277 /* set things up */ 278 p->g = g; 279 p->next = pattern; 280 p->end = p->next + len; 281 p->error = 0; 282 p->ncsalloc = 0; 283 for (i = 0; i < NPAREN; i++) { 284 p->pbegin[i] = 0; 285 p->pend[i] = 0; 286 } 287 g->csetsize = NC; 288 g->sets = NULL; 289 g->setbits = NULL; 290 g->ncsets = 0; 291 g->cflags = cflags; 292 g->iflags = 0; 293 g->nbol = 0; 294 g->neol = 0; 295 g->must = NULL; 296 g->mlen = 0; 297 g->nsub = 0; 298 g->ncategories = 1; /* category 0 is "everything else" */ 299 g->categories = &g->catspace[-(CHAR_MIN)]; 300 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 301 g->backrefs = 0; 302 303 /* do it */ 304 EMIT(OEND, 0); 305 g->firststate = THERE(); 306 if (cflags®_EXTENDED) 307 p_ere(p, OUT, 0); 308 else if (cflags®_NOSPEC) 309 p_str(p); 310 else 311 p_bre(p, OUT, OUT, 0); 312 EMIT(OEND, 0); 313 g->laststate = THERE(); 314 315 /* tidy up loose ends and fill things in */ 316 categorize(p, g); 317 stripsnug(p, g); 318 findmust(p, g); 319 g->nplus = pluscount(p, g); 320 g->magic = MAGIC2; 321 preg->re_nsub = g->nsub; 322 preg->re_g = g; 323 preg->re_magic = MAGIC1; 324 #ifndef REDEBUG 325 /* not debugging, so can't rely on the assert() in regexec() */ 326 if (g->iflags&BAD) 327 SETERROR(REG_ASSERT); 328 #endif 329 330 /* win or lose, we're done */ 331 if (p->error != 0) /* lose */ 332 regfree(preg); 333 return(p->error); 334 } 335 336 /* 337 - p_ere - ERE parser top level, concatenation and alternation 338 == static void p_ere(struct parse *p, int stop, size_t reclimit); 339 */ 340 static void 341 p_ere( 342 struct parse *p, 343 int stop, /* character this ERE should end at */ 344 size_t reclimit) 345 { 346 char c; 347 sopno prevback = 0; /* pacify gcc */ 348 sopno prevfwd = 0; /* pacify gcc */ 349 sopno conc; 350 int first = 1; /* is this the first alternative? */ 351 352 _DIAGASSERT(p != NULL); 353 354 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 355 p->error = REG_ESPACE; 356 return; 357 } 358 359 for (;;) { 360 /* do a bunch of concatenated expressions */ 361 conc = HERE(); 362 while (MORE() && (c = PEEK()) != '|' && c != stop) 363 p_ere_exp(p, reclimit); 364 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 365 366 if (!EAT('|')) 367 break; /* NOTE BREAK OUT */ 368 369 if (first) { 370 INSERT(OCH_, conc); /* offset is wrong */ 371 prevfwd = conc; 372 prevback = conc; 373 first = 0; 374 } 375 ASTERN(OOR1, prevback); 376 prevback = THERE(); 377 AHEAD(prevfwd); /* fix previous offset */ 378 prevfwd = HERE(); 379 EMIT(OOR2, 0); /* offset is very wrong */ 380 } 381 382 if (!first) { /* tail-end fixups */ 383 AHEAD(prevfwd); 384 ASTERN(O_CH, prevback); 385 } 386 387 assert(!MORE() || SEE(stop)); 388 } 389 390 /* 391 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 392 == static void p_ere_exp(struct parse *p, size_t reclimit); 393 */ 394 static void 395 p_ere_exp( 396 struct parse *p, 397 size_t reclimit) 398 { 399 char c; 400 sopno pos; 401 int count; 402 int count2; 403 sopno subno; 404 int wascaret = 0; 405 406 _DIAGASSERT(p != NULL); 407 408 assert(MORE()); /* caller should have ensured this */ 409 c = GETNEXT(); 410 411 pos = HERE(); 412 switch (c) { 413 case '(': 414 REQUIRE(MORE(), REG_EPAREN); 415 p->g->nsub++; 416 subno = p->g->nsub; 417 if (subno < NPAREN) 418 p->pbegin[subno] = HERE(); 419 EMIT(OLPAREN, subno); 420 if (!SEE(')')) 421 p_ere(p, ')', reclimit); 422 if (subno < NPAREN) { 423 p->pend[subno] = HERE(); 424 assert(p->pend[subno] != 0); 425 } 426 EMIT(ORPAREN, subno); 427 MUSTEAT(')', REG_EPAREN); 428 break; 429 #ifndef POSIX_MISTAKE 430 case ')': /* happens only if no current unmatched ( */ 431 /* 432 * You may ask, why the ifndef? Because I didn't notice 433 * this until slightly too late for 1003.2, and none of the 434 * other 1003.2 regular-expression reviewers noticed it at 435 * all. So an unmatched ) is legal POSIX, at least until 436 * we can get it fixed. 437 */ 438 SETERROR(REG_EPAREN); 439 break; 440 #endif 441 case '^': 442 EMIT(OBOL, 0); 443 p->g->iflags |= USEBOL; 444 p->g->nbol++; 445 wascaret = 1; 446 break; 447 case '$': 448 EMIT(OEOL, 0); 449 p->g->iflags |= USEEOL; 450 p->g->neol++; 451 break; 452 case '|': 453 SETERROR(REG_EMPTY); 454 break; 455 case '*': 456 case '+': 457 case '?': 458 SETERROR(REG_BADRPT); 459 break; 460 case '.': 461 if (p->g->cflags®_NEWLINE) 462 nonnewline(p); 463 else 464 EMIT(OANY, 0); 465 break; 466 case '[': 467 p_bracket(p); 468 break; 469 case '\\': 470 REQUIRE(MORE(), REG_EESCAPE); 471 c = GETNEXT(); 472 ordinary(p, c); 473 break; 474 case '{': /* okay as ordinary except if digit follows */ 475 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT); 476 /* FALLTHROUGH */ 477 default: 478 ordinary(p, c); 479 break; 480 } 481 482 if (!MORE()) 483 return; 484 c = PEEK(); 485 /* we call { a repetition if followed by a digit */ 486 if (!( c == '*' || c == '+' || c == '?' || 487 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) )) 488 return; /* no repetition, we're done */ 489 NEXT(); 490 491 REQUIRE(!wascaret, REG_BADRPT); 492 switch (c) { 493 case '*': /* implemented as +? */ 494 /* this case does not require the (y|) trick, noKLUDGE */ 495 INSERT(OPLUS_, pos); 496 ASTERN(O_PLUS, pos); 497 INSERT(OQUEST_, pos); 498 ASTERN(O_QUEST, pos); 499 break; 500 case '+': 501 INSERT(OPLUS_, pos); 502 ASTERN(O_PLUS, pos); 503 break; 504 case '?': 505 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 506 INSERT(OCH_, pos); /* offset slightly wrong */ 507 ASTERN(OOR1, pos); /* this one's right */ 508 AHEAD(pos); /* fix the OCH_ */ 509 EMIT(OOR2, 0); /* offset very wrong... */ 510 AHEAD(THERE()); /* ...so fix it */ 511 ASTERN(O_CH, THERETHERE()); 512 break; 513 case '{': 514 count = p_count(p); 515 if (EAT(',')) { 516 if (isdigit((unsigned char)PEEK())) { 517 count2 = p_count(p); 518 REQUIRE(count <= count2, REG_BADBR); 519 } else /* single number with comma */ 520 count2 = INFINITY; 521 } else /* just a single number */ 522 count2 = count; 523 repeat(p, pos, count, count2, 0); 524 if (!EAT('}')) { /* error heuristics */ 525 while (MORE() && PEEK() != '}') 526 NEXT(); 527 REQUIRE(MORE(), REG_EBRACE); 528 SETERROR(REG_BADBR); 529 } 530 break; 531 } 532 533 if (!MORE()) 534 return; 535 c = PEEK(); 536 if (!( c == '*' || c == '+' || c == '?' || 537 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) ) 538 return; 539 SETERROR(REG_BADRPT); 540 } 541 542 /* 543 - p_str - string (no metacharacters) "parser" 544 == static void p_str(struct parse *p); 545 */ 546 static void 547 p_str( 548 struct parse *p) 549 { 550 551 _DIAGASSERT(p != NULL); 552 553 REQUIRE(MORE(), REG_EMPTY); 554 while (MORE()) 555 ordinary(p, GETNEXT()); 556 } 557 558 /* 559 - p_bre - BRE parser top level, anchoring and concatenation 560 == static void p_bre(struct parse *p, int end1, \ 561 == int end2, size_t reclimit); 562 * Giving end1 as OUT essentially eliminates the end1/end2 check. 563 * 564 * This implementation is a bit of a kludge, in that a trailing $ is first 565 * taken as an ordinary character and then revised to be an anchor. The 566 * only undesirable side effect is that '$' gets included as a character 567 * category in such cases. This is fairly harmless; not worth fixing. 568 * The amount of lookahead needed to avoid this kludge is excessive. 569 */ 570 static void 571 p_bre( 572 struct parse *p, 573 int end1, /* first terminating character */ 574 int end2, /* second terminating character */ 575 size_t reclimit) 576 { 577 sopno start; 578 int first = 1; /* first subexpression? */ 579 int wasdollar = 0; 580 581 _DIAGASSERT(p != NULL); 582 583 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 584 p->error = REG_ESPACE; 585 return; 586 } 587 588 start = HERE(); 589 590 if (EAT('^')) { 591 EMIT(OBOL, 0); 592 p->g->iflags |= USEBOL; 593 p->g->nbol++; 594 } 595 while (MORE() && !SEETWO(end1, end2)) { 596 wasdollar = p_simp_re(p, first, reclimit); 597 first = 0; 598 } 599 if (wasdollar) { /* oops, that was a trailing anchor */ 600 DROP(1); 601 EMIT(OEOL, 0); 602 p->g->iflags |= USEEOL; 603 p->g->neol++; 604 } 605 606 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 607 } 608 609 /* 610 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 611 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 612 */ 613 static int /* was the simple RE an unbackslashed $? */ 614 p_simp_re( 615 struct parse *p, 616 int starordinary, /* is a leading * an ordinary character? */ 617 size_t reclimit) 618 { 619 int c; 620 int count; 621 int count2; 622 sopno pos, i; 623 sopno subno; 624 # define BACKSL (1<<CHAR_BIT) 625 626 _DIAGASSERT(p != NULL); 627 628 pos = HERE(); /* repetion op, if any, covers from here */ 629 630 assert(MORE()); /* caller should have ensured this */ 631 c = GETNEXT(); 632 if (c == '\\') { 633 REQUIRE(MORE(), REG_EESCAPE); 634 c = BACKSL | (unsigned char)GETNEXT(); 635 } 636 switch (c) { 637 case '.': 638 if (p->g->cflags®_NEWLINE) 639 nonnewline(p); 640 else 641 EMIT(OANY, 0); 642 break; 643 case '[': 644 p_bracket(p); 645 break; 646 case BACKSL|'{': 647 SETERROR(REG_BADRPT); 648 break; 649 case BACKSL|'(': 650 p->g->nsub++; 651 subno = p->g->nsub; 652 if (subno < NPAREN) 653 p->pbegin[subno] = HERE(); 654 EMIT(OLPAREN, subno); 655 /* the MORE here is an error heuristic */ 656 if (MORE() && !SEETWO('\\', ')')) 657 p_bre(p, '\\', ')', reclimit); 658 if (subno < NPAREN) { 659 p->pend[subno] = HERE(); 660 assert(p->pend[subno] != 0); 661 } 662 EMIT(ORPAREN, subno); 663 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 664 break; 665 case BACKSL|')': /* should not get here -- must be user */ 666 case BACKSL|'}': 667 SETERROR(REG_EPAREN); 668 break; 669 case BACKSL|'1': 670 case BACKSL|'2': 671 case BACKSL|'3': 672 case BACKSL|'4': 673 case BACKSL|'5': 674 case BACKSL|'6': 675 case BACKSL|'7': 676 case BACKSL|'8': 677 case BACKSL|'9': 678 i = (c&~BACKSL) - '0'; 679 assert(i < NPAREN); 680 if (p->pend[i] != 0) { 681 assert(i <= p->g->nsub); 682 EMIT(OBACK_, i); 683 assert(p->pbegin[i] != 0); 684 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 685 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 686 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 687 EMIT(O_BACK, i); 688 } else 689 SETERROR(REG_ESUBREG); 690 p->g->backrefs = 1; 691 break; 692 case '*': 693 REQUIRE(starordinary, REG_BADRPT); 694 /* FALLTHROUGH */ 695 default: 696 ordinary(p, c &~ BACKSL); 697 break; 698 } 699 700 if (EAT('*')) { /* implemented as +? */ 701 /* this case does not require the (y|) trick, noKLUDGE */ 702 INSERT(OPLUS_, pos); 703 ASTERN(O_PLUS, pos); 704 INSERT(OQUEST_, pos); 705 ASTERN(O_QUEST, pos); 706 } else if (EATTWO('\\', '{')) { 707 count = p_count(p); 708 if (EAT(',')) { 709 if (MORE() && isdigit((unsigned char)PEEK())) { 710 count2 = p_count(p); 711 REQUIRE(count <= count2, REG_BADBR); 712 } else /* single number with comma */ 713 count2 = INFINITY; 714 } else /* just a single number */ 715 count2 = count; 716 repeat(p, pos, count, count2, 0); 717 if (!EATTWO('\\', '}')) { /* error heuristics */ 718 while (MORE() && !SEETWO('\\', '}')) 719 NEXT(); 720 REQUIRE(MORE(), REG_EBRACE); 721 SETERROR(REG_BADBR); 722 } 723 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 724 return(1); 725 726 return(0); 727 } 728 729 /* 730 - p_count - parse a repetition count 731 == static int p_count(struct parse *p); 732 */ 733 static int /* the value */ 734 p_count( 735 struct parse *p) 736 { 737 int count = 0; 738 int ndigits = 0; 739 740 _DIAGASSERT(p != NULL); 741 742 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) { 743 count = count*10 + (GETNEXT() - '0'); 744 ndigits++; 745 } 746 747 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 748 return(count); 749 } 750 751 /* 752 - p_bracket - parse a bracketed character list 753 == static void p_bracket(struct parse *p); 754 * 755 * Note a significant property of this code: if the allocset() did SETERROR, 756 * no set operations are done. 757 */ 758 static void 759 p_bracket( 760 struct parse *p) 761 { 762 cset *cs; 763 int invert = 0; 764 765 _DIAGASSERT(p != NULL); 766 767 cs = allocset(p); 768 if (cs == NULL) 769 return; 770 771 /* Dept of Truly Sickening Special-Case Kludges */ 772 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 773 (size_t)6) == 0) { 774 EMIT(OBOW, 0); 775 NEXTn(6); 776 return; 777 } 778 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 779 (size_t)6) == 0) { 780 EMIT(OEOW, 0); 781 NEXTn(6); 782 return; 783 } 784 785 if (EAT('^')) 786 invert++; /* make note to invert set at end */ 787 if (EAT(']')) 788 CHadd(cs, ']'); 789 else if (EAT('-')) 790 CHadd(cs, '-'); 791 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 792 p_b_term(p, cs); 793 if (EAT('-')) 794 CHadd(cs, '-'); 795 MUSTEAT(']', REG_EBRACK); 796 797 if (p->error != 0) /* don't mess things up further */ 798 return; 799 800 if (p->g->cflags®_ICASE) { 801 int i; 802 int ci; 803 804 for (i = p->g->csetsize - 1; i >= 0; i--) 805 if (CHIN(cs, i) && isalpha(i)) { 806 ci = othercase(i); 807 if (ci != i) 808 CHadd(cs, ci); 809 } 810 if (cs->multis != NULL) 811 mccase(p, cs); 812 } 813 if (invert) { 814 int i; 815 816 for (i = p->g->csetsize - 1; i >= 0; i--) 817 if (CHIN(cs, i)) 818 CHsub(cs, i); 819 else 820 CHadd(cs, i); 821 if (p->g->cflags®_NEWLINE) 822 CHsub(cs, '\n'); 823 if (cs->multis != NULL) 824 mcinvert(p, cs); 825 } 826 827 assert(cs->multis == NULL); /* xxx */ 828 829 if (nch(p, cs) == 1) { /* optimize singleton sets */ 830 ordinary(p, firstch(p, cs)); 831 freeset(p, cs); 832 } else 833 EMIT(OANYOF, freezeset(p, cs)); 834 } 835 836 /* 837 - p_b_term - parse one term of a bracketed character list 838 == static void p_b_term(struct parse *p, cset *cs); 839 */ 840 static void 841 p_b_term( 842 struct parse *p, 843 cset *cs) 844 { 845 char c; 846 char start, finish; 847 int i; 848 849 _DIAGASSERT(p != NULL); 850 _DIAGASSERT(cs != NULL); 851 852 /* classify what we've got */ 853 switch ((MORE()) ? PEEK() : '\0') { 854 case '[': 855 c = (MORE2()) ? PEEK2() : '\0'; 856 break; 857 858 case '-': 859 SETERROR(REG_ERANGE); 860 return; /* NOTE RETURN */ 861 862 default: 863 c = '\0'; 864 break; 865 } 866 867 switch (c) { 868 case ':': /* character class */ 869 NEXT2(); 870 REQUIRE(MORE(), REG_EBRACK); 871 c = PEEK(); 872 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 873 p_b_cclass(p, cs); 874 REQUIRE(MORE(), REG_EBRACK); 875 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 876 break; 877 case '=': /* equivalence class */ 878 NEXT2(); 879 REQUIRE(MORE(), REG_EBRACK); 880 c = PEEK(); 881 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 882 p_b_eclass(p, cs); 883 REQUIRE(MORE(), REG_EBRACK); 884 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 885 break; 886 default: /* symbol, ordinary character, or range */ 887 /* xxx revision needed for multichar stuff */ 888 start = p_b_symbol(p); 889 if (SEE('-') && MORE2() && PEEK2() != ']') { 890 /* range */ 891 NEXT(); 892 if (EAT('-')) 893 finish = '-'; 894 else 895 finish = p_b_symbol(p); 896 } else 897 finish = start; 898 /* xxx what about signed chars here... */ 899 REQUIRE(start <= finish, REG_ERANGE); 900 for (i = start; i <= finish; i++) 901 CHadd(cs, i); 902 break; 903 } 904 } 905 906 /* 907 - p_b_cclass - parse a character-class name and deal with it 908 == static void p_b_cclass(struct parse *p, cset *cs); 909 */ 910 static void 911 p_b_cclass( 912 struct parse *p, 913 cset *cs) 914 { 915 const char *sp; 916 const struct cclass *cp; 917 size_t len; 918 const char *u; 919 char c; 920 921 _DIAGASSERT(p != NULL); 922 _DIAGASSERT(cs != NULL); 923 924 sp = p->next; 925 926 while (MORE() && isalpha((unsigned char)PEEK())) 927 NEXT(); 928 len = p->next - sp; 929 for (cp = cclasses; cp->name != NULL; cp++) 930 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 931 break; 932 if (cp->name == NULL) { 933 /* oops, didn't find it */ 934 SETERROR(REG_ECTYPE); 935 return; 936 } 937 938 u = cp->chars; 939 while ((c = *u++) != '\0') 940 CHadd(cs, c); 941 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 942 MCadd(p, cs, u); 943 } 944 945 /* 946 - p_b_eclass - parse an equivalence-class name and deal with it 947 == static void p_b_eclass(struct parse *p, cset *cs); 948 * 949 * This implementation is incomplete. xxx 950 */ 951 static void 952 p_b_eclass( 953 struct parse *p, 954 cset *cs) 955 { 956 char c; 957 958 _DIAGASSERT(p != NULL); 959 _DIAGASSERT(cs != NULL); 960 961 c = p_b_coll_elem(p, '='); 962 CHadd(cs, c); 963 } 964 965 /* 966 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 967 == static char p_b_symbol(struct parse *p); 968 */ 969 static char /* value of symbol */ 970 p_b_symbol( 971 struct parse *p) 972 { 973 char value; 974 975 _DIAGASSERT(p != NULL); 976 977 REQUIRE(MORE(), REG_EBRACK); 978 if (!EATTWO('[', '.')) 979 return(GETNEXT()); 980 981 /* collating symbol */ 982 value = p_b_coll_elem(p, '.'); 983 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 984 return(value); 985 } 986 987 /* 988 - p_b_coll_elem - parse a collating-element name and look it up 989 == static char p_b_coll_elem(struct parse *p, int endc); 990 */ 991 static char /* value of collating element */ 992 p_b_coll_elem( 993 struct parse *p, 994 int endc) /* name ended by endc,']' */ 995 { 996 const char *sp; 997 const struct cname *cp; 998 size_t len; 999 1000 _DIAGASSERT(p != NULL); 1001 1002 sp = p->next; 1003 1004 while (MORE() && !SEETWO(endc, ']')) 1005 NEXT(); 1006 if (!MORE()) { 1007 SETERROR(REG_EBRACK); 1008 return(0); 1009 } 1010 len = p->next - sp; 1011 for (cp = cnames; cp->name != NULL; cp++) 1012 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 1013 return(cp->code); /* known name */ 1014 if (len == 1) 1015 return(*sp); /* single character */ 1016 SETERROR(REG_ECOLLATE); /* neither */ 1017 return(0); 1018 } 1019 1020 /* 1021 - othercase - return the case counterpart of an alphabetic 1022 == static int othercase(int ch); 1023 */ 1024 static int /* if no counterpart, return ch */ 1025 othercase( 1026 int ch) 1027 { 1028 assert(isalpha(ch)); 1029 if (isupper(ch)) 1030 return(tolower(ch)); 1031 else if (islower(ch)) 1032 return(toupper(ch)); 1033 else /* peculiar, but could happen */ 1034 return(ch); 1035 } 1036 1037 /* 1038 - bothcases - emit a dualcase version of a two-case character 1039 == static void bothcases(struct parse *p, int ch); 1040 * 1041 * Boy, is this implementation ever a kludge... 1042 */ 1043 static void 1044 bothcases( 1045 struct parse *p, 1046 int ch) 1047 { 1048 const char *oldnext; 1049 const char *oldend; 1050 char bracket[3]; 1051 1052 _DIAGASSERT(p != NULL); 1053 1054 oldnext = p->next; 1055 oldend = p->end; 1056 1057 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1058 p->next = bracket; 1059 p->end = bracket+2; 1060 bracket[0] = ch; 1061 bracket[1] = ']'; 1062 bracket[2] = '\0'; 1063 p_bracket(p); 1064 assert(p->next == bracket+2); 1065 p->next = oldnext; 1066 p->end = oldend; 1067 } 1068 1069 /* 1070 - ordinary - emit an ordinary character 1071 == static void ordinary(struct parse *p, int ch); 1072 */ 1073 static void 1074 ordinary( 1075 struct parse *p, 1076 int ch) 1077 { 1078 cat_t *cap; 1079 1080 _DIAGASSERT(p != NULL); 1081 1082 cap = p->g->categories; 1083 if ((p->g->cflags®_ICASE) && isalpha((unsigned char) ch) 1084 && othercase((unsigned char) ch) != (unsigned char) ch) 1085 bothcases(p, (unsigned char) ch); 1086 else { 1087 EMIT(OCHAR, (sopno)(unsigned char)ch); 1088 if (cap[ch] == 0) 1089 cap[ch] = p->g->ncategories++; 1090 } 1091 } 1092 1093 /* 1094 - nonnewline - emit REG_NEWLINE version of OANY 1095 == static void nonnewline(struct parse *p); 1096 * 1097 * Boy, is this implementation ever a kludge... 1098 */ 1099 static void 1100 nonnewline( 1101 struct parse *p) 1102 { 1103 const char *oldnext; 1104 const char *oldend; 1105 char bracket[4]; 1106 1107 _DIAGASSERT(p != NULL); 1108 1109 oldnext = p->next; 1110 oldend = p->end; 1111 1112 p->next = bracket; 1113 p->end = bracket+3; 1114 bracket[0] = '^'; 1115 bracket[1] = '\n'; 1116 bracket[2] = ']'; 1117 bracket[3] = '\0'; 1118 p_bracket(p); 1119 assert(p->next == bracket+3); 1120 p->next = oldnext; 1121 p->end = oldend; 1122 } 1123 1124 /* 1125 - repeat - generate code for a bounded repetition, recursively if needed 1126 == static void repeat(struct parse *p, sopno start, int from, int to, 1127 == size_t reclimit); 1128 */ 1129 static void 1130 repeat( 1131 struct parse *p, 1132 sopno start, /* operand from here to end of strip */ 1133 int from, /* repeated from this number */ 1134 int to, /* to this number of times (maybe INFINITY) */ 1135 size_t reclimit) 1136 { 1137 sopno finish; 1138 # define N 2 1139 # define INF 3 1140 # define REP(f, t) ((f)*8 + (t)) 1141 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1142 sopno copy; 1143 1144 _DIAGASSERT(p != NULL); 1145 1146 if (reclimit++ > RECLIMIT) 1147 p->error = REG_ESPACE; 1148 if (p->error) 1149 return; 1150 1151 finish = HERE(); 1152 1153 assert(from <= to); 1154 1155 switch (REP(MAP(from), MAP(to))) { 1156 case REP(0, 0): /* must be user doing this */ 1157 DROP(finish-start); /* drop the operand */ 1158 break; 1159 case REP(0, 1): /* as x{1,1}? */ 1160 case REP(0, N): /* as x{1,n}? */ 1161 case REP(0, INF): /* as x{1,}? */ 1162 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1163 INSERT(OCH_, start); /* offset is wrong... */ 1164 repeat(p, start+1, 1, to, reclimit); 1165 ASTERN(OOR1, start); 1166 AHEAD(start); /* ... fix it */ 1167 EMIT(OOR2, 0); 1168 AHEAD(THERE()); 1169 ASTERN(O_CH, THERETHERE()); 1170 break; 1171 case REP(1, 1): /* trivial case */ 1172 /* done */ 1173 break; 1174 case REP(1, N): /* as x?x{1,n-1} */ 1175 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1176 INSERT(OCH_, start); 1177 ASTERN(OOR1, start); 1178 AHEAD(start); 1179 EMIT(OOR2, 0); /* offset very wrong... */ 1180 AHEAD(THERE()); /* ...so fix it */ 1181 ASTERN(O_CH, THERETHERE()); 1182 copy = dupl(p, start+1, finish+1); 1183 assert(copy == finish+4); 1184 repeat(p, copy, 1, to-1, reclimit); 1185 break; 1186 case REP(1, INF): /* as x+ */ 1187 INSERT(OPLUS_, start); 1188 ASTERN(O_PLUS, start); 1189 break; 1190 case REP(N, N): /* as xx{m-1,n-1} */ 1191 copy = dupl(p, start, finish); 1192 repeat(p, copy, from-1, to-1, reclimit); 1193 break; 1194 case REP(N, INF): /* as xx{n-1,INF} */ 1195 copy = dupl(p, start, finish); 1196 repeat(p, copy, from-1, to, reclimit); 1197 break; 1198 default: /* "can't happen" */ 1199 SETERROR(REG_ASSERT); /* just in case */ 1200 break; 1201 } 1202 } 1203 1204 /* 1205 - seterr - set an error condition 1206 == static int seterr(struct parse *p, int e); 1207 */ 1208 static int /* useless but makes type checking happy */ 1209 seterr( 1210 struct parse *p, 1211 int e) 1212 { 1213 1214 _DIAGASSERT(p != NULL); 1215 1216 if (p->error == 0) /* keep earliest error condition */ 1217 p->error = e; 1218 p->next = nuls; /* try to bring things to a halt */ 1219 p->end = nuls; 1220 return(0); /* make the return value well-defined */ 1221 } 1222 1223 /* 1224 - allocset - allocate a set of characters for [] 1225 == static cset *allocset(struct parse *p); 1226 */ 1227 static cset * 1228 allocset( 1229 struct parse *p) 1230 { 1231 int no; 1232 size_t nc; 1233 size_t nbytes; 1234 cset *cs; 1235 size_t css; 1236 int i; 1237 1238 _DIAGASSERT(p != NULL); 1239 1240 no = p->g->ncsets++; 1241 css = (size_t)p->g->csetsize; 1242 if (no >= p->ncsalloc) { /* need another column of space */ 1243 p->ncsalloc += CHAR_BIT; 1244 nc = p->ncsalloc; 1245 assert(nc % CHAR_BIT == 0); 1246 nbytes = nc / CHAR_BIT * css; 1247 if (MEMSIZE(p) > MEMLIMIT) 1248 goto oomem; 1249 if (p->g->sets == NULL) 1250 p->g->sets = malloc(nc * sizeof(cset)); 1251 else 1252 p->g->sets = realloc(p->g->sets, nc * sizeof(cset)); 1253 if (p->g->setbits == NULL) 1254 p->g->setbits = malloc(nbytes); 1255 else { 1256 p->g->setbits = realloc(p->g->setbits, nbytes); 1257 /* xxx this isn't right if setbits is now NULL */ 1258 for (i = 0; i < no; i++) 1259 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1260 } 1261 if (p->g->sets != NULL && p->g->setbits != NULL) 1262 (void) memset((char *)p->g->setbits + (nbytes - css), 1263 0, css); 1264 else { 1265 oomem: 1266 no = 0; 1267 SETERROR(REG_ESPACE); 1268 /* caller's responsibility not to do set ops */ 1269 return NULL; 1270 } 1271 } 1272 1273 cs = &p->g->sets[no]; 1274 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1275 cs->mask = 1 << ((no) % CHAR_BIT); 1276 cs->hash = 0; 1277 cs->smultis = 0; 1278 cs->multis = NULL; 1279 1280 return(cs); 1281 } 1282 1283 /* 1284 - freeset - free a now-unused set 1285 == static void freeset(struct parse *p, cset *cs); 1286 */ 1287 static void 1288 freeset( 1289 struct parse *p, 1290 cset *cs) 1291 { 1292 size_t i; 1293 cset *top; 1294 size_t css; 1295 1296 _DIAGASSERT(p != NULL); 1297 _DIAGASSERT(cs != NULL); 1298 1299 top = &p->g->sets[p->g->ncsets]; 1300 css = (size_t)p->g->csetsize; 1301 1302 for (i = 0; i < css; i++) 1303 CHsub(cs, i); 1304 if (cs == top-1) /* recover only the easy case */ 1305 p->g->ncsets--; 1306 } 1307 1308 /* 1309 - freezeset - final processing on a set of characters 1310 == static int freezeset(struct parse *p, cset *cs); 1311 * 1312 * The main task here is merging identical sets. This is usually a waste 1313 * of time (although the hash code minimizes the overhead), but can win 1314 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1315 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1316 * the same value! 1317 */ 1318 static sopno /* set number */ 1319 freezeset( 1320 struct parse *p, 1321 cset *cs) 1322 { 1323 uch h; 1324 size_t i; 1325 cset *top; 1326 cset *cs2; 1327 size_t css; 1328 1329 _DIAGASSERT(p != NULL); 1330 _DIAGASSERT(cs != NULL); 1331 1332 h = cs->hash; 1333 top = &p->g->sets[p->g->ncsets]; 1334 css = (size_t)p->g->csetsize; 1335 1336 /* look for an earlier one which is the same */ 1337 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1338 if (cs2->hash == h && cs2 != cs) { 1339 /* maybe */ 1340 for (i = 0; i < css; i++) 1341 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1342 break; /* no */ 1343 if (i == css) 1344 break; /* yes */ 1345 } 1346 1347 if (cs2 < top) { /* found one */ 1348 freeset(p, cs); 1349 cs = cs2; 1350 } 1351 1352 return (sopno)(cs - p->g->sets); 1353 } 1354 1355 /* 1356 - firstch - return first character in a set (which must have at least one) 1357 == static int firstch(struct parse *p, cset *cs); 1358 */ 1359 static int /* character; there is no "none" value */ 1360 firstch( 1361 struct parse *p, 1362 cset *cs) 1363 { 1364 size_t i; 1365 size_t css; 1366 1367 _DIAGASSERT(p != NULL); 1368 _DIAGASSERT(cs != NULL); 1369 1370 css = (size_t)p->g->csetsize; 1371 1372 for (i = 0; i < css; i++) 1373 if (CHIN(cs, i)) 1374 return((char)i); 1375 assert(never); 1376 return(0); /* arbitrary */ 1377 } 1378 1379 /* 1380 - nch - number of characters in a set 1381 == static int nch(struct parse *p, cset *cs); 1382 */ 1383 static int 1384 nch( 1385 struct parse *p, 1386 cset *cs) 1387 { 1388 size_t i; 1389 size_t css; 1390 int n = 0; 1391 1392 _DIAGASSERT(p != NULL); 1393 _DIAGASSERT(cs != NULL); 1394 1395 css = (size_t)p->g->csetsize; 1396 1397 for (i = 0; i < css; i++) 1398 if (CHIN(cs, i)) 1399 n++; 1400 return(n); 1401 } 1402 1403 /* 1404 - mcadd - add a collating element to a cset 1405 == static void mcadd(struct parse *p, cset *cs, \ 1406 == char *cp); 1407 */ 1408 static void 1409 mcadd( 1410 struct parse *p, 1411 cset *cs, 1412 const char *cp) 1413 { 1414 size_t oldend; 1415 1416 _DIAGASSERT(p != NULL); 1417 _DIAGASSERT(cs != NULL); 1418 _DIAGASSERT(cp != NULL); 1419 1420 oldend = cs->smultis; 1421 1422 cs->smultis += strlen(cp) + 1; 1423 if (cs->multis == NULL) 1424 cs->multis = malloc(cs->smultis); 1425 else 1426 cs->multis = realloc(cs->multis, cs->smultis); 1427 if (cs->multis == NULL) { 1428 SETERROR(REG_ESPACE); 1429 return; 1430 } 1431 1432 (void) strcpy(cs->multis + oldend - 1, cp); 1433 cs->multis[cs->smultis - 1] = '\0'; 1434 } 1435 1436 #if 0 1437 /* 1438 - mcsub - subtract a collating element from a cset 1439 == static void mcsub(cset *cs, char *cp); 1440 */ 1441 static void 1442 mcsub( 1443 cset *cs, 1444 char *cp) 1445 { 1446 char *fp; 1447 size_t len; 1448 1449 _DIAGASSERT(cs != NULL); 1450 _DIAGASSERT(cp != NULL); 1451 1452 fp = mcfind(cs, cp); 1453 len = strlen(fp); 1454 1455 assert(fp != NULL); 1456 (void) memmove(fp, fp + len + 1, 1457 cs->smultis - (fp + len + 1 - cs->multis)); 1458 cs->smultis -= len; 1459 1460 if (cs->smultis == 0) { 1461 free(cs->multis); 1462 cs->multis = NULL; 1463 return; 1464 } 1465 1466 cs->multis = realloc(cs->multis, cs->smultis); 1467 assert(cs->multis != NULL); 1468 } 1469 1470 /* 1471 - mcin - is a collating element in a cset? 1472 == static int mcin(cset *cs, char *cp); 1473 */ 1474 static int 1475 mcin( 1476 cset *cs, 1477 char *cp) 1478 { 1479 1480 _DIAGASSERT(cs != NULL); 1481 _DIAGASSERT(cp != NULL); 1482 1483 return(mcfind(cs, cp) != NULL); 1484 } 1485 1486 /* 1487 - mcfind - find a collating element in a cset 1488 == static char *mcfind(cset *cs, char *cp); 1489 */ 1490 static char * 1491 mcfind( 1492 cset *cs, 1493 char *cp) 1494 { 1495 char *p; 1496 1497 _DIAGASSERT(cs != NULL); 1498 _DIAGASSERT(cp != NULL); 1499 1500 if (cs->multis == NULL) 1501 return(NULL); 1502 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1503 if (strcmp(cp, p) == 0) 1504 return(p); 1505 return(NULL); 1506 } 1507 #endif 1508 1509 /* 1510 - mcinvert - invert the list of collating elements in a cset 1511 == static void mcinvert(struct parse *p, cset *cs); 1512 * 1513 * This would have to know the set of possibilities. Implementation 1514 * is deferred. 1515 */ 1516 /* ARGSUSED */ 1517 static void 1518 mcinvert( 1519 struct parse *p, 1520 cset *cs) 1521 { 1522 1523 _DIAGASSERT(p != NULL); 1524 _DIAGASSERT(cs != NULL); 1525 1526 assert(cs->multis == NULL); /* xxx */ 1527 } 1528 1529 /* 1530 - mccase - add case counterparts of the list of collating elements in a cset 1531 == static void mccase(struct parse *p, cset *cs); 1532 * 1533 * This would have to know the set of possibilities. Implementation 1534 * is deferred. 1535 */ 1536 /* ARGSUSED */ 1537 static void 1538 mccase( 1539 struct parse *p, 1540 cset *cs) 1541 { 1542 1543 _DIAGASSERT(p != NULL); 1544 _DIAGASSERT(cs != NULL); 1545 1546 assert(cs->multis == NULL); /* xxx */ 1547 } 1548 1549 /* 1550 - isinsets - is this character in any sets? 1551 == static int isinsets(struct re_guts *g, int c); 1552 */ 1553 static int /* predicate */ 1554 isinsets( 1555 struct re_guts *g, 1556 int c) 1557 { 1558 uch *col; 1559 int i; 1560 int ncols; 1561 unsigned uc = (unsigned char)c; 1562 1563 _DIAGASSERT(g != NULL); 1564 1565 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1566 1567 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1568 if (col[uc] != 0) 1569 return(1); 1570 return(0); 1571 } 1572 1573 /* 1574 - samesets - are these two characters in exactly the same sets? 1575 == static int samesets(struct re_guts *g, int c1, int c2); 1576 */ 1577 static int /* predicate */ 1578 samesets( 1579 struct re_guts *g, 1580 int c1, 1581 int c2) 1582 { 1583 uch *col; 1584 int i; 1585 int ncols; 1586 unsigned uc1 = (unsigned char)c1; 1587 unsigned uc2 = (unsigned char)c2; 1588 1589 _DIAGASSERT(g != NULL); 1590 1591 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1592 1593 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1594 if (col[uc1] != col[uc2]) 1595 return(0); 1596 return(1); 1597 } 1598 1599 /* 1600 - categorize - sort out character categories 1601 == static void categorize(struct parse *p, struct re_guts *g); 1602 */ 1603 static void 1604 categorize( 1605 struct parse *p, 1606 struct re_guts *g) 1607 { 1608 cat_t *cats; 1609 int c; 1610 int c2; 1611 cat_t cat; 1612 1613 _DIAGASSERT(p != NULL); 1614 _DIAGASSERT(g != NULL); 1615 1616 cats = g->categories; 1617 1618 /* avoid making error situations worse */ 1619 if (p->error != 0) 1620 return; 1621 1622 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1623 if (cats[c] == 0 && isinsets(g, c)) { 1624 cat = g->ncategories++; 1625 cats[c] = cat; 1626 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1627 if (cats[c2] == 0 && samesets(g, c, c2)) 1628 cats[c2] = cat; 1629 } 1630 } 1631 1632 /* 1633 - dupl - emit a duplicate of a bunch of sops 1634 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1635 */ 1636 static sopno /* start of duplicate */ 1637 dupl( 1638 struct parse *p, 1639 sopno start, /* from here */ 1640 sopno finish) /* to this less one */ 1641 { 1642 sopno ret; 1643 sopno len = finish - start; 1644 1645 _DIAGASSERT(p != NULL); 1646 1647 ret = HERE(); 1648 1649 assert(finish >= start); 1650 if (len == 0) 1651 return(ret); 1652 if (!enlarge(p, p->ssize + len))/* this many unexpected additions */ 1653 return ret; 1654 (void)memcpy(p->strip + p->slen, p->strip + start, 1655 (size_t)len * sizeof(sop)); 1656 p->slen += len; 1657 return(ret); 1658 } 1659 1660 /* 1661 - doemit - emit a strip operator 1662 == static void doemit(struct parse *p, sop op, size_t opnd); 1663 * 1664 * It might seem better to implement this as a macro with a function as 1665 * hard-case backup, but it's just too big and messy unless there are 1666 * some changes to the data structures. Maybe later. 1667 */ 1668 static void 1669 doemit( 1670 struct parse *p, 1671 sop op, 1672 sopno opnd) 1673 { 1674 1675 _DIAGASSERT(p != NULL); 1676 1677 /* avoid making error situations worse */ 1678 if (p->error != 0) 1679 return; 1680 1681 /* deal with oversize operands ("can't happen", more or less) */ 1682 assert(opnd < 1<<OPSHIFT); 1683 1684 /* deal with undersized strip */ 1685 if (p->slen >= p->ssize) 1686 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1687 return; 1688 1689 /* finally, it's all reduced to the easy case */ 1690 p->strip[p->slen++] = SOP(op, opnd); 1691 } 1692 1693 /* 1694 - doinsert - insert a sop into the strip 1695 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1696 */ 1697 static void 1698 doinsert( 1699 struct parse *p, 1700 sop op, 1701 sopno opnd, 1702 sopno pos) 1703 { 1704 sopno sn; 1705 sop s; 1706 int i; 1707 1708 _DIAGASSERT(p != NULL); 1709 1710 /* avoid making error situations worse */ 1711 if (p->error != 0) 1712 return; 1713 1714 sn = HERE(); 1715 EMIT(op, opnd); /* do checks, ensure space */ 1716 assert(HERE() == sn+1); 1717 s = p->strip[sn]; 1718 1719 /* adjust paren pointers */ 1720 assert(pos > 0); 1721 for (i = 1; i < NPAREN; i++) { 1722 if (p->pbegin[i] >= pos) { 1723 p->pbegin[i]++; 1724 } 1725 if (p->pend[i] >= pos) { 1726 p->pend[i]++; 1727 } 1728 } 1729 1730 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1731 p->strip[pos] = s; 1732 } 1733 1734 /* 1735 - dofwd - complete a forward reference 1736 == static void dofwd(struct parse *p, sopno pos, sop value); 1737 */ 1738 static void 1739 dofwd( 1740 struct parse *p, 1741 sopno pos, 1742 sopno value) 1743 { 1744 1745 _DIAGASSERT(p != NULL); 1746 1747 /* avoid making error situations worse */ 1748 if (p->error != 0) 1749 return; 1750 1751 assert(value < 1<<OPSHIFT); 1752 p->strip[pos] = OP(p->strip[pos]) | value; 1753 } 1754 1755 /* 1756 - enlarge - enlarge the strip 1757 == static void enlarge(struct parse *p, sopno size); 1758 */ 1759 static int 1760 enlarge( 1761 struct parse *p, 1762 sopno size) 1763 { 1764 sop *sp; 1765 sopno osize; 1766 1767 _DIAGASSERT(p != NULL); 1768 1769 if (p->ssize >= size) 1770 return 1; 1771 1772 osize = p->ssize; 1773 p->ssize = size; 1774 if (MEMSIZE(p) > MEMLIMIT) 1775 goto oomem; 1776 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1777 if (sp == NULL) { 1778 oomem: 1779 p->ssize = osize; 1780 SETERROR(REG_ESPACE); 1781 return 0; 1782 } 1783 p->strip = sp; 1784 return 1; 1785 } 1786 1787 /* 1788 - stripsnug - compact the strip 1789 == static void stripsnug(struct parse *p, struct re_guts *g); 1790 */ 1791 static void 1792 stripsnug( 1793 struct parse *p, 1794 struct re_guts *g) 1795 { 1796 1797 _DIAGASSERT(p != NULL); 1798 _DIAGASSERT(g != NULL); 1799 1800 g->nstates = p->slen; 1801 g->strip = realloc(p->strip, p->slen * sizeof(sop)); 1802 if (g->strip == NULL) { 1803 SETERROR(REG_ESPACE); 1804 g->strip = p->strip; 1805 } 1806 } 1807 1808 /* 1809 - findmust - fill in must and mlen with longest mandatory literal string 1810 == static void findmust(struct parse *p, struct re_guts *g); 1811 * 1812 * This algorithm could do fancy things like analyzing the operands of | 1813 * for common subsequences. Someday. This code is simple and finds most 1814 * of the interesting cases. 1815 * 1816 * Note that must and mlen got initialized during setup. 1817 */ 1818 static void 1819 findmust( 1820 struct parse *p, 1821 struct re_guts *g) 1822 { 1823 sop *scan; 1824 sop *start = NULL; 1825 sop *newstart = NULL; 1826 sopno newlen; 1827 sop s; 1828 char *cp; 1829 sopno i; 1830 1831 _DIAGASSERT(p != NULL); 1832 _DIAGASSERT(g != NULL); 1833 1834 /* avoid making error situations worse */ 1835 if (p->error != 0) 1836 return; 1837 1838 /* find the longest OCHAR sequence in strip */ 1839 newlen = 0; 1840 scan = g->strip + 1; 1841 do { 1842 s = *scan++; 1843 switch (OP(s)) { 1844 case OCHAR: /* sequence member */ 1845 if (newlen == 0) /* new sequence */ 1846 newstart = scan - 1; 1847 newlen++; 1848 break; 1849 case OPLUS_: /* things that don't break one */ 1850 case OLPAREN: 1851 case ORPAREN: 1852 break; 1853 case OQUEST_: /* things that must be skipped */ 1854 case OCH_: 1855 scan--; 1856 do { 1857 scan += OPND(s); 1858 s = *scan; 1859 /* assert() interferes w debug printouts */ 1860 if (OP(s) != O_QUEST && OP(s) != O_CH && 1861 OP(s) != OOR2) { 1862 g->iflags |= BAD; 1863 return; 1864 } 1865 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1866 /* FALLTHROUGH */ 1867 default: /* things that break a sequence */ 1868 if (newlen > g->mlen) { /* ends one */ 1869 start = newstart; 1870 g->mlen = newlen; 1871 } 1872 newlen = 0; 1873 break; 1874 } 1875 } while (OP(s) != OEND); 1876 1877 if (start == NULL) 1878 g->mlen = 0; 1879 1880 if (g->mlen == 0) /* there isn't one */ 1881 return; 1882 1883 /* turn it into a character string */ 1884 g->must = malloc((size_t)g->mlen + 1); 1885 if (g->must == NULL) { /* argh; just forget it */ 1886 g->mlen = 0; 1887 return; 1888 } 1889 cp = g->must; 1890 scan = start; 1891 for (i = g->mlen; i > 0; i--) { 1892 while (OP(s = *scan++) != OCHAR) 1893 continue; 1894 assert(cp < g->must + g->mlen); 1895 *cp++ = (char)OPND(s); 1896 } 1897 assert(cp == g->must + g->mlen); 1898 *cp++ = '\0'; /* just on general principles */ 1899 } 1900 1901 /* 1902 - pluscount - count + nesting 1903 == static sopno pluscount(struct parse *p, struct re_guts *g); 1904 */ 1905 static sopno /* nesting depth */ 1906 pluscount( 1907 struct parse *p, 1908 struct re_guts *g) 1909 { 1910 sop *scan; 1911 sop s; 1912 sopno plusnest = 0; 1913 sopno maxnest = 0; 1914 1915 _DIAGASSERT(p != NULL); 1916 _DIAGASSERT(g != NULL); 1917 1918 if (p->error != 0) 1919 return(0); /* there may not be an OEND */ 1920 1921 scan = g->strip + 1; 1922 do { 1923 s = *scan++; 1924 switch (OP(s)) { 1925 case OPLUS_: 1926 plusnest++; 1927 break; 1928 case O_PLUS: 1929 if (plusnest > maxnest) 1930 maxnest = plusnest; 1931 plusnest--; 1932 break; 1933 } 1934 } while (OP(s) != OEND); 1935 if (plusnest != 0) 1936 g->iflags |= BAD; 1937 return(maxnest); 1938 } 1939