1 /* $NetBSD: regcomp.c,v 1.38 2019/02/07 22:22:31 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 35 */ 36 37 /*- 38 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Henry Spencer. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 72 */ 73 74 #include <sys/cdefs.h> 75 #if defined(LIBC_SCCS) && !defined(lint) 76 #if 0 77 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 78 #else 79 __RCSID("$NetBSD: regcomp.c,v 1.38 2019/02/07 22:22:31 christos Exp $"); 80 #endif 81 #endif /* LIBC_SCCS and not lint */ 82 83 #include "namespace.h" 84 #include <sys/types.h> 85 86 #include <assert.h> 87 #include <ctype.h> 88 #include <limits.h> 89 #include <stdio.h> 90 #include <stdlib.h> 91 #include <string.h> 92 #include <regex.h> 93 94 #ifdef __weak_alias 95 __weak_alias(regcomp,_regcomp) 96 #endif 97 98 #include "utils.h" 99 #include "regex2.h" 100 101 #include "cclass.h" 102 #include "cname.h" 103 104 /* 105 * parse structure, passed up and down to avoid global variables and 106 * other clumsinesses 107 */ 108 struct parse { 109 const char *next; /* next character in RE */ 110 const char *end; /* end of string (-> NUL normally) */ 111 int error; /* has an error been seen? */ 112 sop *strip; /* malloced strip */ 113 sopno ssize; /* malloced strip size (allocated) */ 114 sopno slen; /* malloced strip length (used) */ 115 size_t ncsalloc; /* number of csets allocated */ 116 struct re_guts *g; 117 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 118 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 119 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 120 }; 121 122 /* ========= begin header generated by ./mkh ========= */ 123 #ifdef __cplusplus 124 extern "C" { 125 #endif 126 127 /* === regcomp.c === */ 128 static void p_ere(struct parse *p, int stop, size_t reclimit); 129 static void p_ere_exp(struct parse *p, size_t reclimit); 130 static void p_str(struct parse *p); 131 static void p_bre(struct parse *p, int end1, int end2, size_t reclimit); 132 static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 133 static int p_count(struct parse *p); 134 static void p_bracket(struct parse *p); 135 static void p_b_term(struct parse *p, cset *cs); 136 static void p_b_cclass(struct parse *p, cset *cs); 137 static void p_b_eclass(struct parse *p, cset *cs); 138 static char p_b_symbol(struct parse *p); 139 static char p_b_coll_elem(struct parse *p, int endc); 140 static int othercase(int ch); 141 static void bothcases(struct parse *p, int ch); 142 static void ordinary(struct parse *p, int ch); 143 static void nonnewline(struct parse *p); 144 static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 145 static int seterr(struct parse *p, int e); 146 static cset *allocset(struct parse *p); 147 static void freeset(struct parse *p, cset *cs); 148 static sopno freezeset(struct parse *p, cset *cs); 149 static int firstch(struct parse *p, cset *cs); 150 static int nch(struct parse *p, cset *cs); 151 static void mcadd(struct parse *p, cset *cs, const char *cp); 152 #if 0 153 static void mcsub(cset *cs, char *cp); 154 static int mcin(cset *cs, char *cp); 155 static char *mcfind(cset *cs, char *cp); 156 #endif 157 static void mcinvert(struct parse *p, cset *cs); 158 static void mccase(struct parse *p, cset *cs); 159 static int isinsets(struct re_guts *g, int c); 160 static int samesets(struct re_guts *g, int c1, int c2); 161 static void categorize(struct parse *p, struct re_guts *g); 162 static sopno dupl(struct parse *p, sopno start, sopno finish); 163 static void doemit(struct parse *p, sop op, sopno opnd); 164 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos); 165 static void dofwd(struct parse *p, sopno pos, sopno value); 166 static int enlarge(struct parse *p, sopno size); 167 static void stripsnug(struct parse *p, struct re_guts *g); 168 static void findmust(struct parse *p, struct re_guts *g); 169 static sopno pluscount(struct parse *p, struct re_guts *g); 170 171 #ifdef __cplusplus 172 } 173 #endif 174 /* ========= end header generated by ./mkh ========= */ 175 176 static char nuls[10]; /* place to point scanner in event of error */ 177 178 /* 179 * macros for use with parse structure 180 * BEWARE: these know that the parse structure is named `p' !!! 181 */ 182 #define PEEK() (*p->next) 183 #define PEEK2() (*(p->next+1)) 184 #define MORE() (p->next < p->end) 185 #define MORE2() (p->next+1 < p->end) 186 #define SEE(c) (MORE() && PEEK() == (c)) 187 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 188 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 189 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 190 #define NEXT() (p->next++) 191 #define NEXT2() (p->next += 2) 192 #define NEXTn(n) (p->next += (n)) 193 #define GETNEXT() (*p->next++) 194 #define SETERROR(e) seterr(p, (e)) 195 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 196 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 197 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 198 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 199 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 200 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 201 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 202 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 203 #define HERE() (p->slen) 204 #define THERE() (p->slen - 1) 205 #define THERETHERE() (p->slen - 2) 206 #define DROP(n) (p->slen -= (n)) 207 208 #ifndef NDEBUG 209 static int never = 0; /* for use in asserts; shuts lint up */ 210 #else 211 #define never 0 /* some <assert.h>s have bugs too */ 212 #endif 213 214 #define MEMLIMIT 0x8000000 215 #define MEMSIZE(p) \ 216 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 217 (p)->ncsalloc * sizeof(cset) + \ 218 (p)->ssize * sizeof(sop)) 219 #define RECLIMIT 256 220 221 /* 222 - regcomp - interface for parser and compilation 223 = extern int regcomp(regex_t *, const char *, int); 224 = #define REG_BASIC 0000 225 = #define REG_EXTENDED 0001 226 = #define REG_ICASE 0002 227 = #define REG_NOSUB 0004 228 = #define REG_NEWLINE 0010 229 = #define REG_NOSPEC 0020 230 = #define REG_PEND 0040 231 = #define REG_DUMP 0200 232 */ 233 int /* 0 success, otherwise REG_something */ 234 regcomp( 235 regex_t *preg, 236 const char *pattern, 237 int cflags) 238 { 239 struct parse pa; 240 struct re_guts *g; 241 struct parse *p = &pa; 242 int i; 243 size_t len; 244 #ifdef REDEBUG 245 # define GOODFLAGS(f) (f) 246 #else 247 # define GOODFLAGS(f) ((f)&~REG_DUMP) 248 #endif 249 250 _DIAGASSERT(preg != NULL); 251 _DIAGASSERT(pattern != NULL); 252 253 cflags = GOODFLAGS(cflags); 254 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 255 return(REG_INVARG); 256 257 if (cflags®_PEND) { 258 if (preg->re_endp < pattern) 259 return(REG_INVARG); 260 len = preg->re_endp - pattern; 261 } else 262 len = strlen(pattern); 263 264 /* do the mallocs early so failure handling is easy */ 265 g = malloc(sizeof(struct re_guts) + (NC - 1) * sizeof(cat_t)); 266 if (g == NULL) 267 return(REG_ESPACE); 268 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 269 p->strip = calloc(p->ssize, sizeof(sop)); 270 p->slen = 0; 271 if (p->strip == NULL) { 272 free(g); 273 return(REG_ESPACE); 274 } 275 276 /* set things up */ 277 p->g = g; 278 p->next = pattern; 279 p->end = p->next + len; 280 p->error = 0; 281 p->ncsalloc = 0; 282 for (i = 0; i < NPAREN; i++) { 283 p->pbegin[i] = 0; 284 p->pend[i] = 0; 285 } 286 g->csetsize = NC; 287 g->sets = NULL; 288 g->setbits = NULL; 289 g->ncsets = 0; 290 g->cflags = cflags; 291 g->iflags = 0; 292 g->nbol = 0; 293 g->neol = 0; 294 g->must = NULL; 295 g->mlen = 0; 296 g->nsub = 0; 297 g->ncategories = 1; /* category 0 is "everything else" */ 298 g->categories = &g->catspace[-(CHAR_MIN)]; 299 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 300 g->backrefs = 0; 301 302 /* do it */ 303 EMIT(OEND, 0); 304 g->firststate = THERE(); 305 if (cflags®_EXTENDED) 306 p_ere(p, OUT, 0); 307 else if (cflags®_NOSPEC) 308 p_str(p); 309 else 310 p_bre(p, OUT, OUT, 0); 311 EMIT(OEND, 0); 312 g->laststate = THERE(); 313 314 /* tidy up loose ends and fill things in */ 315 categorize(p, g); 316 stripsnug(p, g); 317 findmust(p, g); 318 g->nplus = pluscount(p, g); 319 g->magic = MAGIC2; 320 preg->re_nsub = g->nsub; 321 preg->re_g = g; 322 preg->re_magic = MAGIC1; 323 #ifndef REDEBUG 324 /* not debugging, so can't rely on the assert() in regexec() */ 325 if (g->iflags&BAD) 326 SETERROR(REG_ASSERT); 327 #endif 328 329 /* win or lose, we're done */ 330 if (p->error != 0) /* lose */ 331 regfree(preg); 332 return(p->error); 333 } 334 335 /* 336 - p_ere - ERE parser top level, concatenation and alternation 337 == static void p_ere(struct parse *p, int stop, size_t reclimit); 338 */ 339 static void 340 p_ere( 341 struct parse *p, 342 int stop, /* character this ERE should end at */ 343 size_t reclimit) 344 { 345 char c; 346 sopno prevback = 0; /* pacify gcc */ 347 sopno prevfwd = 0; /* pacify gcc */ 348 sopno conc; 349 int first = 1; /* is this the first alternative? */ 350 351 _DIAGASSERT(p != NULL); 352 353 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 354 p->error = REG_ESPACE; 355 return; 356 } 357 358 for (;;) { 359 /* do a bunch of concatenated expressions */ 360 conc = HERE(); 361 while (MORE() && (c = PEEK()) != '|' && c != stop) 362 p_ere_exp(p, reclimit); 363 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 364 365 if (!EAT('|')) 366 break; /* NOTE BREAK OUT */ 367 368 if (first) { 369 INSERT(OCH_, conc); /* offset is wrong */ 370 prevfwd = conc; 371 prevback = conc; 372 first = 0; 373 } 374 ASTERN(OOR1, prevback); 375 prevback = THERE(); 376 AHEAD(prevfwd); /* fix previous offset */ 377 prevfwd = HERE(); 378 EMIT(OOR2, 0); /* offset is very wrong */ 379 } 380 381 if (!first) { /* tail-end fixups */ 382 AHEAD(prevfwd); 383 ASTERN(O_CH, prevback); 384 } 385 386 assert(!MORE() || SEE(stop)); 387 } 388 389 /* 390 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 391 == static void p_ere_exp(struct parse *p, size_t reclimit); 392 */ 393 static void 394 p_ere_exp( 395 struct parse *p, 396 size_t reclimit) 397 { 398 char c; 399 sopno pos; 400 int count; 401 int count2; 402 sopno subno; 403 int wascaret = 0; 404 405 _DIAGASSERT(p != NULL); 406 407 assert(MORE()); /* caller should have ensured this */ 408 c = GETNEXT(); 409 410 pos = HERE(); 411 switch (c) { 412 case '(': 413 REQUIRE(MORE(), REG_EPAREN); 414 p->g->nsub++; 415 subno = p->g->nsub; 416 if (subno < NPAREN) 417 p->pbegin[subno] = HERE(); 418 EMIT(OLPAREN, subno); 419 if (!SEE(')')) 420 p_ere(p, ')', reclimit); 421 if (subno < NPAREN) { 422 p->pend[subno] = HERE(); 423 assert(p->pend[subno] != 0); 424 } 425 EMIT(ORPAREN, subno); 426 MUSTEAT(')', REG_EPAREN); 427 break; 428 #ifndef POSIX_MISTAKE 429 case ')': /* happens only if no current unmatched ( */ 430 /* 431 * You may ask, why the ifndef? Because I didn't notice 432 * this until slightly too late for 1003.2, and none of the 433 * other 1003.2 regular-expression reviewers noticed it at 434 * all. So an unmatched ) is legal POSIX, at least until 435 * we can get it fixed. 436 */ 437 SETERROR(REG_EPAREN); 438 break; 439 #endif 440 case '^': 441 EMIT(OBOL, 0); 442 p->g->iflags |= USEBOL; 443 p->g->nbol++; 444 wascaret = 1; 445 break; 446 case '$': 447 EMIT(OEOL, 0); 448 p->g->iflags |= USEEOL; 449 p->g->neol++; 450 break; 451 case '|': 452 SETERROR(REG_EMPTY); 453 break; 454 case '*': 455 case '+': 456 case '?': 457 SETERROR(REG_BADRPT); 458 break; 459 case '.': 460 if (p->g->cflags®_NEWLINE) 461 nonnewline(p); 462 else 463 EMIT(OANY, 0); 464 break; 465 case '[': 466 p_bracket(p); 467 break; 468 case '\\': 469 REQUIRE(MORE(), REG_EESCAPE); 470 c = GETNEXT(); 471 ordinary(p, c); 472 break; 473 case '{': /* okay as ordinary except if digit follows */ 474 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT); 475 /* FALLTHROUGH */ 476 default: 477 if (p->error != 0) 478 return; 479 ordinary(p, c); 480 break; 481 } 482 483 if (!MORE()) 484 return; 485 c = PEEK(); 486 /* we call { a repetition if followed by a digit */ 487 if (!( c == '*' || c == '+' || c == '?' || 488 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) )) 489 return; /* no repetition, we're done */ 490 NEXT(); 491 492 REQUIRE(!wascaret, REG_BADRPT); 493 switch (c) { 494 case '*': /* implemented as +? */ 495 /* this case does not require the (y|) trick, noKLUDGE */ 496 INSERT(OPLUS_, pos); 497 ASTERN(O_PLUS, pos); 498 INSERT(OQUEST_, pos); 499 ASTERN(O_QUEST, pos); 500 break; 501 case '+': 502 INSERT(OPLUS_, pos); 503 ASTERN(O_PLUS, pos); 504 break; 505 case '?': 506 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 507 INSERT(OCH_, pos); /* offset slightly wrong */ 508 ASTERN(OOR1, pos); /* this one's right */ 509 AHEAD(pos); /* fix the OCH_ */ 510 EMIT(OOR2, 0); /* offset very wrong... */ 511 AHEAD(THERE()); /* ...so fix it */ 512 ASTERN(O_CH, THERETHERE()); 513 break; 514 case '{': 515 count = p_count(p); 516 if (EAT(',')) { 517 if (isdigit((unsigned char)PEEK())) { 518 count2 = p_count(p); 519 REQUIRE(count <= count2, REG_BADBR); 520 } else /* single number with comma */ 521 count2 = INFINITY; 522 } else /* just a single number */ 523 count2 = count; 524 repeat(p, pos, count, count2, 0); 525 if (!EAT('}')) { /* error heuristics */ 526 while (MORE() && PEEK() != '}') 527 NEXT(); 528 REQUIRE(MORE(), REG_EBRACE); 529 SETERROR(REG_BADBR); 530 } 531 break; 532 } 533 534 if (!MORE()) 535 return; 536 c = PEEK(); 537 if (!( c == '*' || c == '+' || c == '?' || 538 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) ) 539 return; 540 SETERROR(REG_BADRPT); 541 } 542 543 /* 544 - p_str - string (no metacharacters) "parser" 545 == static void p_str(struct parse *p); 546 */ 547 static void 548 p_str( 549 struct parse *p) 550 { 551 552 _DIAGASSERT(p != NULL); 553 554 REQUIRE(MORE(), REG_EMPTY); 555 while (MORE()) 556 ordinary(p, GETNEXT()); 557 } 558 559 /* 560 - p_bre - BRE parser top level, anchoring and concatenation 561 == static void p_bre(struct parse *p, int end1, \ 562 == int end2, size_t reclimit); 563 * Giving end1 as OUT essentially eliminates the end1/end2 check. 564 * 565 * This implementation is a bit of a kludge, in that a trailing $ is first 566 * taken as an ordinary character and then revised to be an anchor. The 567 * only undesirable side effect is that '$' gets included as a character 568 * category in such cases. This is fairly harmless; not worth fixing. 569 * The amount of lookahead needed to avoid this kludge is excessive. 570 */ 571 static void 572 p_bre( 573 struct parse *p, 574 int end1, /* first terminating character */ 575 int end2, /* second terminating character */ 576 size_t reclimit) 577 { 578 sopno start; 579 int first = 1; /* first subexpression? */ 580 int wasdollar = 0; 581 582 _DIAGASSERT(p != NULL); 583 584 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 585 p->error = REG_ESPACE; 586 return; 587 } 588 589 start = HERE(); 590 591 if (EAT('^')) { 592 EMIT(OBOL, 0); 593 p->g->iflags |= USEBOL; 594 p->g->nbol++; 595 } 596 while (MORE() && !SEETWO(end1, end2)) { 597 wasdollar = p_simp_re(p, first, reclimit); 598 first = 0; 599 } 600 if (wasdollar) { /* oops, that was a trailing anchor */ 601 DROP(1); 602 EMIT(OEOL, 0); 603 p->g->iflags |= USEEOL; 604 p->g->neol++; 605 } 606 607 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 608 } 609 610 /* 611 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 612 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 613 */ 614 static int /* was the simple RE an unbackslashed $? */ 615 p_simp_re( 616 struct parse *p, 617 int starordinary, /* is a leading * an ordinary character? */ 618 size_t reclimit) 619 { 620 int c; 621 int count; 622 int count2; 623 sopno pos, i; 624 sopno subno; 625 # define BACKSL (1<<CHAR_BIT) 626 627 _DIAGASSERT(p != NULL); 628 629 pos = HERE(); /* repetion op, if any, covers from here */ 630 631 assert(MORE()); /* caller should have ensured this */ 632 c = GETNEXT(); 633 if (c == '\\') { 634 REQUIRE(MORE(), REG_EESCAPE); 635 c = BACKSL | (unsigned char)GETNEXT(); 636 } 637 switch (c) { 638 case '.': 639 if (p->g->cflags®_NEWLINE) 640 nonnewline(p); 641 else 642 EMIT(OANY, 0); 643 break; 644 case '[': 645 p_bracket(p); 646 break; 647 case BACKSL|'{': 648 SETERROR(REG_BADRPT); 649 break; 650 case BACKSL|'(': 651 p->g->nsub++; 652 subno = p->g->nsub; 653 if (subno < NPAREN) 654 p->pbegin[subno] = HERE(); 655 EMIT(OLPAREN, subno); 656 /* the MORE here is an error heuristic */ 657 if (MORE() && !SEETWO('\\', ')')) 658 p_bre(p, '\\', ')', reclimit); 659 if (subno < NPAREN) { 660 p->pend[subno] = HERE(); 661 assert(p->pend[subno] != 0); 662 } 663 EMIT(ORPAREN, subno); 664 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 665 break; 666 case BACKSL|')': /* should not get here -- must be user */ 667 case BACKSL|'}': 668 SETERROR(REG_EPAREN); 669 break; 670 case BACKSL|'1': 671 case BACKSL|'2': 672 case BACKSL|'3': 673 case BACKSL|'4': 674 case BACKSL|'5': 675 case BACKSL|'6': 676 case BACKSL|'7': 677 case BACKSL|'8': 678 case BACKSL|'9': 679 i = (c&~BACKSL) - '0'; 680 assert(i < NPAREN); 681 if (p->pend[i] != 0) { 682 assert(i <= p->g->nsub); 683 EMIT(OBACK_, i); 684 assert(p->pbegin[i] != 0); 685 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 686 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 687 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 688 EMIT(O_BACK, i); 689 } else 690 SETERROR(REG_ESUBREG); 691 p->g->backrefs = 1; 692 break; 693 case '*': 694 REQUIRE(starordinary, REG_BADRPT); 695 /* FALLTHROUGH */ 696 default: 697 if (p->error != 0) 698 return(0); 699 ordinary(p, c &~ BACKSL); 700 break; 701 } 702 703 if (EAT('*')) { /* implemented as +? */ 704 /* this case does not require the (y|) trick, noKLUDGE */ 705 INSERT(OPLUS_, pos); 706 ASTERN(O_PLUS, pos); 707 INSERT(OQUEST_, pos); 708 ASTERN(O_QUEST, pos); 709 } else if (EATTWO('\\', '{')) { 710 count = p_count(p); 711 if (EAT(',')) { 712 if (MORE() && isdigit((unsigned char)PEEK())) { 713 count2 = p_count(p); 714 REQUIRE(count <= count2, REG_BADBR); 715 } else /* single number with comma */ 716 count2 = INFINITY; 717 } else /* just a single number */ 718 count2 = count; 719 repeat(p, pos, count, count2, 0); 720 if (!EATTWO('\\', '}')) { /* error heuristics */ 721 while (MORE() && !SEETWO('\\', '}')) 722 NEXT(); 723 REQUIRE(MORE(), REG_EBRACE); 724 SETERROR(REG_BADBR); 725 } 726 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 727 return(1); 728 729 return(0); 730 } 731 732 /* 733 - p_count - parse a repetition count 734 == static int p_count(struct parse *p); 735 */ 736 static int /* the value */ 737 p_count( 738 struct parse *p) 739 { 740 int count = 0; 741 int ndigits = 0; 742 743 _DIAGASSERT(p != NULL); 744 745 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) { 746 count = count*10 + (GETNEXT() - '0'); 747 ndigits++; 748 } 749 750 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 751 return(count); 752 } 753 754 /* 755 - p_bracket - parse a bracketed character list 756 == static void p_bracket(struct parse *p); 757 * 758 * Note a significant property of this code: if the allocset() did SETERROR, 759 * no set operations are done. 760 */ 761 static void 762 p_bracket( 763 struct parse *p) 764 { 765 cset *cs; 766 int invert = 0; 767 _DIAGASSERT(p != NULL); 768 769 cs = allocset(p); 770 if (cs == NULL) 771 return; 772 773 /* Dept of Truly Sickening Special-Case Kludges */ 774 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 775 (size_t)6) == 0) { 776 EMIT(OBOW, 0); 777 NEXTn(6); 778 return; 779 } 780 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 781 (size_t)6) == 0) { 782 EMIT(OEOW, 0); 783 NEXTn(6); 784 return; 785 } 786 787 if (EAT('^')) 788 invert++; /* make note to invert set at end */ 789 if (EAT(']')) 790 CHadd(cs, ']'); 791 else if (EAT('-')) 792 CHadd(cs, '-'); 793 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 794 p_b_term(p, cs); 795 if (EAT('-')) 796 CHadd(cs, '-'); 797 MUSTEAT(']', REG_EBRACK); 798 799 if (p->error != 0) /* don't mess things up further */ 800 return; 801 802 if (p->g->cflags®_ICASE) { 803 ssize_t i; 804 int ci; 805 806 for (i = p->g->csetsize - 1; i >= 0; i--) 807 if (CHIN(cs, i) && isalpha(i)) { 808 ci = othercase((int)i); 809 if (ci != i) 810 CHadd(cs, ci); 811 } 812 if (cs->multis != NULL) 813 mccase(p, cs); 814 } 815 if (invert) { 816 ssize_t i; 817 818 for (i = p->g->csetsize - 1; i >= 0; i--) 819 if (CHIN(cs, i)) 820 CHsub(cs, (int)i); 821 else 822 CHadd(cs, (int)i); 823 if (p->g->cflags®_NEWLINE) 824 CHsub(cs, '\n'); 825 if (cs->multis != NULL) 826 mcinvert(p, cs); 827 } 828 829 assert(cs->multis == NULL); /* xxx */ 830 831 if (nch(p, cs) == 1) { /* optimize singleton sets */ 832 ordinary(p, firstch(p, cs)); 833 freeset(p, cs); 834 } else 835 EMIT(OANYOF, freezeset(p, cs)); 836 } 837 838 /* 839 - p_b_term - parse one term of a bracketed character list 840 == static void p_b_term(struct parse *p, cset *cs); 841 */ 842 static void 843 p_b_term( 844 struct parse *p, 845 cset *cs) 846 { 847 char c; 848 char start, finish; 849 int i; 850 851 _DIAGASSERT(p != NULL); 852 _DIAGASSERT(cs != NULL); 853 854 /* classify what we've got */ 855 switch ((MORE()) ? PEEK() : '\0') { 856 case '[': 857 c = (MORE2()) ? PEEK2() : '\0'; 858 break; 859 860 case '-': 861 SETERROR(REG_ERANGE); 862 return; /* NOTE RETURN */ 863 864 default: 865 c = '\0'; 866 break; 867 } 868 869 switch (c) { 870 case ':': /* character class */ 871 NEXT2(); 872 REQUIRE(MORE(), REG_EBRACK); 873 c = PEEK(); 874 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 875 p_b_cclass(p, cs); 876 REQUIRE(MORE(), REG_EBRACK); 877 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 878 break; 879 case '=': /* equivalence class */ 880 NEXT2(); 881 REQUIRE(MORE(), REG_EBRACK); 882 c = PEEK(); 883 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 884 p_b_eclass(p, cs); 885 REQUIRE(MORE(), REG_EBRACK); 886 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 887 break; 888 default: /* symbol, ordinary character, or range */ 889 /* xxx revision needed for multichar stuff */ 890 start = p_b_symbol(p); 891 if (SEE('-') && MORE2() && PEEK2() != ']') { 892 /* range */ 893 NEXT(); 894 if (EAT('-')) 895 finish = '-'; 896 else 897 finish = p_b_symbol(p); 898 } else 899 finish = start; 900 /* xxx what about signed chars here... */ 901 REQUIRE(start <= finish, REG_ERANGE); 902 for (i = start; i <= finish; i++) 903 CHadd(cs, i); 904 break; 905 } 906 } 907 908 /* 909 - p_b_cclass - parse a character-class name and deal with it 910 == static void p_b_cclass(struct parse *p, cset *cs); 911 */ 912 static void 913 p_b_cclass( 914 struct parse *p, 915 cset *cs) 916 { 917 const char *sp; 918 const struct cclass *cp; 919 size_t len; 920 const char *u; 921 char c; 922 923 _DIAGASSERT(p != NULL); 924 _DIAGASSERT(cs != NULL); 925 926 sp = p->next; 927 928 while (MORE() && isalpha((unsigned char)PEEK())) 929 NEXT(); 930 len = p->next - sp; 931 for (cp = cclasses; cp->name != NULL; cp++) 932 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 933 break; 934 if (cp->name == NULL) { 935 /* oops, didn't find it */ 936 SETERROR(REG_ECTYPE); 937 return; 938 } 939 940 u = cp->chars; 941 while ((c = *u++) != '\0') 942 CHadd(cs, c); 943 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 944 MCadd(p, cs, u); 945 } 946 947 /* 948 - p_b_eclass - parse an equivalence-class name and deal with it 949 == static void p_b_eclass(struct parse *p, cset *cs); 950 * 951 * This implementation is incomplete. xxx 952 */ 953 static void 954 p_b_eclass( 955 struct parse *p, 956 cset *cs) 957 { 958 char c; 959 960 _DIAGASSERT(p != NULL); 961 _DIAGASSERT(cs != NULL); 962 963 c = p_b_coll_elem(p, '='); 964 CHadd(cs, c); 965 } 966 967 /* 968 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 969 == static char p_b_symbol(struct parse *p); 970 */ 971 static char /* value of symbol */ 972 p_b_symbol( 973 struct parse *p) 974 { 975 char value; 976 977 _DIAGASSERT(p != NULL); 978 979 REQUIRE(MORE(), REG_EBRACK); 980 if (!EATTWO('[', '.')) 981 return(GETNEXT()); 982 983 /* collating symbol */ 984 value = p_b_coll_elem(p, '.'); 985 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 986 return(value); 987 } 988 989 /* 990 - p_b_coll_elem - parse a collating-element name and look it up 991 == static char p_b_coll_elem(struct parse *p, int endc); 992 */ 993 static char /* value of collating element */ 994 p_b_coll_elem( 995 struct parse *p, 996 int endc) /* name ended by endc,']' */ 997 { 998 const char *sp; 999 const struct cname *cp; 1000 size_t len; 1001 1002 _DIAGASSERT(p != NULL); 1003 1004 sp = p->next; 1005 1006 while (MORE() && !SEETWO(endc, ']')) 1007 NEXT(); 1008 if (!MORE()) { 1009 SETERROR(REG_EBRACK); 1010 return(0); 1011 } 1012 len = p->next - sp; 1013 for (cp = cnames; cp->name != NULL; cp++) 1014 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1015 return(cp->code); /* known name */ 1016 if (len == 1) 1017 return(*sp); /* single character */ 1018 SETERROR(REG_ECOLLATE); /* neither */ 1019 return(0); 1020 } 1021 1022 /* 1023 - othercase - return the case counterpart of an alphabetic 1024 == static int othercase(int ch); 1025 */ 1026 static int /* if no counterpart, return ch */ 1027 othercase( 1028 int ch) 1029 { 1030 assert(isalpha(ch)); 1031 if (isupper(ch)) 1032 return(tolower(ch)); 1033 else if (islower(ch)) 1034 return(toupper(ch)); 1035 else /* peculiar, but could happen */ 1036 return(ch); 1037 } 1038 1039 /* 1040 - bothcases - emit a dualcase version of a two-case character 1041 == static void bothcases(struct parse *p, int ch); 1042 * 1043 * Boy, is this implementation ever a kludge... 1044 */ 1045 static void 1046 bothcases( 1047 struct parse *p, 1048 int ch) 1049 { 1050 const char *oldnext; 1051 const char *oldend; 1052 char bracket[3]; 1053 1054 _DIAGASSERT(p != NULL); 1055 1056 oldnext = p->next; 1057 oldend = p->end; 1058 1059 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1060 p->next = bracket; 1061 p->end = bracket+2; 1062 bracket[0] = ch; 1063 bracket[1] = ']'; 1064 bracket[2] = '\0'; 1065 p_bracket(p); 1066 assert(p->next == bracket+2); 1067 p->next = oldnext; 1068 p->end = oldend; 1069 } 1070 1071 /* 1072 - ordinary - emit an ordinary character 1073 == static void ordinary(struct parse *p, int ch); 1074 */ 1075 static void 1076 ordinary( 1077 struct parse *p, 1078 int ch) 1079 { 1080 cat_t *cap; 1081 unsigned char uc = (unsigned char)ch; 1082 1083 _DIAGASSERT(p != NULL); 1084 1085 cap = p->g->categories; 1086 if ((p->g->cflags & REG_ICASE) && isalpha(uc) && othercase(uc) != uc) 1087 bothcases(p, uc); 1088 else { 1089 EMIT(OCHAR, (sopno)uc); 1090 if (cap[uc] == 0) { 1091 _DIAGASSERT(__type_fit(unsigned char, 1092 p->g->ncategories + 1)); 1093 cap[uc] = (unsigned char)p->g->ncategories++; 1094 } 1095 } 1096 } 1097 1098 /* 1099 - nonnewline - emit REG_NEWLINE version of OANY 1100 == static void nonnewline(struct parse *p); 1101 * 1102 * Boy, is this implementation ever a kludge... 1103 */ 1104 static void 1105 nonnewline( 1106 struct parse *p) 1107 { 1108 const char *oldnext; 1109 const char *oldend; 1110 char bracket[4]; 1111 1112 _DIAGASSERT(p != NULL); 1113 1114 oldnext = p->next; 1115 oldend = p->end; 1116 1117 p->next = bracket; 1118 p->end = bracket+3; 1119 bracket[0] = '^'; 1120 bracket[1] = '\n'; 1121 bracket[2] = ']'; 1122 bracket[3] = '\0'; 1123 p_bracket(p); 1124 assert(p->next == bracket+3); 1125 p->next = oldnext; 1126 p->end = oldend; 1127 } 1128 1129 /* 1130 - repeat - generate code for a bounded repetition, recursively if needed 1131 == static void repeat(struct parse *p, sopno start, int from, int to, 1132 == size_t reclimit); 1133 */ 1134 static void 1135 repeat( 1136 struct parse *p, 1137 sopno start, /* operand from here to end of strip */ 1138 int from, /* repeated from this number */ 1139 int to, /* to this number of times (maybe INFINITY) */ 1140 size_t reclimit) 1141 { 1142 sopno finish; 1143 # define N 2 1144 # define INF 3 1145 # define REP(f, t) ((f)*8 + (t)) 1146 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1147 sopno copy; 1148 1149 _DIAGASSERT(p != NULL); 1150 1151 if (reclimit++ > RECLIMIT) 1152 p->error = REG_ESPACE; 1153 if (p->error) 1154 return; 1155 1156 finish = HERE(); 1157 1158 assert(from <= to); 1159 1160 switch (REP(MAP(from), MAP(to))) { 1161 case REP(0, 0): /* must be user doing this */ 1162 DROP(finish-start); /* drop the operand */ 1163 break; 1164 case REP(0, 1): /* as x{1,1}? */ 1165 case REP(0, N): /* as x{1,n}? */ 1166 case REP(0, INF): /* as x{1,}? */ 1167 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1168 INSERT(OCH_, start); /* offset is wrong... */ 1169 repeat(p, start+1, 1, to, reclimit); 1170 ASTERN(OOR1, start); 1171 AHEAD(start); /* ... fix it */ 1172 EMIT(OOR2, 0); 1173 AHEAD(THERE()); 1174 ASTERN(O_CH, THERETHERE()); 1175 break; 1176 case REP(1, 1): /* trivial case */ 1177 /* done */ 1178 break; 1179 case REP(1, N): /* as x?x{1,n-1} */ 1180 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1181 INSERT(OCH_, start); 1182 ASTERN(OOR1, start); 1183 AHEAD(start); 1184 EMIT(OOR2, 0); /* offset very wrong... */ 1185 AHEAD(THERE()); /* ...so fix it */ 1186 ASTERN(O_CH, THERETHERE()); 1187 copy = dupl(p, start+1, finish+1); 1188 assert(copy == finish+4); 1189 repeat(p, copy, 1, to-1, reclimit); 1190 break; 1191 case REP(1, INF): /* as x+ */ 1192 INSERT(OPLUS_, start); 1193 ASTERN(O_PLUS, start); 1194 break; 1195 case REP(N, N): /* as xx{m-1,n-1} */ 1196 copy = dupl(p, start, finish); 1197 repeat(p, copy, from-1, to-1, reclimit); 1198 break; 1199 case REP(N, INF): /* as xx{n-1,INF} */ 1200 copy = dupl(p, start, finish); 1201 repeat(p, copy, from-1, to, reclimit); 1202 break; 1203 default: /* "can't happen" */ 1204 SETERROR(REG_ASSERT); /* just in case */ 1205 break; 1206 } 1207 } 1208 1209 /* 1210 - seterr - set an error condition 1211 == static int seterr(struct parse *p, int e); 1212 */ 1213 static int /* useless but makes type checking happy */ 1214 seterr( 1215 struct parse *p, 1216 int e) 1217 { 1218 1219 _DIAGASSERT(p != NULL); 1220 1221 if (p->error == 0) /* keep earliest error condition */ 1222 p->error = e; 1223 p->next = nuls; /* try to bring things to a halt */ 1224 p->end = nuls; 1225 return(0); /* make the return value well-defined */ 1226 } 1227 1228 /* 1229 - allocset - allocate a set of characters for [] 1230 == static cset *allocset(struct parse *p); 1231 */ 1232 static cset * 1233 allocset( 1234 struct parse *p) 1235 { 1236 size_t no; 1237 size_t nc; 1238 size_t nbytes; 1239 cset *cs; 1240 size_t css; 1241 size_t i; 1242 void *old_ptr; 1243 1244 _DIAGASSERT(p != NULL); 1245 1246 no = p->g->ncsets++; 1247 css = (size_t)p->g->csetsize; 1248 if (no >= p->ncsalloc) { /* need another column of space */ 1249 p->ncsalloc += CHAR_BIT; 1250 nc = p->ncsalloc; 1251 assert(nc % CHAR_BIT == 0); 1252 nbytes = nc / CHAR_BIT * css; 1253 if (MEMSIZE(p) > MEMLIMIT) 1254 goto oomem; 1255 if (reallocarr(&p->g->sets, nc, sizeof(cset))) 1256 goto oomem; 1257 old_ptr = p->g->setbits; 1258 if (reallocarr(&p->g->setbits, nc / CHAR_BIT, css)) { 1259 free(old_ptr); 1260 goto oomem; 1261 } 1262 if (old_ptr != p->g->setbits) { 1263 for (i = 0; i < no; i++) 1264 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1265 } 1266 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); 1267 } 1268 1269 cs = &p->g->sets[no]; 1270 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1271 cs->mask = 1 << (unsigned int)((no) % CHAR_BIT); 1272 cs->hash = 0; 1273 cs->smultis = 0; 1274 cs->multis = NULL; 1275 1276 return(cs); 1277 1278 oomem: 1279 SETERROR(REG_ESPACE); 1280 /* caller's responsibility not to do set ops */ 1281 return NULL; 1282 } 1283 1284 /* 1285 - freeset - free a now-unused set 1286 == static void freeset(struct parse *p, cset *cs); 1287 */ 1288 static void 1289 freeset( 1290 struct parse *p, 1291 cset *cs) 1292 { 1293 size_t i; 1294 cset *top; 1295 size_t css; 1296 1297 _DIAGASSERT(p != NULL); 1298 _DIAGASSERT(cs != NULL); 1299 1300 top = &p->g->sets[p->g->ncsets]; 1301 css = (size_t)p->g->csetsize; 1302 1303 for (i = 0; i < css; i++) 1304 CHsub(cs, (int)i); 1305 if (cs == top-1) /* recover only the easy case */ 1306 p->g->ncsets--; 1307 } 1308 1309 /* 1310 - freezeset - final processing on a set of characters 1311 == static int freezeset(struct parse *p, cset *cs); 1312 * 1313 * The main task here is merging identical sets. This is usually a waste 1314 * of time (although the hash code minimizes the overhead), but can win 1315 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1316 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1317 * the same value! 1318 */ 1319 static sopno /* set number */ 1320 freezeset( 1321 struct parse *p, 1322 cset *cs) 1323 { 1324 uch h; 1325 size_t i; 1326 cset *top; 1327 cset *cs2; 1328 size_t css; 1329 1330 _DIAGASSERT(p != NULL); 1331 _DIAGASSERT(cs != NULL); 1332 1333 h = cs->hash; 1334 top = &p->g->sets[p->g->ncsets]; 1335 css = (size_t)p->g->csetsize; 1336 1337 /* look for an earlier one which is the same */ 1338 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1339 if (cs2->hash == h && cs2 != cs) { 1340 /* maybe */ 1341 for (i = 0; i < css; i++) 1342 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1343 break; /* no */ 1344 if (i == css) 1345 break; /* yes */ 1346 } 1347 1348 if (cs2 < top) { /* found one */ 1349 freeset(p, cs); 1350 cs = cs2; 1351 } 1352 1353 return (sopno)(cs - p->g->sets); 1354 } 1355 1356 /* 1357 - firstch - return first character in a set (which must have at least one) 1358 == static int firstch(struct parse *p, cset *cs); 1359 */ 1360 static int /* character; there is no "none" value */ 1361 firstch( 1362 struct parse *p, 1363 cset *cs) 1364 { 1365 size_t i; 1366 size_t css; 1367 1368 _DIAGASSERT(p != NULL); 1369 _DIAGASSERT(cs != NULL); 1370 1371 css = (size_t)p->g->csetsize; 1372 1373 for (i = 0; i < css; i++) 1374 if (CHIN(cs, i)) 1375 return((char)i); 1376 assert(never); 1377 return(0); /* arbitrary */ 1378 } 1379 1380 /* 1381 - nch - number of characters in a set 1382 == static int nch(struct parse *p, cset *cs); 1383 */ 1384 static int 1385 nch( 1386 struct parse *p, 1387 cset *cs) 1388 { 1389 size_t i; 1390 size_t css; 1391 int n = 0; 1392 1393 _DIAGASSERT(p != NULL); 1394 _DIAGASSERT(cs != NULL); 1395 1396 css = (size_t)p->g->csetsize; 1397 1398 for (i = 0; i < css; i++) 1399 if (CHIN(cs, i)) 1400 n++; 1401 return(n); 1402 } 1403 1404 /* 1405 - mcadd - add a collating element to a cset 1406 == static void mcadd(struct parse *p, cset *cs, \ 1407 == char *cp); 1408 */ 1409 static void 1410 mcadd( 1411 struct parse *p, 1412 cset *cs, 1413 const char *cp) 1414 { 1415 size_t oldend; 1416 1417 _DIAGASSERT(p != NULL); 1418 _DIAGASSERT(cs != NULL); 1419 _DIAGASSERT(cp != NULL); 1420 1421 oldend = cs->smultis; 1422 1423 cs->smultis += strlen(cp) + 1; 1424 if (cs->multis == NULL) 1425 cs->multis = malloc(cs->smultis); 1426 else 1427 cs->multis = realloc(cs->multis, cs->smultis); 1428 if (cs->multis == NULL) { 1429 SETERROR(REG_ESPACE); 1430 return; 1431 } 1432 1433 (void) strcpy(cs->multis + oldend - 1, cp); 1434 cs->multis[cs->smultis - 1] = '\0'; 1435 } 1436 1437 #if 0 1438 /* 1439 - mcsub - subtract a collating element from a cset 1440 == static void mcsub(cset *cs, char *cp); 1441 */ 1442 static void 1443 mcsub( 1444 cset *cs, 1445 char *cp) 1446 { 1447 char *fp; 1448 size_t len; 1449 1450 _DIAGASSERT(cs != NULL); 1451 _DIAGASSERT(cp != NULL); 1452 1453 fp = mcfind(cs, cp); 1454 len = strlen(fp); 1455 1456 assert(fp != NULL); 1457 (void) memmove(fp, fp + len + 1, 1458 cs->smultis - (fp + len + 1 - cs->multis)); 1459 cs->smultis -= len; 1460 1461 if (cs->smultis == 0) { 1462 free(cs->multis); 1463 cs->multis = NULL; 1464 return; 1465 } 1466 1467 cs->multis = realloc(cs->multis, cs->smultis); 1468 assert(cs->multis != NULL); 1469 } 1470 1471 /* 1472 - mcin - is a collating element in a cset? 1473 == static int mcin(cset *cs, char *cp); 1474 */ 1475 static int 1476 mcin( 1477 cset *cs, 1478 char *cp) 1479 { 1480 1481 _DIAGASSERT(cs != NULL); 1482 _DIAGASSERT(cp != NULL); 1483 1484 return(mcfind(cs, cp) != NULL); 1485 } 1486 1487 /* 1488 - mcfind - find a collating element in a cset 1489 == static char *mcfind(cset *cs, char *cp); 1490 */ 1491 static char * 1492 mcfind( 1493 cset *cs, 1494 char *cp) 1495 { 1496 char *p; 1497 1498 _DIAGASSERT(cs != NULL); 1499 _DIAGASSERT(cp != NULL); 1500 1501 if (cs->multis == NULL) 1502 return(NULL); 1503 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1504 if (strcmp(cp, p) == 0) 1505 return(p); 1506 return(NULL); 1507 } 1508 #endif 1509 1510 /* 1511 - mcinvert - invert the list of collating elements in a cset 1512 == static void mcinvert(struct parse *p, cset *cs); 1513 * 1514 * This would have to know the set of possibilities. Implementation 1515 * is deferred. 1516 */ 1517 /* ARGSUSED */ 1518 static void 1519 mcinvert( 1520 struct parse *p, 1521 cset *cs) 1522 { 1523 1524 _DIAGASSERT(p != NULL); 1525 _DIAGASSERT(cs != NULL); 1526 1527 assert(cs->multis == NULL); /* xxx */ 1528 } 1529 1530 /* 1531 - mccase - add case counterparts of the list of collating elements in a cset 1532 == static void mccase(struct parse *p, cset *cs); 1533 * 1534 * This would have to know the set of possibilities. Implementation 1535 * is deferred. 1536 */ 1537 /* ARGSUSED */ 1538 static void 1539 mccase( 1540 struct parse *p, 1541 cset *cs) 1542 { 1543 1544 _DIAGASSERT(p != NULL); 1545 _DIAGASSERT(cs != NULL); 1546 1547 assert(cs->multis == NULL); /* xxx */ 1548 } 1549 1550 /* 1551 - isinsets - is this character in any sets? 1552 == static int isinsets(struct re_guts *g, int c); 1553 */ 1554 static int /* predicate */ 1555 isinsets( 1556 struct re_guts *g, 1557 int c) 1558 { 1559 uch *col; 1560 size_t i; 1561 size_t ncols; 1562 unsigned uc = (unsigned char)c; 1563 1564 _DIAGASSERT(g != NULL); 1565 1566 if (g->setbits == NULL) 1567 return 0; 1568 1569 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1570 1571 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1572 if (col[uc] != 0) 1573 return(1); 1574 return(0); 1575 } 1576 1577 /* 1578 - samesets - are these two characters in exactly the same sets? 1579 == static int samesets(struct re_guts *g, int c1, int c2); 1580 */ 1581 static int /* predicate */ 1582 samesets( 1583 struct re_guts *g, 1584 int c1, 1585 int c2) 1586 { 1587 uch *col; 1588 size_t i; 1589 size_t ncols; 1590 unsigned uc1 = (unsigned char)c1; 1591 unsigned uc2 = (unsigned char)c2; 1592 1593 _DIAGASSERT(g != NULL); 1594 1595 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1596 1597 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1598 if (col[uc1] != col[uc2]) 1599 return(0); 1600 return(1); 1601 } 1602 1603 /* 1604 - categorize - sort out character categories 1605 == static void categorize(struct parse *p, struct re_guts *g); 1606 */ 1607 static void 1608 categorize( 1609 struct parse *p, 1610 struct re_guts *g) 1611 { 1612 cat_t *cats; 1613 int c; 1614 int c2; 1615 cat_t cat; 1616 1617 _DIAGASSERT(p != NULL); 1618 _DIAGASSERT(g != NULL); 1619 1620 cats = g->categories; 1621 1622 /* avoid making error situations worse */ 1623 if (p->error != 0) 1624 return; 1625 1626 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1627 if (cats[c] == 0 && isinsets(g, c)) { 1628 _DIAGASSERT(__type_fit(unsigned char, 1629 g->ncategories + 1)); 1630 cat = g->ncategories++; 1631 cats[c] = cat; 1632 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1633 if (cats[c2] == 0 && samesets(g, c, c2)) 1634 cats[c2] = cat; 1635 } 1636 } 1637 1638 /* 1639 - dupl - emit a duplicate of a bunch of sops 1640 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1641 */ 1642 static sopno /* start of duplicate */ 1643 dupl( 1644 struct parse *p, 1645 sopno start, /* from here */ 1646 sopno finish) /* to this less one */ 1647 { 1648 sopno ret; 1649 sopno len = finish - start; 1650 1651 _DIAGASSERT(p != NULL); 1652 1653 ret = HERE(); 1654 1655 assert(finish >= start); 1656 if (len == 0) 1657 return(ret); 1658 if (!enlarge(p, p->ssize + len))/* this many unexpected additions */ 1659 return ret; 1660 (void)memcpy(p->strip + p->slen, p->strip + start, 1661 (size_t)len * sizeof(sop)); 1662 p->slen += len; 1663 return(ret); 1664 } 1665 1666 /* 1667 - doemit - emit a strip operator 1668 == static void doemit(struct parse *p, sop op, size_t opnd); 1669 * 1670 * It might seem better to implement this as a macro with a function as 1671 * hard-case backup, but it's just too big and messy unless there are 1672 * some changes to the data structures. Maybe later. 1673 */ 1674 static void 1675 doemit( 1676 struct parse *p, 1677 sop op, 1678 sopno opnd) 1679 { 1680 _DIAGASSERT(p != NULL); 1681 1682 /* avoid making error situations worse */ 1683 if (p->error != 0) 1684 return; 1685 1686 /* deal with oversize operands ("can't happen", more or less) */ 1687 assert(opnd < 1<<OPSHIFT); 1688 1689 /* deal with undersized strip */ 1690 if (p->slen >= p->ssize) 1691 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1692 return; 1693 1694 /* finally, it's all reduced to the easy case */ 1695 p->strip[p->slen++] = (sop)SOP(op, opnd); 1696 } 1697 1698 /* 1699 - doinsert - insert a sop into the strip 1700 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1701 */ 1702 static void 1703 doinsert( 1704 struct parse *p, 1705 sop op, 1706 sopno opnd, 1707 sopno pos) 1708 { 1709 sopno sn; 1710 sop s; 1711 int i; 1712 1713 _DIAGASSERT(p != NULL); 1714 1715 /* avoid making error situations worse */ 1716 if (p->error != 0) 1717 return; 1718 1719 sn = HERE(); 1720 EMIT(op, opnd); /* do checks, ensure space */ 1721 assert(HERE() == sn+1); 1722 s = p->strip[sn]; 1723 1724 /* adjust paren pointers */ 1725 assert(pos > 0); 1726 for (i = 1; i < NPAREN; i++) { 1727 if (p->pbegin[i] >= pos) { 1728 p->pbegin[i]++; 1729 } 1730 if (p->pend[i] >= pos) { 1731 p->pend[i]++; 1732 } 1733 } 1734 1735 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1736 p->strip[pos] = s; 1737 } 1738 1739 /* 1740 - dofwd - complete a forward reference 1741 == static void dofwd(struct parse *p, sopno pos, sop value); 1742 */ 1743 static void 1744 dofwd( 1745 struct parse *p, 1746 sopno pos, 1747 sopno value) 1748 { 1749 1750 _DIAGASSERT(p != NULL); 1751 1752 /* avoid making error situations worse */ 1753 if (p->error != 0) 1754 return; 1755 1756 assert(value < 1<<OPSHIFT); 1757 p->strip[pos] = (sop)(OP(p->strip[pos]) | value); 1758 } 1759 1760 /* 1761 - enlarge - enlarge the strip 1762 == static void enlarge(struct parse *p, sopno size); 1763 */ 1764 static int 1765 enlarge(struct parse *p, sopno size) 1766 { 1767 _DIAGASSERT(p != NULL); 1768 1769 if (p->ssize >= size) 1770 return 1; 1771 1772 if (MEMSIZE(p) > MEMLIMIT || reallocarr(&p->strip, size, sizeof(sop))) { 1773 SETERROR(REG_ESPACE); 1774 return 0; 1775 } 1776 p->ssize = size; 1777 return 1; 1778 } 1779 1780 /* 1781 - stripsnug - compact the strip 1782 == static void stripsnug(struct parse *p, struct re_guts *g); 1783 */ 1784 static void 1785 stripsnug( 1786 struct parse *p, 1787 struct re_guts *g) 1788 { 1789 1790 _DIAGASSERT(p != NULL); 1791 _DIAGASSERT(g != NULL); 1792 1793 g->nstates = p->slen; 1794 g->strip = p->strip; 1795 reallocarr(&g->strip, p->slen, sizeof(sop)); 1796 /* Ignore error as tries to free memory only. */ 1797 } 1798 1799 /* 1800 - findmust - fill in must and mlen with longest mandatory literal string 1801 == static void findmust(struct parse *p, struct re_guts *g); 1802 * 1803 * This algorithm could do fancy things like analyzing the operands of | 1804 * for common subsequences. Someday. This code is simple and finds most 1805 * of the interesting cases. 1806 * 1807 * Note that must and mlen got initialized during setup. 1808 */ 1809 static void 1810 findmust( 1811 struct parse *p, 1812 struct re_guts *g) 1813 { 1814 sop *scan; 1815 sop *start = NULL; 1816 sop *newstart = NULL; 1817 sopno newlen; 1818 sop s; 1819 char *cp; 1820 sopno i; 1821 1822 _DIAGASSERT(p != NULL); 1823 _DIAGASSERT(g != NULL); 1824 1825 /* avoid making error situations worse */ 1826 if (p->error != 0) 1827 return; 1828 1829 /* find the longest OCHAR sequence in strip */ 1830 newlen = 0; 1831 scan = g->strip + 1; 1832 do { 1833 s = *scan++; 1834 switch (OP(s)) { 1835 case OCHAR: /* sequence member */ 1836 if (newlen == 0) /* new sequence */ 1837 newstart = scan - 1; 1838 newlen++; 1839 break; 1840 case OPLUS_: /* things that don't break one */ 1841 case OLPAREN: 1842 case ORPAREN: 1843 break; 1844 case OQUEST_: /* things that must be skipped */ 1845 case OCH_: 1846 scan--; 1847 do { 1848 scan += OPND(s); 1849 s = *scan; 1850 /* assert() interferes w debug printouts */ 1851 if (OP(s) != O_QUEST && OP(s) != O_CH && 1852 OP(s) != OOR2) { 1853 g->iflags |= BAD; 1854 return; 1855 } 1856 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1857 /* FALLTHROUGH */ 1858 default: /* things that break a sequence */ 1859 if (newlen > g->mlen) { /* ends one */ 1860 start = newstart; 1861 g->mlen = newlen; 1862 } 1863 newlen = 0; 1864 break; 1865 } 1866 } while (OP(s) != OEND); 1867 1868 if (start == NULL) 1869 g->mlen = 0; 1870 1871 if (g->mlen == 0) /* there isn't one */ 1872 return; 1873 1874 /* turn it into a character string */ 1875 g->must = malloc((size_t)g->mlen + 1); 1876 if (g->must == NULL) { /* argh; just forget it */ 1877 g->mlen = 0; 1878 return; 1879 } 1880 cp = g->must; 1881 scan = start; 1882 for (i = g->mlen; i > 0; i--) { 1883 while (OP(s = *scan++) != OCHAR) 1884 continue; 1885 assert(cp < g->must + g->mlen); 1886 *cp++ = (char)OPND(s); 1887 } 1888 assert(cp == g->must + g->mlen); 1889 *cp++ = '\0'; /* just on general principles */ 1890 } 1891 1892 /* 1893 - pluscount - count + nesting 1894 == static sopno pluscount(struct parse *p, struct re_guts *g); 1895 */ 1896 static sopno /* nesting depth */ 1897 pluscount( 1898 struct parse *p, 1899 struct re_guts *g) 1900 { 1901 sop *scan; 1902 sop s; 1903 sopno plusnest = 0; 1904 sopno maxnest = 0; 1905 1906 _DIAGASSERT(p != NULL); 1907 _DIAGASSERT(g != NULL); 1908 1909 if (p->error != 0) 1910 return(0); /* there may not be an OEND */ 1911 1912 scan = g->strip + 1; 1913 do { 1914 s = *scan++; 1915 switch (OP(s)) { 1916 case OPLUS_: 1917 plusnest++; 1918 break; 1919 case O_PLUS: 1920 if (plusnest > maxnest) 1921 maxnest = plusnest; 1922 plusnest--; 1923 break; 1924 } 1925 } while (OP(s) != OEND); 1926 if (plusnest != 0) 1927 g->iflags |= BAD; 1928 return(maxnest); 1929 } 1930