1 /* $NetBSD: regcomp.c,v 1.28 2007/02/09 23:44:18 junyoung Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 35 */ 36 37 /*- 38 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Henry Spencer. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 72 */ 73 74 #include <sys/cdefs.h> 75 #if defined(LIBC_SCCS) && !defined(lint) 76 #if 0 77 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 78 #else 79 __RCSID("$NetBSD: regcomp.c,v 1.28 2007/02/09 23:44:18 junyoung Exp $"); 80 #endif 81 #endif /* LIBC_SCCS and not lint */ 82 83 #include "namespace.h" 84 #include <sys/types.h> 85 86 #include <assert.h> 87 #include <ctype.h> 88 #include <limits.h> 89 #include <stdio.h> 90 #include <stdlib.h> 91 #include <string.h> 92 #include <regex.h> 93 94 #ifdef __weak_alias 95 __weak_alias(regcomp,_regcomp) 96 #endif 97 98 #include "utils.h" 99 #include "regex2.h" 100 101 #include "cclass.h" 102 #include "cname.h" 103 104 /* 105 * parse structure, passed up and down to avoid global variables and 106 * other clumsinesses 107 */ 108 struct parse { 109 const char *next; /* next character in RE */ 110 const char *end; /* end of string (-> NUL normally) */ 111 int error; /* has an error been seen? */ 112 sop *strip; /* malloced strip */ 113 sopno ssize; /* malloced strip size (allocated) */ 114 sopno slen; /* malloced strip length (used) */ 115 int ncsalloc; /* number of csets allocated */ 116 struct re_guts *g; 117 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 118 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 119 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 120 }; 121 122 /* ========= begin header generated by ./mkh ========= */ 123 #ifdef __cplusplus 124 extern "C" { 125 #endif 126 127 /* === regcomp.c === */ 128 static void p_ere(struct parse *p, int stop); 129 static void p_ere_exp(struct parse *p); 130 static void p_str(struct parse *p); 131 static void p_bre(struct parse *p, int end1, int end2); 132 static int p_simp_re(struct parse *p, int starordinary); 133 static int p_count(struct parse *p); 134 static void p_bracket(struct parse *p); 135 static void p_b_term(struct parse *p, cset *cs); 136 static void p_b_cclass(struct parse *p, cset *cs); 137 static void p_b_eclass(struct parse *p, cset *cs); 138 static char p_b_symbol(struct parse *p); 139 static char p_b_coll_elem(struct parse *p, int endc); 140 static int othercase(int ch); 141 static void bothcases(struct parse *p, int ch); 142 static void ordinary(struct parse *p, int ch); 143 static void nonnewline(struct parse *p); 144 static void repeat(struct parse *p, sopno start, int from, int to); 145 static int seterr(struct parse *p, int e); 146 static cset *allocset(struct parse *p); 147 static void freeset(struct parse *p, cset *cs); 148 static int freezeset(struct parse *p, cset *cs); 149 static int firstch(struct parse *p, cset *cs); 150 static int nch(struct parse *p, cset *cs); 151 static void mcadd(struct parse *p, cset *cs, const char *cp); 152 #if 0 153 static void mcsub(cset *cs, char *cp); 154 static int mcin(cset *cs, char *cp); 155 static char *mcfind(cset *cs, char *cp); 156 #endif 157 static void mcinvert(struct parse *p, cset *cs); 158 static void mccase(struct parse *p, cset *cs); 159 static int isinsets(struct re_guts *g, int c); 160 static int samesets(struct re_guts *g, int c1, int c2); 161 static void categorize(struct parse *p, struct re_guts *g); 162 static sopno dupl(struct parse *p, sopno start, sopno finish); 163 static void doemit(struct parse *p, sop op, sopno opnd); 164 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos); 165 static void dofwd(struct parse *p, sopno pos, sopno value); 166 static void enlarge(struct parse *p, sopno size); 167 static void stripsnug(struct parse *p, struct re_guts *g); 168 static void findmust(struct parse *p, struct re_guts *g); 169 static sopno pluscount(struct parse *p, struct re_guts *g); 170 171 #ifdef __cplusplus 172 } 173 #endif 174 /* ========= end header generated by ./mkh ========= */ 175 176 static char nuls[10]; /* place to point scanner in event of error */ 177 178 /* 179 * macros for use with parse structure 180 * BEWARE: these know that the parse structure is named `p' !!! 181 */ 182 #define PEEK() (*p->next) 183 #define PEEK2() (*(p->next+1)) 184 #define MORE() (p->next < p->end) 185 #define MORE2() (p->next+1 < p->end) 186 #define SEE(c) (MORE() && PEEK() == (c)) 187 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 188 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 189 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 190 #define NEXT() (p->next++) 191 #define NEXT2() (p->next += 2) 192 #define NEXTn(n) (p->next += (n)) 193 #define GETNEXT() (*p->next++) 194 #define SETERROR(e) seterr(p, (e)) 195 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 196 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 197 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 198 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 199 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 200 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 201 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 202 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 203 #define HERE() (p->slen) 204 #define THERE() (p->slen - 1) 205 #define THERETHERE() (p->slen - 2) 206 #define DROP(n) (p->slen -= (n)) 207 208 #ifndef NDEBUG 209 static int never = 0; /* for use in asserts; shuts lint up */ 210 #else 211 #define never 0 /* some <assert.h>s have bugs too */ 212 #endif 213 214 /* 215 - regcomp - interface for parser and compilation 216 = extern int regcomp(regex_t *, const char *, int); 217 = #define REG_BASIC 0000 218 = #define REG_EXTENDED 0001 219 = #define REG_ICASE 0002 220 = #define REG_NOSUB 0004 221 = #define REG_NEWLINE 0010 222 = #define REG_NOSPEC 0020 223 = #define REG_PEND 0040 224 = #define REG_DUMP 0200 225 */ 226 int /* 0 success, otherwise REG_something */ 227 regcomp( 228 regex_t *preg, 229 const char *pattern, 230 int cflags) 231 { 232 struct parse pa; 233 struct re_guts *g; 234 struct parse *p = &pa; 235 int i; 236 size_t len; 237 #ifdef REDEBUG 238 # define GOODFLAGS(f) (f) 239 #else 240 # define GOODFLAGS(f) ((f)&~REG_DUMP) 241 #endif 242 243 _DIAGASSERT(preg != NULL); 244 _DIAGASSERT(pattern != NULL); 245 246 cflags = GOODFLAGS(cflags); 247 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 248 return(REG_INVARG); 249 250 if (cflags®_PEND) { 251 if (preg->re_endp < pattern) 252 return(REG_INVARG); 253 len = preg->re_endp - pattern; 254 } else 255 len = strlen(pattern); 256 257 /* do the mallocs early so failure handling is easy */ 258 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 259 (NC-1)*sizeof(cat_t)); 260 if (g == NULL) 261 return(REG_ESPACE); 262 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 263 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 264 p->slen = 0; 265 if (p->strip == NULL) { 266 free(g); 267 return(REG_ESPACE); 268 } 269 270 /* set things up */ 271 p->g = g; 272 p->next = pattern; 273 p->end = p->next + len; 274 p->error = 0; 275 p->ncsalloc = 0; 276 for (i = 0; i < NPAREN; i++) { 277 p->pbegin[i] = 0; 278 p->pend[i] = 0; 279 } 280 g->csetsize = NC; 281 g->sets = NULL; 282 g->setbits = NULL; 283 g->ncsets = 0; 284 g->cflags = cflags; 285 g->iflags = 0; 286 g->nbol = 0; 287 g->neol = 0; 288 g->must = NULL; 289 g->mlen = 0; 290 g->nsub = 0; 291 g->ncategories = 1; /* category 0 is "everything else" */ 292 g->categories = &g->catspace[-(CHAR_MIN)]; 293 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 294 g->backrefs = 0; 295 296 /* do it */ 297 EMIT(OEND, 0); 298 g->firststate = THERE(); 299 if (cflags®_EXTENDED) 300 p_ere(p, OUT); 301 else if (cflags®_NOSPEC) 302 p_str(p); 303 else 304 p_bre(p, OUT, OUT); 305 EMIT(OEND, 0); 306 g->laststate = THERE(); 307 308 /* tidy up loose ends and fill things in */ 309 categorize(p, g); 310 stripsnug(p, g); 311 findmust(p, g); 312 g->nplus = pluscount(p, g); 313 g->magic = MAGIC2; 314 preg->re_nsub = g->nsub; 315 preg->re_g = g; 316 preg->re_magic = MAGIC1; 317 #ifndef REDEBUG 318 /* not debugging, so can't rely on the assert() in regexec() */ 319 if (g->iflags&BAD) 320 SETERROR(REG_ASSERT); 321 #endif 322 323 /* win or lose, we're done */ 324 if (p->error != 0) /* lose */ 325 regfree(preg); 326 return(p->error); 327 } 328 329 /* 330 - p_ere - ERE parser top level, concatenation and alternation 331 == static void p_ere(struct parse *p, int stop); 332 */ 333 static void 334 p_ere( 335 struct parse *p, 336 int stop) /* character this ERE should end at */ 337 { 338 char c; 339 sopno prevback = 0; /* pacify gcc */ 340 sopno prevfwd = 0; /* pacify gcc */ 341 sopno conc; 342 int first = 1; /* is this the first alternative? */ 343 344 _DIAGASSERT(p != NULL); 345 346 for (;;) { 347 /* do a bunch of concatenated expressions */ 348 conc = HERE(); 349 while (MORE() && (c = PEEK()) != '|' && c != stop) 350 p_ere_exp(p); 351 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 352 353 if (!EAT('|')) 354 break; /* NOTE BREAK OUT */ 355 356 if (first) { 357 INSERT(OCH_, conc); /* offset is wrong */ 358 prevfwd = conc; 359 prevback = conc; 360 first = 0; 361 } 362 ASTERN(OOR1, prevback); 363 prevback = THERE(); 364 AHEAD(prevfwd); /* fix previous offset */ 365 prevfwd = HERE(); 366 EMIT(OOR2, 0); /* offset is very wrong */ 367 } 368 369 if (!first) { /* tail-end fixups */ 370 AHEAD(prevfwd); 371 ASTERN(O_CH, prevback); 372 } 373 374 assert(!MORE() || SEE(stop)); 375 } 376 377 /* 378 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 379 == static void p_ere_exp(struct parse *p); 380 */ 381 static void 382 p_ere_exp( 383 struct parse *p) 384 { 385 char c; 386 sopno pos; 387 int count; 388 int count2; 389 sopno subno; 390 int wascaret = 0; 391 392 _DIAGASSERT(p != NULL); 393 394 assert(MORE()); /* caller should have ensured this */ 395 c = GETNEXT(); 396 397 pos = HERE(); 398 switch (c) { 399 case '(': 400 REQUIRE(MORE(), REG_EPAREN); 401 p->g->nsub++; 402 subno = p->g->nsub; 403 if (subno < NPAREN) 404 p->pbegin[subno] = HERE(); 405 EMIT(OLPAREN, subno); 406 if (!SEE(')')) 407 p_ere(p, ')'); 408 if (subno < NPAREN) { 409 p->pend[subno] = HERE(); 410 assert(p->pend[subno] != 0); 411 } 412 EMIT(ORPAREN, subno); 413 MUSTEAT(')', REG_EPAREN); 414 break; 415 #ifndef POSIX_MISTAKE 416 case ')': /* happens only if no current unmatched ( */ 417 /* 418 * You may ask, why the ifndef? Because I didn't notice 419 * this until slightly too late for 1003.2, and none of the 420 * other 1003.2 regular-expression reviewers noticed it at 421 * all. So an unmatched ) is legal POSIX, at least until 422 * we can get it fixed. 423 */ 424 SETERROR(REG_EPAREN); 425 break; 426 #endif 427 case '^': 428 EMIT(OBOL, 0); 429 p->g->iflags |= USEBOL; 430 p->g->nbol++; 431 wascaret = 1; 432 break; 433 case '$': 434 EMIT(OEOL, 0); 435 p->g->iflags |= USEEOL; 436 p->g->neol++; 437 break; 438 case '|': 439 SETERROR(REG_EMPTY); 440 break; 441 case '*': 442 case '+': 443 case '?': 444 SETERROR(REG_BADRPT); 445 break; 446 case '.': 447 if (p->g->cflags®_NEWLINE) 448 nonnewline(p); 449 else 450 EMIT(OANY, 0); 451 break; 452 case '[': 453 p_bracket(p); 454 break; 455 case '\\': 456 REQUIRE(MORE(), REG_EESCAPE); 457 c = GETNEXT(); 458 ordinary(p, c); 459 break; 460 case '{': /* okay as ordinary except if digit follows */ 461 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT); 462 /* FALLTHROUGH */ 463 default: 464 ordinary(p, c); 465 break; 466 } 467 468 if (!MORE()) 469 return; 470 c = PEEK(); 471 /* we call { a repetition if followed by a digit */ 472 if (!( c == '*' || c == '+' || c == '?' || 473 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) )) 474 return; /* no repetition, we're done */ 475 NEXT(); 476 477 REQUIRE(!wascaret, REG_BADRPT); 478 switch (c) { 479 case '*': /* implemented as +? */ 480 /* this case does not require the (y|) trick, noKLUDGE */ 481 INSERT(OPLUS_, pos); 482 ASTERN(O_PLUS, pos); 483 INSERT(OQUEST_, pos); 484 ASTERN(O_QUEST, pos); 485 break; 486 case '+': 487 INSERT(OPLUS_, pos); 488 ASTERN(O_PLUS, pos); 489 break; 490 case '?': 491 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 492 INSERT(OCH_, pos); /* offset slightly wrong */ 493 ASTERN(OOR1, pos); /* this one's right */ 494 AHEAD(pos); /* fix the OCH_ */ 495 EMIT(OOR2, 0); /* offset very wrong... */ 496 AHEAD(THERE()); /* ...so fix it */ 497 ASTERN(O_CH, THERETHERE()); 498 break; 499 case '{': 500 count = p_count(p); 501 if (EAT(',')) { 502 if (isdigit((unsigned char)PEEK())) { 503 count2 = p_count(p); 504 REQUIRE(count <= count2, REG_BADBR); 505 } else /* single number with comma */ 506 count2 = INFINITY; 507 } else /* just a single number */ 508 count2 = count; 509 repeat(p, pos, count, count2); 510 if (!EAT('}')) { /* error heuristics */ 511 while (MORE() && PEEK() != '}') 512 NEXT(); 513 REQUIRE(MORE(), REG_EBRACE); 514 SETERROR(REG_BADBR); 515 } 516 break; 517 } 518 519 if (!MORE()) 520 return; 521 c = PEEK(); 522 if (!( c == '*' || c == '+' || c == '?' || 523 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) ) 524 return; 525 SETERROR(REG_BADRPT); 526 } 527 528 /* 529 - p_str - string (no metacharacters) "parser" 530 == static void p_str(struct parse *p); 531 */ 532 static void 533 p_str( 534 struct parse *p) 535 { 536 537 _DIAGASSERT(p != NULL); 538 539 REQUIRE(MORE(), REG_EMPTY); 540 while (MORE()) 541 ordinary(p, GETNEXT()); 542 } 543 544 /* 545 - p_bre - BRE parser top level, anchoring and concatenation 546 == static void p_bre(struct parse *p, int end1, \ 547 == int end2); 548 * Giving end1 as OUT essentially eliminates the end1/end2 check. 549 * 550 * This implementation is a bit of a kludge, in that a trailing $ is first 551 * taken as an ordinary character and then revised to be an anchor. The 552 * only undesirable side effect is that '$' gets included as a character 553 * category in such cases. This is fairly harmless; not worth fixing. 554 * The amount of lookahead needed to avoid this kludge is excessive. 555 */ 556 static void 557 p_bre( 558 struct parse *p, 559 int end1, /* first terminating character */ 560 int end2) /* second terminating character */ 561 { 562 sopno start; 563 int first = 1; /* first subexpression? */ 564 int wasdollar = 0; 565 566 _DIAGASSERT(p != NULL); 567 568 start = HERE(); 569 570 if (EAT('^')) { 571 EMIT(OBOL, 0); 572 p->g->iflags |= USEBOL; 573 p->g->nbol++; 574 } 575 while (MORE() && !SEETWO(end1, end2)) { 576 wasdollar = p_simp_re(p, first); 577 first = 0; 578 } 579 if (wasdollar) { /* oops, that was a trailing anchor */ 580 DROP(1); 581 EMIT(OEOL, 0); 582 p->g->iflags |= USEEOL; 583 p->g->neol++; 584 } 585 586 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 587 } 588 589 /* 590 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 591 == static int p_simp_re(struct parse *p, int starordinary); 592 */ 593 static int /* was the simple RE an unbackslashed $? */ 594 p_simp_re( 595 struct parse *p, 596 int starordinary) /* is a leading * an ordinary character? */ 597 { 598 int c; 599 int count; 600 int count2; 601 sopno pos; 602 int i; 603 sopno subno; 604 # define BACKSL (1<<CHAR_BIT) 605 606 _DIAGASSERT(p != NULL); 607 608 pos = HERE(); /* repetion op, if any, covers from here */ 609 610 assert(MORE()); /* caller should have ensured this */ 611 c = GETNEXT(); 612 if (c == '\\') { 613 REQUIRE(MORE(), REG_EESCAPE); 614 c = BACKSL | (unsigned char)GETNEXT(); 615 } 616 switch (c) { 617 case '.': 618 if (p->g->cflags®_NEWLINE) 619 nonnewline(p); 620 else 621 EMIT(OANY, 0); 622 break; 623 case '[': 624 p_bracket(p); 625 break; 626 case BACKSL|'{': 627 SETERROR(REG_BADRPT); 628 break; 629 case BACKSL|'(': 630 p->g->nsub++; 631 subno = p->g->nsub; 632 if (subno < NPAREN) 633 p->pbegin[subno] = HERE(); 634 EMIT(OLPAREN, subno); 635 /* the MORE here is an error heuristic */ 636 if (MORE() && !SEETWO('\\', ')')) 637 p_bre(p, '\\', ')'); 638 if (subno < NPAREN) { 639 p->pend[subno] = HERE(); 640 assert(p->pend[subno] != 0); 641 } 642 EMIT(ORPAREN, subno); 643 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 644 break; 645 case BACKSL|')': /* should not get here -- must be user */ 646 case BACKSL|'}': 647 SETERROR(REG_EPAREN); 648 break; 649 case BACKSL|'1': 650 case BACKSL|'2': 651 case BACKSL|'3': 652 case BACKSL|'4': 653 case BACKSL|'5': 654 case BACKSL|'6': 655 case BACKSL|'7': 656 case BACKSL|'8': 657 case BACKSL|'9': 658 i = (c&~BACKSL) - '0'; 659 assert(i < NPAREN); 660 if (p->pend[i] != 0) { 661 assert(i <= p->g->nsub); 662 EMIT(OBACK_, i); 663 assert(p->pbegin[i] != 0); 664 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 665 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 666 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 667 EMIT(O_BACK, i); 668 } else 669 SETERROR(REG_ESUBREG); 670 p->g->backrefs = 1; 671 break; 672 case '*': 673 REQUIRE(starordinary, REG_BADRPT); 674 /* FALLTHROUGH */ 675 default: 676 ordinary(p, c &~ BACKSL); 677 break; 678 } 679 680 if (EAT('*')) { /* implemented as +? */ 681 /* this case does not require the (y|) trick, noKLUDGE */ 682 INSERT(OPLUS_, pos); 683 ASTERN(O_PLUS, pos); 684 INSERT(OQUEST_, pos); 685 ASTERN(O_QUEST, pos); 686 } else if (EATTWO('\\', '{')) { 687 count = p_count(p); 688 if (EAT(',')) { 689 if (MORE() && isdigit((unsigned char)PEEK())) { 690 count2 = p_count(p); 691 REQUIRE(count <= count2, REG_BADBR); 692 } else /* single number with comma */ 693 count2 = INFINITY; 694 } else /* just a single number */ 695 count2 = count; 696 repeat(p, pos, count, count2); 697 if (!EATTWO('\\', '}')) { /* error heuristics */ 698 while (MORE() && !SEETWO('\\', '}')) 699 NEXT(); 700 REQUIRE(MORE(), REG_EBRACE); 701 SETERROR(REG_BADBR); 702 } 703 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 704 return(1); 705 706 return(0); 707 } 708 709 /* 710 - p_count - parse a repetition count 711 == static int p_count(struct parse *p); 712 */ 713 static int /* the value */ 714 p_count( 715 struct parse *p) 716 { 717 int count = 0; 718 int ndigits = 0; 719 720 _DIAGASSERT(p != NULL); 721 722 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) { 723 count = count*10 + (GETNEXT() - '0'); 724 ndigits++; 725 } 726 727 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 728 return(count); 729 } 730 731 /* 732 - p_bracket - parse a bracketed character list 733 == static void p_bracket(struct parse *p); 734 * 735 * Note a significant property of this code: if the allocset() did SETERROR, 736 * no set operations are done. 737 */ 738 static void 739 p_bracket( 740 struct parse *p) 741 { 742 cset *cs; 743 int invert = 0; 744 745 _DIAGASSERT(p != NULL); 746 747 cs = allocset(p); 748 749 /* Dept of Truly Sickening Special-Case Kludges */ 750 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 751 (size_t)6) == 0) { 752 EMIT(OBOW, 0); 753 NEXTn(6); 754 return; 755 } 756 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 757 (size_t)6) == 0) { 758 EMIT(OEOW, 0); 759 NEXTn(6); 760 return; 761 } 762 763 if (EAT('^')) 764 invert++; /* make note to invert set at end */ 765 if (EAT(']')) 766 CHadd(cs, ']'); 767 else if (EAT('-')) 768 CHadd(cs, '-'); 769 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 770 p_b_term(p, cs); 771 if (EAT('-')) 772 CHadd(cs, '-'); 773 MUSTEAT(']', REG_EBRACK); 774 775 if (p->error != 0) /* don't mess things up further */ 776 return; 777 778 if (p->g->cflags®_ICASE) { 779 int i; 780 int ci; 781 782 for (i = p->g->csetsize - 1; i >= 0; i--) 783 if (CHIN(cs, i) && isalpha(i)) { 784 ci = othercase(i); 785 if (ci != i) 786 CHadd(cs, ci); 787 } 788 if (cs->multis != NULL) 789 mccase(p, cs); 790 } 791 if (invert) { 792 int i; 793 794 for (i = p->g->csetsize - 1; i >= 0; i--) 795 if (CHIN(cs, i)) 796 CHsub(cs, i); 797 else 798 CHadd(cs, i); 799 if (p->g->cflags®_NEWLINE) 800 CHsub(cs, '\n'); 801 if (cs->multis != NULL) 802 mcinvert(p, cs); 803 } 804 805 assert(cs->multis == NULL); /* xxx */ 806 807 if (nch(p, cs) == 1) { /* optimize singleton sets */ 808 ordinary(p, firstch(p, cs)); 809 freeset(p, cs); 810 } else 811 EMIT(OANYOF, freezeset(p, cs)); 812 } 813 814 /* 815 - p_b_term - parse one term of a bracketed character list 816 == static void p_b_term(struct parse *p, cset *cs); 817 */ 818 static void 819 p_b_term( 820 struct parse *p, 821 cset *cs) 822 { 823 char c; 824 char start, finish; 825 int i; 826 827 _DIAGASSERT(p != NULL); 828 _DIAGASSERT(cs != NULL); 829 830 /* classify what we've got */ 831 switch ((MORE()) ? PEEK() : '\0') { 832 case '[': 833 c = (MORE2()) ? PEEK2() : '\0'; 834 break; 835 836 case '-': 837 SETERROR(REG_ERANGE); 838 return; /* NOTE RETURN */ 839 840 default: 841 c = '\0'; 842 break; 843 } 844 845 switch (c) { 846 case ':': /* character class */ 847 NEXT2(); 848 REQUIRE(MORE(), REG_EBRACK); 849 c = PEEK(); 850 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 851 p_b_cclass(p, cs); 852 REQUIRE(MORE(), REG_EBRACK); 853 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 854 break; 855 case '=': /* equivalence class */ 856 NEXT2(); 857 REQUIRE(MORE(), REG_EBRACK); 858 c = PEEK(); 859 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 860 p_b_eclass(p, cs); 861 REQUIRE(MORE(), REG_EBRACK); 862 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 863 break; 864 default: /* symbol, ordinary character, or range */ 865 /* xxx revision needed for multichar stuff */ 866 start = p_b_symbol(p); 867 if (SEE('-') && MORE2() && PEEK2() != ']') { 868 /* range */ 869 NEXT(); 870 if (EAT('-')) 871 finish = '-'; 872 else 873 finish = p_b_symbol(p); 874 } else 875 finish = start; 876 /* xxx what about signed chars here... */ 877 REQUIRE(start <= finish, REG_ERANGE); 878 for (i = start; i <= finish; i++) 879 CHadd(cs, i); 880 break; 881 } 882 } 883 884 /* 885 - p_b_cclass - parse a character-class name and deal with it 886 == static void p_b_cclass(struct parse *p, cset *cs); 887 */ 888 static void 889 p_b_cclass( 890 struct parse *p, 891 cset *cs) 892 { 893 const char *sp; 894 const struct cclass *cp; 895 size_t len; 896 const char *u; 897 char c; 898 899 _DIAGASSERT(p != NULL); 900 _DIAGASSERT(cs != NULL); 901 902 sp = p->next; 903 904 while (MORE() && isalpha((unsigned char)PEEK())) 905 NEXT(); 906 len = p->next - sp; 907 for (cp = cclasses; cp->name != NULL; cp++) 908 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 909 break; 910 if (cp->name == NULL) { 911 /* oops, didn't find it */ 912 SETERROR(REG_ECTYPE); 913 return; 914 } 915 916 u = cp->chars; 917 while ((c = *u++) != '\0') 918 CHadd(cs, c); 919 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 920 MCadd(p, cs, u); 921 } 922 923 /* 924 - p_b_eclass - parse an equivalence-class name and deal with it 925 == static void p_b_eclass(struct parse *p, cset *cs); 926 * 927 * This implementation is incomplete. xxx 928 */ 929 static void 930 p_b_eclass( 931 struct parse *p, 932 cset *cs) 933 { 934 char c; 935 936 _DIAGASSERT(p != NULL); 937 _DIAGASSERT(cs != NULL); 938 939 c = p_b_coll_elem(p, '='); 940 CHadd(cs, c); 941 } 942 943 /* 944 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 945 == static char p_b_symbol(struct parse *p); 946 */ 947 static char /* value of symbol */ 948 p_b_symbol( 949 struct parse *p) 950 { 951 char value; 952 953 _DIAGASSERT(p != NULL); 954 955 REQUIRE(MORE(), REG_EBRACK); 956 if (!EATTWO('[', '.')) 957 return(GETNEXT()); 958 959 /* collating symbol */ 960 value = p_b_coll_elem(p, '.'); 961 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 962 return(value); 963 } 964 965 /* 966 - p_b_coll_elem - parse a collating-element name and look it up 967 == static char p_b_coll_elem(struct parse *p, int endc); 968 */ 969 static char /* value of collating element */ 970 p_b_coll_elem( 971 struct parse *p, 972 int endc) /* name ended by endc,']' */ 973 { 974 const char *sp; 975 const struct cname *cp; 976 size_t len; 977 978 _DIAGASSERT(p != NULL); 979 980 sp = p->next; 981 982 while (MORE() && !SEETWO(endc, ']')) 983 NEXT(); 984 if (!MORE()) { 985 SETERROR(REG_EBRACK); 986 return(0); 987 } 988 len = p->next - sp; 989 for (cp = cnames; cp->name != NULL; cp++) 990 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 991 return(cp->code); /* known name */ 992 if (len == 1) 993 return(*sp); /* single character */ 994 SETERROR(REG_ECOLLATE); /* neither */ 995 return(0); 996 } 997 998 /* 999 - othercase - return the case counterpart of an alphabetic 1000 == static int othercase(int ch); 1001 */ 1002 static int /* if no counterpart, return ch */ 1003 othercase( 1004 int ch) 1005 { 1006 assert(isalpha(ch)); 1007 if (isupper(ch)) 1008 return(tolower(ch)); 1009 else if (islower(ch)) 1010 return(toupper(ch)); 1011 else /* peculiar, but could happen */ 1012 return(ch); 1013 } 1014 1015 /* 1016 - bothcases - emit a dualcase version of a two-case character 1017 == static void bothcases(struct parse *p, int ch); 1018 * 1019 * Boy, is this implementation ever a kludge... 1020 */ 1021 static void 1022 bothcases( 1023 struct parse *p, 1024 int ch) 1025 { 1026 const char *oldnext; 1027 const char *oldend; 1028 char bracket[3]; 1029 1030 _DIAGASSERT(p != NULL); 1031 1032 oldnext = p->next; 1033 oldend = p->end; 1034 1035 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1036 p->next = bracket; 1037 p->end = bracket+2; 1038 bracket[0] = ch; 1039 bracket[1] = ']'; 1040 bracket[2] = '\0'; 1041 p_bracket(p); 1042 assert(p->next == bracket+2); 1043 p->next = oldnext; 1044 p->end = oldend; 1045 } 1046 1047 /* 1048 - ordinary - emit an ordinary character 1049 == static void ordinary(struct parse *p, int ch); 1050 */ 1051 static void 1052 ordinary( 1053 struct parse *p, 1054 int ch) 1055 { 1056 cat_t *cap; 1057 1058 _DIAGASSERT(p != NULL); 1059 1060 cap = p->g->categories; 1061 if ((p->g->cflags®_ICASE) && isalpha((unsigned char) ch) 1062 && othercase((unsigned char) ch) != (unsigned char) ch) 1063 bothcases(p, (unsigned char) ch); 1064 else { 1065 EMIT(OCHAR, (unsigned char)ch); 1066 if (cap[ch] == 0) 1067 cap[ch] = p->g->ncategories++; 1068 } 1069 } 1070 1071 /* 1072 - nonnewline - emit REG_NEWLINE version of OANY 1073 == static void nonnewline(struct parse *p); 1074 * 1075 * Boy, is this implementation ever a kludge... 1076 */ 1077 static void 1078 nonnewline( 1079 struct parse *p) 1080 { 1081 const char *oldnext; 1082 const char *oldend; 1083 char bracket[4]; 1084 1085 _DIAGASSERT(p != NULL); 1086 1087 oldnext = p->next; 1088 oldend = p->end; 1089 1090 p->next = bracket; 1091 p->end = bracket+3; 1092 bracket[0] = '^'; 1093 bracket[1] = '\n'; 1094 bracket[2] = ']'; 1095 bracket[3] = '\0'; 1096 p_bracket(p); 1097 assert(p->next == bracket+3); 1098 p->next = oldnext; 1099 p->end = oldend; 1100 } 1101 1102 /* 1103 - repeat - generate code for a bounded repetition, recursively if needed 1104 == static void repeat(struct parse *p, sopno start, int from, int to); 1105 */ 1106 static void 1107 repeat( 1108 struct parse *p, 1109 sopno start, /* operand from here to end of strip */ 1110 int from, /* repeated from this number */ 1111 int to) /* to this number of times (maybe INFINITY) */ 1112 { 1113 sopno finish; 1114 # define N 2 1115 # define INF 3 1116 # define REP(f, t) ((f)*8 + (t)) 1117 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1118 sopno copy; 1119 1120 _DIAGASSERT(p != NULL); 1121 1122 finish = HERE(); 1123 1124 if (p->error != 0) /* head off possible runaway recursion */ 1125 return; 1126 1127 assert(from <= to); 1128 1129 switch (REP(MAP(from), MAP(to))) { 1130 case REP(0, 0): /* must be user doing this */ 1131 DROP(finish-start); /* drop the operand */ 1132 break; 1133 case REP(0, 1): /* as x{1,1}? */ 1134 case REP(0, N): /* as x{1,n}? */ 1135 case REP(0, INF): /* as x{1,}? */ 1136 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1137 INSERT(OCH_, start); /* offset is wrong... */ 1138 repeat(p, start+1, 1, to); 1139 ASTERN(OOR1, start); 1140 AHEAD(start); /* ... fix it */ 1141 EMIT(OOR2, 0); 1142 AHEAD(THERE()); 1143 ASTERN(O_CH, THERETHERE()); 1144 break; 1145 case REP(1, 1): /* trivial case */ 1146 /* done */ 1147 break; 1148 case REP(1, N): /* as x?x{1,n-1} */ 1149 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1150 INSERT(OCH_, start); 1151 ASTERN(OOR1, start); 1152 AHEAD(start); 1153 EMIT(OOR2, 0); /* offset very wrong... */ 1154 AHEAD(THERE()); /* ...so fix it */ 1155 ASTERN(O_CH, THERETHERE()); 1156 copy = dupl(p, start+1, finish+1); 1157 assert(copy == finish+4); 1158 repeat(p, copy, 1, to-1); 1159 break; 1160 case REP(1, INF): /* as x+ */ 1161 INSERT(OPLUS_, start); 1162 ASTERN(O_PLUS, start); 1163 break; 1164 case REP(N, N): /* as xx{m-1,n-1} */ 1165 copy = dupl(p, start, finish); 1166 repeat(p, copy, from-1, to-1); 1167 break; 1168 case REP(N, INF): /* as xx{n-1,INF} */ 1169 copy = dupl(p, start, finish); 1170 repeat(p, copy, from-1, to); 1171 break; 1172 default: /* "can't happen" */ 1173 SETERROR(REG_ASSERT); /* just in case */ 1174 break; 1175 } 1176 } 1177 1178 /* 1179 - seterr - set an error condition 1180 == static int seterr(struct parse *p, int e); 1181 */ 1182 static int /* useless but makes type checking happy */ 1183 seterr( 1184 struct parse *p, 1185 int e) 1186 { 1187 1188 _DIAGASSERT(p != NULL); 1189 1190 if (p->error == 0) /* keep earliest error condition */ 1191 p->error = e; 1192 p->next = nuls; /* try to bring things to a halt */ 1193 p->end = nuls; 1194 return(0); /* make the return value well-defined */ 1195 } 1196 1197 /* 1198 - allocset - allocate a set of characters for [] 1199 == static cset *allocset(struct parse *p); 1200 */ 1201 static cset * 1202 allocset( 1203 struct parse *p) 1204 { 1205 int no; 1206 size_t nc; 1207 size_t nbytes; 1208 cset *cs; 1209 size_t css; 1210 int i; 1211 1212 _DIAGASSERT(p != NULL); 1213 1214 no = p->g->ncsets++; 1215 css = (size_t)p->g->csetsize; 1216 if (no >= p->ncsalloc) { /* need another column of space */ 1217 p->ncsalloc += CHAR_BIT; 1218 nc = p->ncsalloc; 1219 assert(nc % CHAR_BIT == 0); 1220 nbytes = nc / CHAR_BIT * css; 1221 if (p->g->sets == NULL) 1222 p->g->sets = malloc(nc * sizeof(cset)); 1223 else 1224 p->g->sets = realloc(p->g->sets, nc * sizeof(cset)); 1225 if (p->g->setbits == NULL) 1226 p->g->setbits = malloc(nbytes); 1227 else { 1228 p->g->setbits = realloc(p->g->setbits, nbytes); 1229 /* xxx this isn't right if setbits is now NULL */ 1230 for (i = 0; i < no; i++) 1231 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1232 } 1233 if (p->g->sets != NULL && p->g->setbits != NULL) 1234 (void) memset((char *)p->g->setbits + (nbytes - css), 1235 0, css); 1236 else { 1237 no = 0; 1238 SETERROR(REG_ESPACE); 1239 /* caller's responsibility not to do set ops */ 1240 } 1241 } 1242 1243 assert(p->g->sets != NULL); /* xxx */ 1244 cs = &p->g->sets[no]; 1245 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1246 cs->mask = 1 << ((no) % CHAR_BIT); 1247 cs->hash = 0; 1248 cs->smultis = 0; 1249 cs->multis = NULL; 1250 1251 return(cs); 1252 } 1253 1254 /* 1255 - freeset - free a now-unused set 1256 == static void freeset(struct parse *p, cset *cs); 1257 */ 1258 static void 1259 freeset( 1260 struct parse *p, 1261 cset *cs) 1262 { 1263 int i; 1264 cset *top; 1265 size_t css; 1266 1267 _DIAGASSERT(p != NULL); 1268 _DIAGASSERT(cs != NULL); 1269 1270 top = &p->g->sets[p->g->ncsets]; 1271 css = (size_t)p->g->csetsize; 1272 1273 for (i = 0; i < css; i++) 1274 CHsub(cs, i); 1275 if (cs == top-1) /* recover only the easy case */ 1276 p->g->ncsets--; 1277 } 1278 1279 /* 1280 - freezeset - final processing on a set of characters 1281 == static int freezeset(struct parse *p, cset *cs); 1282 * 1283 * The main task here is merging identical sets. This is usually a waste 1284 * of time (although the hash code minimizes the overhead), but can win 1285 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1286 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1287 * the same value! 1288 */ 1289 static int /* set number */ 1290 freezeset( 1291 struct parse *p, 1292 cset *cs) 1293 { 1294 uch h; 1295 int i; 1296 cset *top; 1297 cset *cs2; 1298 size_t css; 1299 1300 _DIAGASSERT(p != NULL); 1301 _DIAGASSERT(cs != NULL); 1302 1303 h = cs->hash; 1304 top = &p->g->sets[p->g->ncsets]; 1305 css = (size_t)p->g->csetsize; 1306 1307 /* look for an earlier one which is the same */ 1308 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1309 if (cs2->hash == h && cs2 != cs) { 1310 /* maybe */ 1311 for (i = 0; i < css; i++) 1312 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1313 break; /* no */ 1314 if (i == css) 1315 break; /* yes */ 1316 } 1317 1318 if (cs2 < top) { /* found one */ 1319 freeset(p, cs); 1320 cs = cs2; 1321 } 1322 1323 return((int)(cs - p->g->sets)); 1324 } 1325 1326 /* 1327 - firstch - return first character in a set (which must have at least one) 1328 == static int firstch(struct parse *p, cset *cs); 1329 */ 1330 static int /* character; there is no "none" value */ 1331 firstch( 1332 struct parse *p, 1333 cset *cs) 1334 { 1335 int i; 1336 size_t css; 1337 1338 _DIAGASSERT(p != NULL); 1339 _DIAGASSERT(cs != NULL); 1340 1341 css = (size_t)p->g->csetsize; 1342 1343 for (i = 0; i < css; i++) 1344 if (CHIN(cs, i)) 1345 return((char)i); 1346 assert(never); 1347 return(0); /* arbitrary */ 1348 } 1349 1350 /* 1351 - nch - number of characters in a set 1352 == static int nch(struct parse *p, cset *cs); 1353 */ 1354 static int 1355 nch( 1356 struct parse *p, 1357 cset *cs) 1358 { 1359 int i; 1360 size_t css; 1361 int n = 0; 1362 1363 _DIAGASSERT(p != NULL); 1364 _DIAGASSERT(cs != NULL); 1365 1366 css = (size_t)p->g->csetsize; 1367 1368 for (i = 0; i < css; i++) 1369 if (CHIN(cs, i)) 1370 n++; 1371 return(n); 1372 } 1373 1374 /* 1375 - mcadd - add a collating element to a cset 1376 == static void mcadd(struct parse *p, cset *cs, \ 1377 == char *cp); 1378 */ 1379 static void 1380 mcadd( 1381 struct parse *p, 1382 cset *cs, 1383 const char *cp) 1384 { 1385 size_t oldend; 1386 1387 _DIAGASSERT(p != NULL); 1388 _DIAGASSERT(cs != NULL); 1389 _DIAGASSERT(cp != NULL); 1390 1391 oldend = cs->smultis; 1392 1393 cs->smultis += strlen(cp) + 1; 1394 if (cs->multis == NULL) 1395 cs->multis = malloc(cs->smultis); 1396 else 1397 cs->multis = realloc(cs->multis, cs->smultis); 1398 if (cs->multis == NULL) { 1399 SETERROR(REG_ESPACE); 1400 return; 1401 } 1402 1403 (void) strcpy(cs->multis + oldend - 1, cp); 1404 cs->multis[cs->smultis - 1] = '\0'; 1405 } 1406 1407 #if 0 1408 /* 1409 - mcsub - subtract a collating element from a cset 1410 == static void mcsub(cset *cs, char *cp); 1411 */ 1412 static void 1413 mcsub( 1414 cset *cs, 1415 char *cp) 1416 { 1417 char *fp; 1418 size_t len; 1419 1420 _DIAGASSERT(cs != NULL); 1421 _DIAGASSERT(cp != NULL); 1422 1423 fp = mcfind(cs, cp); 1424 len = strlen(fp); 1425 1426 assert(fp != NULL); 1427 (void) memmove(fp, fp + len + 1, 1428 cs->smultis - (fp + len + 1 - cs->multis)); 1429 cs->smultis -= len; 1430 1431 if (cs->smultis == 0) { 1432 free(cs->multis); 1433 cs->multis = NULL; 1434 return; 1435 } 1436 1437 cs->multis = realloc(cs->multis, cs->smultis); 1438 assert(cs->multis != NULL); 1439 } 1440 1441 /* 1442 - mcin - is a collating element in a cset? 1443 == static int mcin(cset *cs, char *cp); 1444 */ 1445 static int 1446 mcin( 1447 cset *cs, 1448 char *cp) 1449 { 1450 1451 _DIAGASSERT(cs != NULL); 1452 _DIAGASSERT(cp != NULL); 1453 1454 return(mcfind(cs, cp) != NULL); 1455 } 1456 1457 /* 1458 - mcfind - find a collating element in a cset 1459 == static char *mcfind(cset *cs, char *cp); 1460 */ 1461 static char * 1462 mcfind( 1463 cset *cs, 1464 char *cp) 1465 { 1466 char *p; 1467 1468 _DIAGASSERT(cs != NULL); 1469 _DIAGASSERT(cp != NULL); 1470 1471 if (cs->multis == NULL) 1472 return(NULL); 1473 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1474 if (strcmp(cp, p) == 0) 1475 return(p); 1476 return(NULL); 1477 } 1478 #endif 1479 1480 /* 1481 - mcinvert - invert the list of collating elements in a cset 1482 == static void mcinvert(struct parse *p, cset *cs); 1483 * 1484 * This would have to know the set of possibilities. Implementation 1485 * is deferred. 1486 */ 1487 /* ARGSUSED */ 1488 static void 1489 mcinvert( 1490 struct parse *p, 1491 cset *cs) 1492 { 1493 1494 _DIAGASSERT(p != NULL); 1495 _DIAGASSERT(cs != NULL); 1496 1497 assert(cs->multis == NULL); /* xxx */ 1498 } 1499 1500 /* 1501 - mccase - add case counterparts of the list of collating elements in a cset 1502 == static void mccase(struct parse *p, cset *cs); 1503 * 1504 * This would have to know the set of possibilities. Implementation 1505 * is deferred. 1506 */ 1507 /* ARGSUSED */ 1508 static void 1509 mccase( 1510 struct parse *p, 1511 cset *cs) 1512 { 1513 1514 _DIAGASSERT(p != NULL); 1515 _DIAGASSERT(cs != NULL); 1516 1517 assert(cs->multis == NULL); /* xxx */ 1518 } 1519 1520 /* 1521 - isinsets - is this character in any sets? 1522 == static int isinsets(struct re_guts *g, int c); 1523 */ 1524 static int /* predicate */ 1525 isinsets( 1526 struct re_guts *g, 1527 int c) 1528 { 1529 uch *col; 1530 int i; 1531 int ncols; 1532 unsigned uc = (unsigned char)c; 1533 1534 _DIAGASSERT(g != NULL); 1535 1536 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1537 1538 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1539 if (col[uc] != 0) 1540 return(1); 1541 return(0); 1542 } 1543 1544 /* 1545 - samesets - are these two characters in exactly the same sets? 1546 == static int samesets(struct re_guts *g, int c1, int c2); 1547 */ 1548 static int /* predicate */ 1549 samesets( 1550 struct re_guts *g, 1551 int c1, 1552 int c2) 1553 { 1554 uch *col; 1555 int i; 1556 int ncols; 1557 unsigned uc1 = (unsigned char)c1; 1558 unsigned uc2 = (unsigned char)c2; 1559 1560 _DIAGASSERT(g != NULL); 1561 1562 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1563 1564 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1565 if (col[uc1] != col[uc2]) 1566 return(0); 1567 return(1); 1568 } 1569 1570 /* 1571 - categorize - sort out character categories 1572 == static void categorize(struct parse *p, struct re_guts *g); 1573 */ 1574 static void 1575 categorize( 1576 struct parse *p, 1577 struct re_guts *g) 1578 { 1579 cat_t *cats; 1580 int c; 1581 int c2; 1582 cat_t cat; 1583 1584 _DIAGASSERT(p != NULL); 1585 _DIAGASSERT(g != NULL); 1586 1587 cats = g->categories; 1588 1589 /* avoid making error situations worse */ 1590 if (p->error != 0) 1591 return; 1592 1593 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1594 if (cats[c] == 0 && isinsets(g, c)) { 1595 cat = g->ncategories++; 1596 cats[c] = cat; 1597 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1598 if (cats[c2] == 0 && samesets(g, c, c2)) 1599 cats[c2] = cat; 1600 } 1601 } 1602 1603 /* 1604 - dupl - emit a duplicate of a bunch of sops 1605 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1606 */ 1607 static sopno /* start of duplicate */ 1608 dupl( 1609 struct parse *p, 1610 sopno start, /* from here */ 1611 sopno finish) /* to this less one */ 1612 { 1613 sopno ret; 1614 sopno len = finish - start; 1615 1616 _DIAGASSERT(p != NULL); 1617 1618 ret = HERE(); 1619 1620 assert(finish >= start); 1621 if (len == 0) 1622 return(ret); 1623 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1624 assert(p->ssize >= p->slen + len); 1625 (void)memcpy(p->strip + p->slen, p->strip + start, 1626 (size_t)len * sizeof(sop)); 1627 p->slen += len; 1628 return(ret); 1629 } 1630 1631 /* 1632 - doemit - emit a strip operator 1633 == static void doemit(struct parse *p, sop op, size_t opnd); 1634 * 1635 * It might seem better to implement this as a macro with a function as 1636 * hard-case backup, but it's just too big and messy unless there are 1637 * some changes to the data structures. Maybe later. 1638 */ 1639 static void 1640 doemit( 1641 struct parse *p, 1642 sop op, 1643 sopno opnd) 1644 { 1645 1646 _DIAGASSERT(p != NULL); 1647 1648 /* avoid making error situations worse */ 1649 if (p->error != 0) 1650 return; 1651 1652 /* deal with oversize operands ("can't happen", more or less) */ 1653 assert(opnd < 1<<OPSHIFT); 1654 1655 /* deal with undersized strip */ 1656 if (p->slen >= p->ssize) 1657 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1658 assert(p->slen < p->ssize); 1659 1660 /* finally, it's all reduced to the easy case */ 1661 p->strip[p->slen++] = SOP(op, opnd); 1662 } 1663 1664 /* 1665 - doinsert - insert a sop into the strip 1666 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1667 */ 1668 static void 1669 doinsert( 1670 struct parse *p, 1671 sop op, 1672 sopno opnd, 1673 sopno pos) 1674 { 1675 sopno sn; 1676 sop s; 1677 int i; 1678 1679 _DIAGASSERT(p != NULL); 1680 1681 /* avoid making error situations worse */ 1682 if (p->error != 0) 1683 return; 1684 1685 sn = HERE(); 1686 EMIT(op, opnd); /* do checks, ensure space */ 1687 assert(HERE() == sn+1); 1688 s = p->strip[sn]; 1689 1690 /* adjust paren pointers */ 1691 assert(pos > 0); 1692 for (i = 1; i < NPAREN; i++) { 1693 if (p->pbegin[i] >= pos) { 1694 p->pbegin[i]++; 1695 } 1696 if (p->pend[i] >= pos) { 1697 p->pend[i]++; 1698 } 1699 } 1700 1701 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1702 p->strip[pos] = s; 1703 } 1704 1705 /* 1706 - dofwd - complete a forward reference 1707 == static void dofwd(struct parse *p, sopno pos, sop value); 1708 */ 1709 static void 1710 dofwd( 1711 struct parse *p, 1712 sopno pos, 1713 sopno value) 1714 { 1715 1716 _DIAGASSERT(p != NULL); 1717 1718 /* avoid making error situations worse */ 1719 if (p->error != 0) 1720 return; 1721 1722 assert(value < 1<<OPSHIFT); 1723 p->strip[pos] = OP(p->strip[pos]) | value; 1724 } 1725 1726 /* 1727 - enlarge - enlarge the strip 1728 == static void enlarge(struct parse *p, sopno size); 1729 */ 1730 static void 1731 enlarge( 1732 struct parse *p, 1733 sopno size) 1734 { 1735 sop *sp; 1736 1737 _DIAGASSERT(p != NULL); 1738 1739 if (p->ssize >= size) 1740 return; 1741 1742 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1743 if (sp == NULL) { 1744 SETERROR(REG_ESPACE); 1745 return; 1746 } 1747 p->strip = sp; 1748 p->ssize = size; 1749 } 1750 1751 /* 1752 - stripsnug - compact the strip 1753 == static void stripsnug(struct parse *p, struct re_guts *g); 1754 */ 1755 static void 1756 stripsnug( 1757 struct parse *p, 1758 struct re_guts *g) 1759 { 1760 1761 _DIAGASSERT(p != NULL); 1762 _DIAGASSERT(g != NULL); 1763 1764 g->nstates = p->slen; 1765 g->strip = realloc(p->strip, p->slen * sizeof(sop)); 1766 if (g->strip == NULL) { 1767 SETERROR(REG_ESPACE); 1768 g->strip = p->strip; 1769 } 1770 } 1771 1772 /* 1773 - findmust - fill in must and mlen with longest mandatory literal string 1774 == static void findmust(struct parse *p, struct re_guts *g); 1775 * 1776 * This algorithm could do fancy things like analyzing the operands of | 1777 * for common subsequences. Someday. This code is simple and finds most 1778 * of the interesting cases. 1779 * 1780 * Note that must and mlen got initialized during setup. 1781 */ 1782 static void 1783 findmust( 1784 struct parse *p, 1785 struct re_guts *g) 1786 { 1787 sop *scan; 1788 sop *start = NULL; 1789 sop *newstart = NULL; 1790 sopno newlen; 1791 sop s; 1792 char *cp; 1793 sopno i; 1794 1795 _DIAGASSERT(p != NULL); 1796 _DIAGASSERT(g != NULL); 1797 1798 /* avoid making error situations worse */ 1799 if (p->error != 0) 1800 return; 1801 1802 /* find the longest OCHAR sequence in strip */ 1803 newlen = 0; 1804 scan = g->strip + 1; 1805 do { 1806 s = *scan++; 1807 switch (OP(s)) { 1808 case OCHAR: /* sequence member */ 1809 if (newlen == 0) /* new sequence */ 1810 newstart = scan - 1; 1811 newlen++; 1812 break; 1813 case OPLUS_: /* things that don't break one */ 1814 case OLPAREN: 1815 case ORPAREN: 1816 break; 1817 case OQUEST_: /* things that must be skipped */ 1818 case OCH_: 1819 scan--; 1820 do { 1821 scan += OPND(s); 1822 s = *scan; 1823 /* assert() interferes w debug printouts */ 1824 if (OP(s) != O_QUEST && OP(s) != O_CH && 1825 OP(s) != OOR2) { 1826 g->iflags |= BAD; 1827 return; 1828 } 1829 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1830 /* FALLTHROUGH */ 1831 default: /* things that break a sequence */ 1832 if (newlen > g->mlen) { /* ends one */ 1833 start = newstart; 1834 g->mlen = newlen; 1835 } 1836 newlen = 0; 1837 break; 1838 } 1839 } while (OP(s) != OEND); 1840 1841 if (start == NULL) 1842 g->mlen = 0; 1843 1844 if (g->mlen == 0) /* there isn't one */ 1845 return; 1846 1847 /* turn it into a character string */ 1848 g->must = malloc((size_t)g->mlen + 1); 1849 if (g->must == NULL) { /* argh; just forget it */ 1850 g->mlen = 0; 1851 return; 1852 } 1853 cp = g->must; 1854 scan = start; 1855 for (i = g->mlen; i > 0; i--) { 1856 while (OP(s = *scan++) != OCHAR) 1857 continue; 1858 assert(cp < g->must + g->mlen); 1859 *cp++ = (char)OPND(s); 1860 } 1861 assert(cp == g->must + g->mlen); 1862 *cp++ = '\0'; /* just on general principles */ 1863 } 1864 1865 /* 1866 - pluscount - count + nesting 1867 == static sopno pluscount(struct parse *p, struct re_guts *g); 1868 */ 1869 static sopno /* nesting depth */ 1870 pluscount( 1871 struct parse *p, 1872 struct re_guts *g) 1873 { 1874 sop *scan; 1875 sop s; 1876 sopno plusnest = 0; 1877 sopno maxnest = 0; 1878 1879 _DIAGASSERT(p != NULL); 1880 _DIAGASSERT(g != NULL); 1881 1882 if (p->error != 0) 1883 return(0); /* there may not be an OEND */ 1884 1885 scan = g->strip + 1; 1886 do { 1887 s = *scan++; 1888 switch (OP(s)) { 1889 case OPLUS_: 1890 plusnest++; 1891 break; 1892 case O_PLUS: 1893 if (plusnest > maxnest) 1894 maxnest = plusnest; 1895 plusnest--; 1896 break; 1897 } 1898 } while (OP(s) != OEND); 1899 if (plusnest != 0) 1900 g->iflags |= BAD; 1901 return(maxnest); 1902 } 1903