1 /* $NetBSD: regcomp.c,v 1.13 1998/12/13 12:21:46 drochner Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 #include <sys/cdefs.h> 43 #if defined(LIBC_SCCS) && !defined(lint) 44 #if 0 45 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 46 #else 47 __RCSID("$NetBSD: regcomp.c,v 1.13 1998/12/13 12:21:46 drochner Exp $"); 48 #endif 49 #endif /* LIBC_SCCS and not lint */ 50 51 #include "namespace.h" 52 #include <sys/types.h> 53 #include <stdio.h> 54 #include <string.h> 55 #include <ctype.h> 56 #include <limits.h> 57 #include <stdlib.h> 58 #include <regex.h> 59 60 #ifdef __weak_alias 61 __weak_alias(regcomp,_regcomp); 62 #endif 63 64 #include "utils.h" 65 #include "regex2.h" 66 67 #include "cclass.h" 68 #include "cname.h" 69 70 /* 71 * parse structure, passed up and down to avoid global variables and 72 * other clumsinesses 73 */ 74 struct parse { 75 char *next; /* next character in RE */ 76 char *end; /* end of string (-> NUL normally) */ 77 int error; /* has an error been seen? */ 78 sop *strip; /* malloced strip */ 79 sopno ssize; /* malloced strip size (allocated) */ 80 sopno slen; /* malloced strip length (used) */ 81 int ncsalloc; /* number of csets allocated */ 82 struct re_guts *g; 83 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 84 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 85 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 86 }; 87 88 /* ========= begin header generated by ./mkh ========= */ 89 #ifdef __cplusplus 90 extern "C" { 91 #endif 92 93 /* === regcomp.c === */ 94 static void p_ere __P((struct parse *p, int stop)); 95 static void p_ere_exp __P((struct parse *p)); 96 static void p_str __P((struct parse *p)); 97 static void p_bre __P((struct parse *p, int end1, int end2)); 98 static int p_simp_re __P((struct parse *p, int starordinary)); 99 static int p_count __P((struct parse *p)); 100 static void p_bracket __P((struct parse *p)); 101 static void p_b_term __P((struct parse *p, cset *cs)); 102 static void p_b_cclass __P((struct parse *p, cset *cs)); 103 static void p_b_eclass __P((struct parse *p, cset *cs)); 104 static char p_b_symbol __P((struct parse *p)); 105 static char p_b_coll_elem __P((struct parse *p, int endc)); 106 static char othercase __P((int ch)); 107 static void bothcases __P((struct parse *p, int ch)); 108 static void ordinary __P((struct parse *p, int ch)); 109 static void nonnewline __P((struct parse *p)); 110 static void repeat __P((struct parse *p, sopno start, int from, int to)); 111 static int seterr __P((struct parse *p, int e)); 112 static cset *allocset __P((struct parse *p)); 113 static void freeset __P((struct parse *p, cset *cs)); 114 static int freezeset __P((struct parse *p, cset *cs)); 115 static int firstch __P((struct parse *p, cset *cs)); 116 static int nch __P((struct parse *p, cset *cs)); 117 static void mcadd __P((struct parse *p, cset *cs, const char *cp)); 118 #if 0 119 static void mcsub __P((cset *cs, char *cp)); 120 static int mcin __P((cset *cs, char *cp)); 121 static char *mcfind __P((cset *cs, char *cp)); 122 #endif 123 static void mcinvert __P((struct parse *p, cset *cs)); 124 static void mccase __P((struct parse *p, cset *cs)); 125 static int isinsets __P((struct re_guts *g, int c)); 126 static int samesets __P((struct re_guts *g, int c1, int c2)); 127 static void categorize __P((struct parse *p, struct re_guts *g)); 128 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 129 static void doemit __P((struct parse *p, sop op, sopno opnd)); 130 static void doinsert __P((struct parse *p, sop op, sopno opnd, sopno pos)); 131 static void dofwd __P((struct parse *p, sopno pos, sopno value)); 132 static void enlarge __P((struct parse *p, sopno size)); 133 static void stripsnug __P((struct parse *p, struct re_guts *g)); 134 static void findmust __P((struct parse *p, struct re_guts *g)); 135 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 136 137 #ifdef __cplusplus 138 } 139 #endif 140 /* ========= end header generated by ./mkh ========= */ 141 142 static char nuls[10]; /* place to point scanner in event of error */ 143 144 /* 145 * macros for use with parse structure 146 * BEWARE: these know that the parse structure is named `p' !!! 147 */ 148 #define PEEK() (*p->next) 149 #define PEEK2() (*(p->next+1)) 150 #define MORE() (p->next < p->end) 151 #define MORE2() (p->next+1 < p->end) 152 #define SEE(c) (MORE() && PEEK() == (c)) 153 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 154 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 155 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 156 #define NEXT() (p->next++) 157 #define NEXT2() (p->next += 2) 158 #define NEXTn(n) (p->next += (n)) 159 #define GETNEXT() (*p->next++) 160 #define SETERROR(e) seterr(p, (e)) 161 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 162 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 163 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 164 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 165 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 166 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 167 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 168 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 169 #define HERE() (p->slen) 170 #define THERE() (p->slen - 1) 171 #define THERETHERE() (p->slen - 2) 172 #define DROP(n) (p->slen -= (n)) 173 174 #ifndef NDEBUG 175 static int never = 0; /* for use in asserts; shuts lint up */ 176 #else 177 #define never 0 /* some <assert.h>s have bugs too */ 178 #endif 179 180 /* 181 - regcomp - interface for parser and compilation 182 = extern int regcomp(regex_t *, const char *, int); 183 = #define REG_BASIC 0000 184 = #define REG_EXTENDED 0001 185 = #define REG_ICASE 0002 186 = #define REG_NOSUB 0004 187 = #define REG_NEWLINE 0010 188 = #define REG_NOSPEC 0020 189 = #define REG_PEND 0040 190 = #define REG_DUMP 0200 191 */ 192 int /* 0 success, otherwise REG_something */ 193 regcomp(preg, pattern, cflags) 194 regex_t *preg; 195 const char *pattern; 196 int cflags; 197 { 198 struct parse pa; 199 struct re_guts *g; 200 struct parse *p = &pa; 201 int i; 202 size_t len; 203 #ifdef REDEBUG 204 # define GOODFLAGS(f) (f) 205 #else 206 # define GOODFLAGS(f) ((f)&~REG_DUMP) 207 #endif 208 209 cflags = GOODFLAGS(cflags); 210 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 211 return(REG_INVARG); 212 213 if (cflags®_PEND) { 214 if (preg->re_endp < pattern) 215 return(REG_INVARG); 216 len = preg->re_endp - pattern; 217 } else 218 len = strlen(pattern); 219 220 /* do the mallocs early so failure handling is easy */ 221 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 222 (NC-1)*sizeof(cat_t)); 223 if (g == NULL) 224 return(REG_ESPACE); 225 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 226 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 227 p->slen = 0; 228 if (p->strip == NULL) { 229 free(g); 230 return(REG_ESPACE); 231 } 232 233 /* set things up */ 234 p->g = g; 235 /* LINTED convenience; we do not modify it */ 236 p->next = (char *)pattern; 237 p->end = p->next + len; 238 p->error = 0; 239 p->ncsalloc = 0; 240 for (i = 0; i < NPAREN; i++) { 241 p->pbegin[i] = 0; 242 p->pend[i] = 0; 243 } 244 g->csetsize = NC; 245 g->sets = NULL; 246 g->setbits = NULL; 247 g->ncsets = 0; 248 g->cflags = cflags; 249 g->iflags = 0; 250 g->nbol = 0; 251 g->neol = 0; 252 g->must = NULL; 253 g->mlen = 0; 254 g->nsub = 0; 255 g->ncategories = 1; /* category 0 is "everything else" */ 256 g->categories = &g->catspace[-(CHAR_MIN)]; 257 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 258 g->backrefs = 0; 259 260 /* do it */ 261 EMIT(OEND, 0); 262 g->firststate = THERE(); 263 if (cflags®_EXTENDED) 264 p_ere(p, OUT); 265 else if (cflags®_NOSPEC) 266 p_str(p); 267 else 268 p_bre(p, OUT, OUT); 269 EMIT(OEND, 0); 270 g->laststate = THERE(); 271 272 /* tidy up loose ends and fill things in */ 273 categorize(p, g); 274 stripsnug(p, g); 275 findmust(p, g); 276 g->nplus = pluscount(p, g); 277 g->magic = MAGIC2; 278 preg->re_nsub = g->nsub; 279 preg->re_g = g; 280 preg->re_magic = MAGIC1; 281 #ifndef REDEBUG 282 /* not debugging, so can't rely on the assert() in regexec() */ 283 if (g->iflags&BAD) 284 SETERROR(REG_ASSERT); 285 #endif 286 287 /* win or lose, we're done */ 288 if (p->error != 0) /* lose */ 289 regfree(preg); 290 return(p->error); 291 } 292 293 /* 294 - p_ere - ERE parser top level, concatenation and alternation 295 == static void p_ere(struct parse *p, int stop); 296 */ 297 static void 298 p_ere(p, stop) 299 struct parse *p; 300 int stop; /* character this ERE should end at */ 301 { 302 char c; 303 sopno prevback = 0; /* pacify gcc */ 304 sopno prevfwd = 0; /* pacify gcc */ 305 sopno conc; 306 int first = 1; /* is this the first alternative? */ 307 308 for (;;) { 309 /* do a bunch of concatenated expressions */ 310 conc = HERE(); 311 while (MORE() && (c = PEEK()) != '|' && c != stop) 312 p_ere_exp(p); 313 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 314 315 if (!EAT('|')) 316 break; /* NOTE BREAK OUT */ 317 318 if (first) { 319 INSERT(OCH_, conc); /* offset is wrong */ 320 prevfwd = conc; 321 prevback = conc; 322 first = 0; 323 } 324 ASTERN(OOR1, prevback); 325 prevback = THERE(); 326 AHEAD(prevfwd); /* fix previous offset */ 327 prevfwd = HERE(); 328 EMIT(OOR2, 0); /* offset is very wrong */ 329 } 330 331 if (!first) { /* tail-end fixups */ 332 AHEAD(prevfwd); 333 ASTERN(O_CH, prevback); 334 } 335 336 assert(!MORE() || SEE(stop)); 337 } 338 339 /* 340 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 341 == static void p_ere_exp(struct parse *p); 342 */ 343 static void 344 p_ere_exp(p) 345 struct parse *p; 346 { 347 char c; 348 sopno pos; 349 int count; 350 int count2; 351 sopno subno; 352 int wascaret = 0; 353 354 assert(MORE()); /* caller should have ensured this */ 355 c = GETNEXT(); 356 357 pos = HERE(); 358 switch (c) { 359 case '(': 360 REQUIRE(MORE(), REG_EPAREN); 361 p->g->nsub++; 362 subno = p->g->nsub; 363 if (subno < NPAREN) 364 p->pbegin[subno] = HERE(); 365 EMIT(OLPAREN, subno); 366 if (!SEE(')')) 367 p_ere(p, ')'); 368 if (subno < NPAREN) { 369 p->pend[subno] = HERE(); 370 assert(p->pend[subno] != 0); 371 } 372 EMIT(ORPAREN, subno); 373 MUSTEAT(')', REG_EPAREN); 374 break; 375 #ifndef POSIX_MISTAKE 376 case ')': /* happens only if no current unmatched ( */ 377 /* 378 * You may ask, why the ifndef? Because I didn't notice 379 * this until slightly too late for 1003.2, and none of the 380 * other 1003.2 regular-expression reviewers noticed it at 381 * all. So an unmatched ) is legal POSIX, at least until 382 * we can get it fixed. 383 */ 384 SETERROR(REG_EPAREN); 385 break; 386 #endif 387 case '^': 388 EMIT(OBOL, 0); 389 p->g->iflags |= USEBOL; 390 p->g->nbol++; 391 wascaret = 1; 392 break; 393 case '$': 394 EMIT(OEOL, 0); 395 p->g->iflags |= USEEOL; 396 p->g->neol++; 397 break; 398 case '|': 399 SETERROR(REG_EMPTY); 400 break; 401 case '*': 402 case '+': 403 case '?': 404 SETERROR(REG_BADRPT); 405 break; 406 case '.': 407 if (p->g->cflags®_NEWLINE) 408 nonnewline(p); 409 else 410 EMIT(OANY, 0); 411 break; 412 case '[': 413 p_bracket(p); 414 break; 415 case '\\': 416 REQUIRE(MORE(), REG_EESCAPE); 417 c = GETNEXT(); 418 ordinary(p, c); 419 break; 420 case '{': /* okay as ordinary except if digit follows */ 421 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 422 /* FALLTHROUGH */ 423 default: 424 ordinary(p, c); 425 break; 426 } 427 428 if (!MORE()) 429 return; 430 c = PEEK(); 431 /* we call { a repetition if followed by a digit */ 432 if (!( c == '*' || c == '+' || c == '?' || 433 (c == '{' && MORE2() && isdigit(PEEK2())) )) 434 return; /* no repetition, we're done */ 435 NEXT(); 436 437 REQUIRE(!wascaret, REG_BADRPT); 438 switch (c) { 439 case '*': /* implemented as +? */ 440 /* this case does not require the (y|) trick, noKLUDGE */ 441 INSERT(OPLUS_, pos); 442 ASTERN(O_PLUS, pos); 443 INSERT(OQUEST_, pos); 444 ASTERN(O_QUEST, pos); 445 break; 446 case '+': 447 INSERT(OPLUS_, pos); 448 ASTERN(O_PLUS, pos); 449 break; 450 case '?': 451 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 452 INSERT(OCH_, pos); /* offset slightly wrong */ 453 ASTERN(OOR1, pos); /* this one's right */ 454 AHEAD(pos); /* fix the OCH_ */ 455 EMIT(OOR2, 0); /* offset very wrong... */ 456 AHEAD(THERE()); /* ...so fix it */ 457 ASTERN(O_CH, THERETHERE()); 458 break; 459 case '{': 460 count = p_count(p); 461 if (EAT(',')) { 462 if (isdigit(PEEK())) { 463 count2 = p_count(p); 464 REQUIRE(count <= count2, REG_BADBR); 465 } else /* single number with comma */ 466 count2 = INFINITY; 467 } else /* just a single number */ 468 count2 = count; 469 repeat(p, pos, count, count2); 470 if (!EAT('}')) { /* error heuristics */ 471 while (MORE() && PEEK() != '}') 472 NEXT(); 473 REQUIRE(MORE(), REG_EBRACE); 474 SETERROR(REG_BADBR); 475 } 476 break; 477 } 478 479 if (!MORE()) 480 return; 481 c = PEEK(); 482 if (!( c == '*' || c == '+' || c == '?' || 483 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 484 return; 485 SETERROR(REG_BADRPT); 486 } 487 488 /* 489 - p_str - string (no metacharacters) "parser" 490 == static void p_str(struct parse *p); 491 */ 492 static void 493 p_str(p) 494 struct parse *p; 495 { 496 REQUIRE(MORE(), REG_EMPTY); 497 while (MORE()) 498 ordinary(p, GETNEXT()); 499 } 500 501 /* 502 - p_bre - BRE parser top level, anchoring and concatenation 503 == static void p_bre(struct parse *p, int end1, \ 504 == int end2); 505 * Giving end1 as OUT essentially eliminates the end1/end2 check. 506 * 507 * This implementation is a bit of a kludge, in that a trailing $ is first 508 * taken as an ordinary character and then revised to be an anchor. The 509 * only undesirable side effect is that '$' gets included as a character 510 * category in such cases. This is fairly harmless; not worth fixing. 511 * The amount of lookahead needed to avoid this kludge is excessive. 512 */ 513 static void 514 p_bre(p, end1, end2) 515 struct parse *p; 516 int end1; /* first terminating character */ 517 int end2; /* second terminating character */ 518 { 519 sopno start = HERE(); 520 int first = 1; /* first subexpression? */ 521 int wasdollar = 0; 522 523 if (EAT('^')) { 524 EMIT(OBOL, 0); 525 p->g->iflags |= USEBOL; 526 p->g->nbol++; 527 } 528 while (MORE() && !SEETWO(end1, end2)) { 529 wasdollar = p_simp_re(p, first); 530 first = 0; 531 } 532 if (wasdollar) { /* oops, that was a trailing anchor */ 533 DROP(1); 534 EMIT(OEOL, 0); 535 p->g->iflags |= USEEOL; 536 p->g->neol++; 537 } 538 539 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 540 } 541 542 /* 543 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 544 == static int p_simp_re(struct parse *p, int starordinary); 545 */ 546 static int /* was the simple RE an unbackslashed $? */ 547 p_simp_re(p, starordinary) 548 struct parse *p; 549 int starordinary; /* is a leading * an ordinary character? */ 550 { 551 int c; 552 int count; 553 int count2; 554 sopno pos; 555 int i; 556 sopno subno; 557 # define BACKSL (1<<CHAR_BIT) 558 559 pos = HERE(); /* repetion op, if any, covers from here */ 560 561 assert(MORE()); /* caller should have ensured this */ 562 c = GETNEXT(); 563 if (c == '\\') { 564 REQUIRE(MORE(), REG_EESCAPE); 565 c = BACKSL | (unsigned char)GETNEXT(); 566 } 567 switch (c) { 568 case '.': 569 if (p->g->cflags®_NEWLINE) 570 nonnewline(p); 571 else 572 EMIT(OANY, 0); 573 break; 574 case '[': 575 p_bracket(p); 576 break; 577 case BACKSL|'{': 578 SETERROR(REG_BADRPT); 579 break; 580 case BACKSL|'(': 581 p->g->nsub++; 582 subno = p->g->nsub; 583 if (subno < NPAREN) 584 p->pbegin[subno] = HERE(); 585 EMIT(OLPAREN, subno); 586 /* the MORE here is an error heuristic */ 587 if (MORE() && !SEETWO('\\', ')')) 588 p_bre(p, '\\', ')'); 589 if (subno < NPAREN) { 590 p->pend[subno] = HERE(); 591 assert(p->pend[subno] != 0); 592 } 593 EMIT(ORPAREN, subno); 594 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 595 break; 596 case BACKSL|')': /* should not get here -- must be user */ 597 case BACKSL|'}': 598 SETERROR(REG_EPAREN); 599 break; 600 case BACKSL|'1': 601 case BACKSL|'2': 602 case BACKSL|'3': 603 case BACKSL|'4': 604 case BACKSL|'5': 605 case BACKSL|'6': 606 case BACKSL|'7': 607 case BACKSL|'8': 608 case BACKSL|'9': 609 i = (c&~BACKSL) - '0'; 610 assert(i < NPAREN); 611 if (p->pend[i] != 0) { 612 assert(i <= p->g->nsub); 613 EMIT(OBACK_, i); 614 assert(p->pbegin[i] != 0); 615 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 616 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 617 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 618 EMIT(O_BACK, i); 619 } else 620 SETERROR(REG_ESUBREG); 621 p->g->backrefs = 1; 622 break; 623 case '*': 624 REQUIRE(starordinary, REG_BADRPT); 625 /* FALLTHROUGH */ 626 default: 627 ordinary(p, c &~ BACKSL); 628 break; 629 } 630 631 if (EAT('*')) { /* implemented as +? */ 632 /* this case does not require the (y|) trick, noKLUDGE */ 633 INSERT(OPLUS_, pos); 634 ASTERN(O_PLUS, pos); 635 INSERT(OQUEST_, pos); 636 ASTERN(O_QUEST, pos); 637 } else if (EATTWO('\\', '{')) { 638 count = p_count(p); 639 if (EAT(',')) { 640 if (MORE() && isdigit(PEEK())) { 641 count2 = p_count(p); 642 REQUIRE(count <= count2, REG_BADBR); 643 } else /* single number with comma */ 644 count2 = INFINITY; 645 } else /* just a single number */ 646 count2 = count; 647 repeat(p, pos, count, count2); 648 if (!EATTWO('\\', '}')) { /* error heuristics */ 649 while (MORE() && !SEETWO('\\', '}')) 650 NEXT(); 651 REQUIRE(MORE(), REG_EBRACE); 652 SETERROR(REG_BADBR); 653 } 654 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 655 return(1); 656 657 return(0); 658 } 659 660 /* 661 - p_count - parse a repetition count 662 == static int p_count(struct parse *p); 663 */ 664 static int /* the value */ 665 p_count(p) 666 struct parse *p; 667 { 668 int count = 0; 669 int ndigits = 0; 670 671 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 672 count = count*10 + (GETNEXT() - '0'); 673 ndigits++; 674 } 675 676 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 677 return(count); 678 } 679 680 /* 681 - p_bracket - parse a bracketed character list 682 == static void p_bracket(struct parse *p); 683 * 684 * Note a significant property of this code: if the allocset() did SETERROR, 685 * no set operations are done. 686 */ 687 static void 688 p_bracket(p) 689 struct parse *p; 690 { 691 cset *cs = allocset(p); 692 int invert = 0; 693 694 /* Dept of Truly Sickening Special-Case Kludges */ 695 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 696 (size_t)6) == 0) { 697 EMIT(OBOW, 0); 698 NEXTn(6); 699 return; 700 } 701 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 702 (size_t)6) == 0) { 703 EMIT(OEOW, 0); 704 NEXTn(6); 705 return; 706 } 707 708 if (EAT('^')) 709 invert++; /* make note to invert set at end */ 710 if (EAT(']')) 711 CHadd(cs, ']'); 712 else if (EAT('-')) 713 CHadd(cs, '-'); 714 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 715 p_b_term(p, cs); 716 if (EAT('-')) 717 CHadd(cs, '-'); 718 MUSTEAT(']', REG_EBRACK); 719 720 if (p->error != 0) /* don't mess things up further */ 721 return; 722 723 if (p->g->cflags®_ICASE) { 724 int i; 725 int ci; 726 727 for (i = p->g->csetsize - 1; i >= 0; i--) 728 if (CHIN(cs, i) && isalpha(i)) { 729 ci = othercase(i); 730 if (ci != i) 731 CHadd(cs, ci); 732 } 733 if (cs->multis != NULL) 734 mccase(p, cs); 735 } 736 if (invert) { 737 int i; 738 739 for (i = p->g->csetsize - 1; i >= 0; i--) 740 if (CHIN(cs, i)) 741 CHsub(cs, i); 742 else 743 CHadd(cs, i); 744 if (p->g->cflags®_NEWLINE) 745 CHsub(cs, '\n'); 746 if (cs->multis != NULL) 747 mcinvert(p, cs); 748 } 749 750 assert(cs->multis == NULL); /* xxx */ 751 752 if (nch(p, cs) == 1) { /* optimize singleton sets */ 753 ordinary(p, firstch(p, cs)); 754 freeset(p, cs); 755 } else 756 EMIT(OANYOF, freezeset(p, cs)); 757 } 758 759 /* 760 - p_b_term - parse one term of a bracketed character list 761 == static void p_b_term(struct parse *p, cset *cs); 762 */ 763 static void 764 p_b_term(p, cs) 765 struct parse *p; 766 cset *cs; 767 { 768 char c; 769 char start, finish; 770 int i; 771 772 /* classify what we've got */ 773 switch ((MORE()) ? PEEK() : '\0') { 774 case '[': 775 c = (MORE2()) ? PEEK2() : '\0'; 776 break; 777 778 case '-': 779 SETERROR(REG_ERANGE); 780 return; /* NOTE RETURN */ 781 782 default: 783 c = '\0'; 784 break; 785 } 786 787 switch (c) { 788 case ':': /* character class */ 789 NEXT2(); 790 REQUIRE(MORE(), REG_EBRACK); 791 c = PEEK(); 792 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 793 p_b_cclass(p, cs); 794 REQUIRE(MORE(), REG_EBRACK); 795 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 796 break; 797 case '=': /* equivalence class */ 798 NEXT2(); 799 REQUIRE(MORE(), REG_EBRACK); 800 c = PEEK(); 801 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 802 p_b_eclass(p, cs); 803 REQUIRE(MORE(), REG_EBRACK); 804 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 805 break; 806 default: /* symbol, ordinary character, or range */ 807 /* xxx revision needed for multichar stuff */ 808 start = p_b_symbol(p); 809 if (SEE('-') && MORE2() && PEEK2() != ']') { 810 /* range */ 811 NEXT(); 812 if (EAT('-')) 813 finish = '-'; 814 else 815 finish = p_b_symbol(p); 816 } else 817 finish = start; 818 /* xxx what about signed chars here... */ 819 REQUIRE(start <= finish, REG_ERANGE); 820 for (i = start; i <= finish; i++) 821 CHadd(cs, i); 822 break; 823 } 824 } 825 826 /* 827 - p_b_cclass - parse a character-class name and deal with it 828 == static void p_b_cclass(struct parse *p, cset *cs); 829 */ 830 static void 831 p_b_cclass(p, cs) 832 struct parse *p; 833 cset *cs; 834 { 835 char *sp = p->next; 836 const struct cclass *cp; 837 size_t len; 838 const char *u; 839 char c; 840 841 while (MORE() && isalpha(PEEK())) 842 NEXT(); 843 len = p->next - sp; 844 for (cp = cclasses; cp->name != NULL; cp++) 845 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 846 break; 847 if (cp->name == NULL) { 848 /* oops, didn't find it */ 849 SETERROR(REG_ECTYPE); 850 return; 851 } 852 853 u = cp->chars; 854 while ((c = *u++) != '\0') 855 CHadd(cs, c); 856 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 857 MCadd(p, cs, u); 858 } 859 860 /* 861 - p_b_eclass - parse an equivalence-class name and deal with it 862 == static void p_b_eclass(struct parse *p, cset *cs); 863 * 864 * This implementation is incomplete. xxx 865 */ 866 static void 867 p_b_eclass(p, cs) 868 struct parse *p; 869 cset *cs; 870 { 871 char c; 872 873 c = p_b_coll_elem(p, '='); 874 CHadd(cs, c); 875 } 876 877 /* 878 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 879 == static char p_b_symbol(struct parse *p); 880 */ 881 static char /* value of symbol */ 882 p_b_symbol(p) 883 struct parse *p; 884 { 885 char value; 886 887 REQUIRE(MORE(), REG_EBRACK); 888 if (!EATTWO('[', '.')) 889 return(GETNEXT()); 890 891 /* collating symbol */ 892 value = p_b_coll_elem(p, '.'); 893 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 894 return(value); 895 } 896 897 /* 898 - p_b_coll_elem - parse a collating-element name and look it up 899 == static char p_b_coll_elem(struct parse *p, int endc); 900 */ 901 static char /* value of collating element */ 902 p_b_coll_elem(p, endc) 903 struct parse *p; 904 int endc; /* name ended by endc,']' */ 905 { 906 char *sp = p->next; 907 const struct cname *cp; 908 size_t len; 909 910 while (MORE() && !SEETWO(endc, ']')) 911 NEXT(); 912 if (!MORE()) { 913 SETERROR(REG_EBRACK); 914 return(0); 915 } 916 len = p->next - sp; 917 for (cp = cnames; cp->name != NULL; cp++) 918 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 919 return(cp->code); /* known name */ 920 if (len == 1) 921 return(*sp); /* single character */ 922 SETERROR(REG_ECOLLATE); /* neither */ 923 return(0); 924 } 925 926 /* 927 - othercase - return the case counterpart of an alphabetic 928 == static char othercase(int ch); 929 */ 930 static char /* if no counterpart, return ch */ 931 othercase(ch) 932 int ch; 933 { 934 assert(isalpha(ch)); 935 if (isupper(ch)) 936 return(tolower(ch)); 937 else if (islower(ch)) 938 return(toupper(ch)); 939 else /* peculiar, but could happen */ 940 return(ch); 941 } 942 943 /* 944 - bothcases - emit a dualcase version of a two-case character 945 == static void bothcases(struct parse *p, int ch); 946 * 947 * Boy, is this implementation ever a kludge... 948 */ 949 static void 950 bothcases(p, ch) 951 struct parse *p; 952 int ch; 953 { 954 char *oldnext = p->next; 955 char *oldend = p->end; 956 char bracket[3]; 957 958 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 959 p->next = bracket; 960 p->end = bracket+2; 961 bracket[0] = ch; 962 bracket[1] = ']'; 963 bracket[2] = '\0'; 964 p_bracket(p); 965 assert(p->next == bracket+2); 966 p->next = oldnext; 967 p->end = oldend; 968 } 969 970 /* 971 - ordinary - emit an ordinary character 972 == static void ordinary(struct parse *p, int ch); 973 */ 974 static void 975 ordinary(p, ch) 976 struct parse *p; 977 int ch; 978 { 979 cat_t *cap = p->g->categories; 980 981 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 982 bothcases(p, ch); 983 else { 984 EMIT(OCHAR, (unsigned char)ch); 985 if (cap[ch] == 0) 986 cap[ch] = p->g->ncategories++; 987 } 988 } 989 990 /* 991 - nonnewline - emit REG_NEWLINE version of OANY 992 == static void nonnewline(struct parse *p); 993 * 994 * Boy, is this implementation ever a kludge... 995 */ 996 static void 997 nonnewline(p) 998 struct parse *p; 999 { 1000 char *oldnext = p->next; 1001 char *oldend = p->end; 1002 char bracket[4]; 1003 1004 p->next = bracket; 1005 p->end = bracket+3; 1006 bracket[0] = '^'; 1007 bracket[1] = '\n'; 1008 bracket[2] = ']'; 1009 bracket[3] = '\0'; 1010 p_bracket(p); 1011 assert(p->next == bracket+3); 1012 p->next = oldnext; 1013 p->end = oldend; 1014 } 1015 1016 /* 1017 - repeat - generate code for a bounded repetition, recursively if needed 1018 == static void repeat(struct parse *p, sopno start, int from, int to); 1019 */ 1020 static void 1021 repeat(p, start, from, to) 1022 struct parse *p; 1023 sopno start; /* operand from here to end of strip */ 1024 int from; /* repeated from this number */ 1025 int to; /* to this number of times (maybe INFINITY) */ 1026 { 1027 sopno finish = HERE(); 1028 # define N 2 1029 # define INF 3 1030 # define REP(f, t) ((f)*8 + (t)) 1031 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1032 sopno copy; 1033 1034 if (p->error != 0) /* head off possible runaway recursion */ 1035 return; 1036 1037 assert(from <= to); 1038 1039 switch (REP(MAP(from), MAP(to))) { 1040 case REP(0, 0): /* must be user doing this */ 1041 DROP(finish-start); /* drop the operand */ 1042 break; 1043 case REP(0, 1): /* as x{1,1}? */ 1044 case REP(0, N): /* as x{1,n}? */ 1045 case REP(0, INF): /* as x{1,}? */ 1046 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1047 INSERT(OCH_, start); /* offset is wrong... */ 1048 repeat(p, start+1, 1, to); 1049 ASTERN(OOR1, start); 1050 AHEAD(start); /* ... fix it */ 1051 EMIT(OOR2, 0); 1052 AHEAD(THERE()); 1053 ASTERN(O_CH, THERETHERE()); 1054 break; 1055 case REP(1, 1): /* trivial case */ 1056 /* done */ 1057 break; 1058 case REP(1, N): /* as x?x{1,n-1} */ 1059 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1060 INSERT(OCH_, start); 1061 ASTERN(OOR1, start); 1062 AHEAD(start); 1063 EMIT(OOR2, 0); /* offset very wrong... */ 1064 AHEAD(THERE()); /* ...so fix it */ 1065 ASTERN(O_CH, THERETHERE()); 1066 copy = dupl(p, start+1, finish+1); 1067 assert(copy == finish+4); 1068 repeat(p, copy, 1, to-1); 1069 break; 1070 case REP(1, INF): /* as x+ */ 1071 INSERT(OPLUS_, start); 1072 ASTERN(O_PLUS, start); 1073 break; 1074 case REP(N, N): /* as xx{m-1,n-1} */ 1075 copy = dupl(p, start, finish); 1076 repeat(p, copy, from-1, to-1); 1077 break; 1078 case REP(N, INF): /* as xx{n-1,INF} */ 1079 copy = dupl(p, start, finish); 1080 repeat(p, copy, from-1, to); 1081 break; 1082 default: /* "can't happen" */ 1083 SETERROR(REG_ASSERT); /* just in case */ 1084 break; 1085 } 1086 } 1087 1088 /* 1089 - seterr - set an error condition 1090 == static int seterr(struct parse *p, int e); 1091 */ 1092 static int /* useless but makes type checking happy */ 1093 seterr(p, e) 1094 struct parse *p; 1095 int e; 1096 { 1097 if (p->error == 0) /* keep earliest error condition */ 1098 p->error = e; 1099 p->next = nuls; /* try to bring things to a halt */ 1100 p->end = nuls; 1101 return(0); /* make the return value well-defined */ 1102 } 1103 1104 /* 1105 - allocset - allocate a set of characters for [] 1106 == static cset *allocset(struct parse *p); 1107 */ 1108 static cset * 1109 allocset(p) 1110 struct parse *p; 1111 { 1112 int no = p->g->ncsets++; 1113 size_t nc; 1114 size_t nbytes; 1115 cset *cs; 1116 size_t css = (size_t)p->g->csetsize; 1117 int i; 1118 1119 if (no >= p->ncsalloc) { /* need another column of space */ 1120 p->ncsalloc += CHAR_BIT; 1121 nc = p->ncsalloc; 1122 assert(nc % CHAR_BIT == 0); 1123 nbytes = nc / CHAR_BIT * css; 1124 if (p->g->sets == NULL) 1125 p->g->sets = malloc(nc * sizeof(cset)); 1126 else 1127 p->g->sets = realloc(p->g->sets, nc * sizeof(cset)); 1128 if (p->g->setbits == NULL) 1129 p->g->setbits = malloc(nbytes); 1130 else { 1131 p->g->setbits = realloc(p->g->setbits, nbytes); 1132 /* xxx this isn't right if setbits is now NULL */ 1133 for (i = 0; i < no; i++) 1134 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1135 } 1136 if (p->g->sets != NULL && p->g->setbits != NULL) 1137 (void) memset((char *)p->g->setbits + (nbytes - css), 1138 0, css); 1139 else { 1140 no = 0; 1141 SETERROR(REG_ESPACE); 1142 /* caller's responsibility not to do set ops */ 1143 } 1144 } 1145 1146 assert(p->g->sets != NULL); /* xxx */ 1147 cs = &p->g->sets[no]; 1148 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1149 cs->mask = 1 << ((no) % CHAR_BIT); 1150 cs->hash = 0; 1151 cs->smultis = 0; 1152 cs->multis = NULL; 1153 1154 return(cs); 1155 } 1156 1157 /* 1158 - freeset - free a now-unused set 1159 == static void freeset(struct parse *p, cset *cs); 1160 */ 1161 static void 1162 freeset(p, cs) 1163 struct parse *p; 1164 cset *cs; 1165 { 1166 int i; 1167 cset *top = &p->g->sets[p->g->ncsets]; 1168 size_t css = (size_t)p->g->csetsize; 1169 1170 for (i = 0; i < css; i++) 1171 CHsub(cs, i); 1172 if (cs == top-1) /* recover only the easy case */ 1173 p->g->ncsets--; 1174 } 1175 1176 /* 1177 - freezeset - final processing on a set of characters 1178 == static int freezeset(struct parse *p, cset *cs); 1179 * 1180 * The main task here is merging identical sets. This is usually a waste 1181 * of time (although the hash code minimizes the overhead), but can win 1182 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1183 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1184 * the same value! 1185 */ 1186 static int /* set number */ 1187 freezeset(p, cs) 1188 struct parse *p; 1189 cset *cs; 1190 { 1191 uch h = cs->hash; 1192 int i; 1193 cset *top = &p->g->sets[p->g->ncsets]; 1194 cset *cs2; 1195 size_t css = (size_t)p->g->csetsize; 1196 1197 /* look for an earlier one which is the same */ 1198 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1199 if (cs2->hash == h && cs2 != cs) { 1200 /* maybe */ 1201 for (i = 0; i < css; i++) 1202 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1203 break; /* no */ 1204 if (i == css) 1205 break; /* yes */ 1206 } 1207 1208 if (cs2 < top) { /* found one */ 1209 freeset(p, cs); 1210 cs = cs2; 1211 } 1212 1213 return((int)(cs - p->g->sets)); 1214 } 1215 1216 /* 1217 - firstch - return first character in a set (which must have at least one) 1218 == static int firstch(struct parse *p, cset *cs); 1219 */ 1220 static int /* character; there is no "none" value */ 1221 firstch(p, cs) 1222 struct parse *p; 1223 cset *cs; 1224 { 1225 int i; 1226 size_t css = (size_t)p->g->csetsize; 1227 1228 for (i = 0; i < css; i++) 1229 if (CHIN(cs, i)) 1230 return((char)i); 1231 assert(never); 1232 return(0); /* arbitrary */ 1233 } 1234 1235 /* 1236 - nch - number of characters in a set 1237 == static int nch(struct parse *p, cset *cs); 1238 */ 1239 static int 1240 nch(p, cs) 1241 struct parse *p; 1242 cset *cs; 1243 { 1244 int i; 1245 size_t css = (size_t)p->g->csetsize; 1246 int n = 0; 1247 1248 for (i = 0; i < css; i++) 1249 if (CHIN(cs, i)) 1250 n++; 1251 return(n); 1252 } 1253 1254 /* 1255 - mcadd - add a collating element to a cset 1256 == static void mcadd(struct parse *p, cset *cs, \ 1257 == char *cp); 1258 */ 1259 static void 1260 mcadd(p, cs, cp) 1261 struct parse *p; 1262 cset *cs; 1263 const char *cp; 1264 { 1265 size_t oldend = cs->smultis; 1266 1267 cs->smultis += strlen(cp) + 1; 1268 if (cs->multis == NULL) 1269 cs->multis = malloc(cs->smultis); 1270 else 1271 cs->multis = realloc(cs->multis, cs->smultis); 1272 if (cs->multis == NULL) { 1273 SETERROR(REG_ESPACE); 1274 return; 1275 } 1276 1277 (void) strcpy(cs->multis + oldend - 1, cp); 1278 cs->multis[cs->smultis - 1] = '\0'; 1279 } 1280 1281 #if 0 1282 /* 1283 - mcsub - subtract a collating element from a cset 1284 == static void mcsub(cset *cs, char *cp); 1285 */ 1286 static void 1287 mcsub(cs, cp) 1288 cset *cs; 1289 char *cp; 1290 { 1291 char *fp = mcfind(cs, cp); 1292 size_t len = strlen(fp); 1293 1294 assert(fp != NULL); 1295 (void) memmove(fp, fp + len + 1, 1296 cs->smultis - (fp + len + 1 - cs->multis)); 1297 cs->smultis -= len; 1298 1299 if (cs->smultis == 0) { 1300 free(cs->multis); 1301 cs->multis = NULL; 1302 return; 1303 } 1304 1305 cs->multis = realloc(cs->multis, cs->smultis); 1306 assert(cs->multis != NULL); 1307 } 1308 1309 /* 1310 - mcin - is a collating element in a cset? 1311 == static int mcin(cset *cs, char *cp); 1312 */ 1313 static int 1314 mcin(cs, cp) 1315 cset *cs; 1316 char *cp; 1317 { 1318 return(mcfind(cs, cp) != NULL); 1319 } 1320 1321 /* 1322 - mcfind - find a collating element in a cset 1323 == static char *mcfind(cset *cs, char *cp); 1324 */ 1325 static char * 1326 mcfind(cs, cp) 1327 cset *cs; 1328 char *cp; 1329 { 1330 char *p; 1331 1332 if (cs->multis == NULL) 1333 return(NULL); 1334 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1335 if (strcmp(cp, p) == 0) 1336 return(p); 1337 return(NULL); 1338 } 1339 #endif 1340 1341 /* 1342 - mcinvert - invert the list of collating elements in a cset 1343 == static void mcinvert(struct parse *p, cset *cs); 1344 * 1345 * This would have to know the set of possibilities. Implementation 1346 * is deferred. 1347 */ 1348 /* ARGSUSED */ 1349 static void 1350 mcinvert(p, cs) 1351 struct parse *p; 1352 cset *cs; 1353 { 1354 assert(cs->multis == NULL); /* xxx */ 1355 } 1356 1357 /* 1358 - mccase - add case counterparts of the list of collating elements in a cset 1359 == static void mccase(struct parse *p, cset *cs); 1360 * 1361 * This would have to know the set of possibilities. Implementation 1362 * is deferred. 1363 */ 1364 /* ARGSUSED */ 1365 static void 1366 mccase(p, cs) 1367 struct parse *p; 1368 cset *cs; 1369 { 1370 assert(cs->multis == NULL); /* xxx */ 1371 } 1372 1373 /* 1374 - isinsets - is this character in any sets? 1375 == static int isinsets(struct re_guts *g, int c); 1376 */ 1377 static int /* predicate */ 1378 isinsets(g, c) 1379 struct re_guts *g; 1380 int c; 1381 { 1382 uch *col; 1383 int i; 1384 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1385 unsigned uc = (unsigned char)c; 1386 1387 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1388 if (col[uc] != 0) 1389 return(1); 1390 return(0); 1391 } 1392 1393 /* 1394 - samesets - are these two characters in exactly the same sets? 1395 == static int samesets(struct re_guts *g, int c1, int c2); 1396 */ 1397 static int /* predicate */ 1398 samesets(g, c1, c2) 1399 struct re_guts *g; 1400 int c1; 1401 int c2; 1402 { 1403 uch *col; 1404 int i; 1405 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1406 unsigned uc1 = (unsigned char)c1; 1407 unsigned uc2 = (unsigned char)c2; 1408 1409 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1410 if (col[uc1] != col[uc2]) 1411 return(0); 1412 return(1); 1413 } 1414 1415 /* 1416 - categorize - sort out character categories 1417 == static void categorize(struct parse *p, struct re_guts *g); 1418 */ 1419 static void 1420 categorize(p, g) 1421 struct parse *p; 1422 struct re_guts *g; 1423 { 1424 cat_t *cats = g->categories; 1425 int c; 1426 int c2; 1427 cat_t cat; 1428 1429 /* avoid making error situations worse */ 1430 if (p->error != 0) 1431 return; 1432 1433 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1434 if (cats[c] == 0 && isinsets(g, c)) { 1435 cat = g->ncategories++; 1436 cats[c] = cat; 1437 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1438 if (cats[c2] == 0 && samesets(g, c, c2)) 1439 cats[c2] = cat; 1440 } 1441 } 1442 1443 /* 1444 - dupl - emit a duplicate of a bunch of sops 1445 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1446 */ 1447 static sopno /* start of duplicate */ 1448 dupl(p, start, finish) 1449 struct parse *p; 1450 sopno start; /* from here */ 1451 sopno finish; /* to this less one */ 1452 { 1453 sopno ret = HERE(); 1454 sopno len = finish - start; 1455 1456 assert(finish >= start); 1457 if (len == 0) 1458 return(ret); 1459 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1460 assert(p->ssize >= p->slen + len); 1461 (void)memcpy(p->strip + p->slen, p->strip + start, 1462 (size_t)len * sizeof(sop)); 1463 p->slen += len; 1464 return(ret); 1465 } 1466 1467 /* 1468 - doemit - emit a strip operator 1469 == static void doemit(struct parse *p, sop op, size_t opnd); 1470 * 1471 * It might seem better to implement this as a macro with a function as 1472 * hard-case backup, but it's just too big and messy unless there are 1473 * some changes to the data structures. Maybe later. 1474 */ 1475 static void 1476 doemit(p, op, opnd) 1477 struct parse *p; 1478 sop op; 1479 sopno opnd; 1480 { 1481 /* avoid making error situations worse */ 1482 if (p->error != 0) 1483 return; 1484 1485 /* deal with oversize operands ("can't happen", more or less) */ 1486 assert(opnd < 1<<OPSHIFT); 1487 1488 /* deal with undersized strip */ 1489 if (p->slen >= p->ssize) 1490 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1491 assert(p->slen < p->ssize); 1492 1493 /* finally, it's all reduced to the easy case */ 1494 p->strip[p->slen++] = SOP(op, opnd); 1495 } 1496 1497 /* 1498 - doinsert - insert a sop into the strip 1499 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1500 */ 1501 static void 1502 doinsert(p, op, opnd, pos) 1503 struct parse *p; 1504 sop op; 1505 sopno opnd; 1506 sopno pos; 1507 { 1508 sopno sn; 1509 sop s; 1510 int i; 1511 1512 /* avoid making error situations worse */ 1513 if (p->error != 0) 1514 return; 1515 1516 sn = HERE(); 1517 EMIT(op, opnd); /* do checks, ensure space */ 1518 assert(HERE() == sn+1); 1519 s = p->strip[sn]; 1520 1521 /* adjust paren pointers */ 1522 assert(pos > 0); 1523 for (i = 1; i < NPAREN; i++) { 1524 if (p->pbegin[i] >= pos) { 1525 p->pbegin[i]++; 1526 } 1527 if (p->pend[i] >= pos) { 1528 p->pend[i]++; 1529 } 1530 } 1531 1532 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1533 p->strip[pos] = s; 1534 } 1535 1536 /* 1537 - dofwd - complete a forward reference 1538 == static void dofwd(struct parse *p, sopno pos, sop value); 1539 */ 1540 static void 1541 dofwd(p, pos, value) 1542 struct parse *p; 1543 sopno pos; 1544 sopno value; 1545 { 1546 /* avoid making error situations worse */ 1547 if (p->error != 0) 1548 return; 1549 1550 assert(value < 1<<OPSHIFT); 1551 p->strip[pos] = OP(p->strip[pos]) | value; 1552 } 1553 1554 /* 1555 - enlarge - enlarge the strip 1556 == static void enlarge(struct parse *p, sopno size); 1557 */ 1558 static void 1559 enlarge(p, size) 1560 struct parse *p; 1561 sopno size; 1562 { 1563 sop *sp; 1564 1565 if (p->ssize >= size) 1566 return; 1567 1568 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1569 if (sp == NULL) { 1570 SETERROR(REG_ESPACE); 1571 return; 1572 } 1573 p->strip = sp; 1574 p->ssize = size; 1575 } 1576 1577 /* 1578 - stripsnug - compact the strip 1579 == static void stripsnug(struct parse *p, struct re_guts *g); 1580 */ 1581 static void 1582 stripsnug(p, g) 1583 struct parse *p; 1584 struct re_guts *g; 1585 { 1586 g->nstates = p->slen; 1587 g->strip = realloc(p->strip, p->slen * sizeof(sop)); 1588 if (g->strip == NULL) { 1589 SETERROR(REG_ESPACE); 1590 g->strip = p->strip; 1591 } 1592 } 1593 1594 /* 1595 - findmust - fill in must and mlen with longest mandatory literal string 1596 == static void findmust(struct parse *p, struct re_guts *g); 1597 * 1598 * This algorithm could do fancy things like analyzing the operands of | 1599 * for common subsequences. Someday. This code is simple and finds most 1600 * of the interesting cases. 1601 * 1602 * Note that must and mlen got initialized during setup. 1603 */ 1604 static void 1605 findmust(p, g) 1606 struct parse *p; 1607 struct re_guts *g; 1608 { 1609 sop *scan; 1610 sop *start = NULL; 1611 sop *newstart = NULL; 1612 sopno newlen; 1613 sop s; 1614 char *cp; 1615 sopno i; 1616 1617 /* avoid making error situations worse */ 1618 if (p->error != 0) 1619 return; 1620 1621 /* find the longest OCHAR sequence in strip */ 1622 newlen = 0; 1623 scan = g->strip + 1; 1624 do { 1625 s = *scan++; 1626 switch (OP(s)) { 1627 case OCHAR: /* sequence member */ 1628 if (newlen == 0) /* new sequence */ 1629 newstart = scan - 1; 1630 newlen++; 1631 break; 1632 case OPLUS_: /* things that don't break one */ 1633 case OLPAREN: 1634 case ORPAREN: 1635 break; 1636 case OQUEST_: /* things that must be skipped */ 1637 case OCH_: 1638 scan--; 1639 do { 1640 scan += OPND(s); 1641 s = *scan; 1642 /* assert() interferes w debug printouts */ 1643 if (OP(s) != O_QUEST && OP(s) != O_CH && 1644 OP(s) != OOR2) { 1645 g->iflags |= BAD; 1646 return; 1647 } 1648 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1649 /* FALLTHROUGH */ 1650 default: /* things that break a sequence */ 1651 if (newlen > g->mlen) { /* ends one */ 1652 start = newstart; 1653 g->mlen = newlen; 1654 } 1655 newlen = 0; 1656 break; 1657 } 1658 } while (OP(s) != OEND); 1659 1660 if (g->mlen == 0) /* there isn't one */ 1661 return; 1662 1663 /* turn it into a character string */ 1664 g->must = malloc((size_t)g->mlen + 1); 1665 if (g->must == NULL) { /* argh; just forget it */ 1666 g->mlen = 0; 1667 return; 1668 } 1669 cp = g->must; 1670 scan = start; 1671 for (i = g->mlen; i > 0; i--) { 1672 while (OP(s = *scan++) != OCHAR) 1673 continue; 1674 assert(cp < g->must + g->mlen); 1675 *cp++ = (char)OPND(s); 1676 } 1677 assert(cp == g->must + g->mlen); 1678 *cp++ = '\0'; /* just on general principles */ 1679 } 1680 1681 /* 1682 - pluscount - count + nesting 1683 == static sopno pluscount(struct parse *p, struct re_guts *g); 1684 */ 1685 static sopno /* nesting depth */ 1686 pluscount(p, g) 1687 struct parse *p; 1688 struct re_guts *g; 1689 { 1690 sop *scan; 1691 sop s; 1692 sopno plusnest = 0; 1693 sopno maxnest = 0; 1694 1695 if (p->error != 0) 1696 return(0); /* there may not be an OEND */ 1697 1698 scan = g->strip + 1; 1699 do { 1700 s = *scan++; 1701 switch (OP(s)) { 1702 case OPLUS_: 1703 plusnest++; 1704 break; 1705 case O_PLUS: 1706 if (plusnest > maxnest) 1707 maxnest = plusnest; 1708 plusnest--; 1709 break; 1710 } 1711 } while (OP(s) != OEND); 1712 if (plusnest != 0) 1713 g->iflags |= BAD; 1714 return(maxnest); 1715 } 1716