1 /* $NetBSD: regcomp.c,v 1.7 1997/07/13 20:04:32 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 #include <sys/cdefs.h> 43 #if defined(LIBC_SCCS) && !defined(lint) 44 #if 0 45 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 46 #else 47 __RCSID("$NetBSD: regcomp.c,v 1.7 1997/07/13 20:04:32 christos Exp $"); 48 #endif 49 #endif /* LIBC_SCCS and not lint */ 50 51 #include <sys/types.h> 52 #include <stdio.h> 53 #include <string.h> 54 #include <ctype.h> 55 #include <limits.h> 56 #include <stdlib.h> 57 #include <regex.h> 58 59 #include "utils.h" 60 #include "regex2.h" 61 62 #include "cclass.h" 63 #include "cname.h" 64 65 /* 66 * parse structure, passed up and down to avoid global variables and 67 * other clumsinesses 68 */ 69 struct parse { 70 char *next; /* next character in RE */ 71 char *end; /* end of string (-> NUL normally) */ 72 int error; /* has an error been seen? */ 73 sop *strip; /* malloced strip */ 74 sopno ssize; /* malloced strip size (allocated) */ 75 sopno slen; /* malloced strip length (used) */ 76 int ncsalloc; /* number of csets allocated */ 77 struct re_guts *g; 78 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 79 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 80 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 81 }; 82 83 /* ========= begin header generated by ./mkh ========= */ 84 #ifdef __cplusplus 85 extern "C" { 86 #endif 87 88 /* === regcomp.c === */ 89 static void p_ere __P((struct parse *p, int stop)); 90 static void p_ere_exp __P((struct parse *p)); 91 static void p_str __P((struct parse *p)); 92 static void p_bre __P((struct parse *p, int end1, int end2)); 93 static int p_simp_re __P((struct parse *p, int starordinary)); 94 static int p_count __P((struct parse *p)); 95 static void p_bracket __P((struct parse *p)); 96 static void p_b_term __P((struct parse *p, cset *cs)); 97 static void p_b_cclass __P((struct parse *p, cset *cs)); 98 static void p_b_eclass __P((struct parse *p, cset *cs)); 99 static char p_b_symbol __P((struct parse *p)); 100 static char p_b_coll_elem __P((struct parse *p, int endc)); 101 static char othercase __P((int ch)); 102 static void bothcases __P((struct parse *p, int ch)); 103 static void ordinary __P((struct parse *p, int ch)); 104 static void nonnewline __P((struct parse *p)); 105 static void repeat __P((struct parse *p, sopno start, int from, int to)); 106 static int seterr __P((struct parse *p, int e)); 107 static cset *allocset __P((struct parse *p)); 108 static void freeset __P((struct parse *p, cset *cs)); 109 static int freezeset __P((struct parse *p, cset *cs)); 110 static int firstch __P((struct parse *p, cset *cs)); 111 static int nch __P((struct parse *p, cset *cs)); 112 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 113 #if 0 114 static void mcsub __P((cset *cs, char *cp)); 115 static int mcin __P((cset *cs, char *cp)); 116 static char *mcfind __P((cset *cs, char *cp)); 117 #endif 118 static void mcinvert __P((struct parse *p, cset *cs)); 119 static void mccase __P((struct parse *p, cset *cs)); 120 static int isinsets __P((struct re_guts *g, int c)); 121 static int samesets __P((struct re_guts *g, int c1, int c2)); 122 static void categorize __P((struct parse *p, struct re_guts *g)); 123 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 124 static void doemit __P((struct parse *p, sop op, size_t opnd)); 125 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 126 static void dofwd __P((struct parse *p, sopno pos, sop value)); 127 static void enlarge __P((struct parse *p, sopno size)); 128 static void stripsnug __P((struct parse *p, struct re_guts *g)); 129 static void findmust __P((struct parse *p, struct re_guts *g)); 130 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 131 132 #ifdef __cplusplus 133 } 134 #endif 135 /* ========= end header generated by ./mkh ========= */ 136 137 static char nuls[10]; /* place to point scanner in event of error */ 138 139 /* 140 * macros for use with parse structure 141 * BEWARE: these know that the parse structure is named `p' !!! 142 */ 143 #define PEEK() (*p->next) 144 #define PEEK2() (*(p->next+1)) 145 #define MORE() (p->next < p->end) 146 #define MORE2() (p->next+1 < p->end) 147 #define SEE(c) (MORE() && PEEK() == (c)) 148 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 149 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 150 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 151 #define NEXT() (p->next++) 152 #define NEXT2() (p->next += 2) 153 #define NEXTn(n) (p->next += (n)) 154 #define GETNEXT() (*p->next++) 155 #define SETERROR(e) seterr(p, (e)) 156 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 157 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 158 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 159 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 160 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 161 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 162 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 163 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 164 #define HERE() (p->slen) 165 #define THERE() (p->slen - 1) 166 #define THERETHERE() (p->slen - 2) 167 #define DROP(n) (p->slen -= (n)) 168 169 #ifndef NDEBUG 170 static int never = 0; /* for use in asserts; shuts lint up */ 171 #else 172 #define never 0 /* some <assert.h>s have bugs too */ 173 #endif 174 175 /* 176 - regcomp - interface for parser and compilation 177 = extern int regcomp(regex_t *, const char *, int); 178 = #define REG_BASIC 0000 179 = #define REG_EXTENDED 0001 180 = #define REG_ICASE 0002 181 = #define REG_NOSUB 0004 182 = #define REG_NEWLINE 0010 183 = #define REG_NOSPEC 0020 184 = #define REG_PEND 0040 185 = #define REG_DUMP 0200 186 */ 187 int /* 0 success, otherwise REG_something */ 188 regcomp(preg, pattern, cflags) 189 regex_t *preg; 190 const char *pattern; 191 int cflags; 192 { 193 struct parse pa; 194 register struct re_guts *g; 195 register struct parse *p = &pa; 196 register int i; 197 register size_t len; 198 #ifdef REDEBUG 199 # define GOODFLAGS(f) (f) 200 #else 201 # define GOODFLAGS(f) ((f)&~REG_DUMP) 202 #endif 203 204 cflags = GOODFLAGS(cflags); 205 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 206 return(REG_INVARG); 207 208 if (cflags®_PEND) { 209 if (preg->re_endp < pattern) 210 return(REG_INVARG); 211 len = preg->re_endp - pattern; 212 } else 213 len = strlen((char *)pattern); 214 215 /* do the mallocs early so failure handling is easy */ 216 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 217 (NC-1)*sizeof(cat_t)); 218 if (g == NULL) 219 return(REG_ESPACE); 220 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 221 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 222 p->slen = 0; 223 if (p->strip == NULL) { 224 free((char *)g); 225 return(REG_ESPACE); 226 } 227 228 /* set things up */ 229 p->g = g; 230 p->next = (char *)pattern; /* convenience; we do not modify it */ 231 p->end = p->next + len; 232 p->error = 0; 233 p->ncsalloc = 0; 234 for (i = 0; i < NPAREN; i++) { 235 p->pbegin[i] = 0; 236 p->pend[i] = 0; 237 } 238 g->csetsize = NC; 239 g->sets = NULL; 240 g->setbits = NULL; 241 g->ncsets = 0; 242 g->cflags = cflags; 243 g->iflags = 0; 244 g->nbol = 0; 245 g->neol = 0; 246 g->must = NULL; 247 g->mlen = 0; 248 g->nsub = 0; 249 g->ncategories = 1; /* category 0 is "everything else" */ 250 g->categories = &g->catspace[-(CHAR_MIN)]; 251 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 252 g->backrefs = 0; 253 254 /* do it */ 255 EMIT(OEND, 0); 256 g->firststate = THERE(); 257 if (cflags®_EXTENDED) 258 p_ere(p, OUT); 259 else if (cflags®_NOSPEC) 260 p_str(p); 261 else 262 p_bre(p, OUT, OUT); 263 EMIT(OEND, 0); 264 g->laststate = THERE(); 265 266 /* tidy up loose ends and fill things in */ 267 categorize(p, g); 268 stripsnug(p, g); 269 findmust(p, g); 270 g->nplus = pluscount(p, g); 271 g->magic = MAGIC2; 272 preg->re_nsub = g->nsub; 273 preg->re_g = g; 274 preg->re_magic = MAGIC1; 275 #ifndef REDEBUG 276 /* not debugging, so can't rely on the assert() in regexec() */ 277 if (g->iflags&BAD) 278 SETERROR(REG_ASSERT); 279 #endif 280 281 /* win or lose, we're done */ 282 if (p->error != 0) /* lose */ 283 regfree(preg); 284 return(p->error); 285 } 286 287 /* 288 - p_ere - ERE parser top level, concatenation and alternation 289 == static void p_ere(register struct parse *p, int stop); 290 */ 291 static void 292 p_ere(p, stop) 293 register struct parse *p; 294 int stop; /* character this ERE should end at */ 295 { 296 register char c; 297 register sopno prevback = 0; /* pacify gcc */ 298 register sopno prevfwd = 0; /* pacify gcc */ 299 register sopno conc; 300 register int first = 1; /* is this the first alternative? */ 301 302 for (;;) { 303 /* do a bunch of concatenated expressions */ 304 conc = HERE(); 305 while (MORE() && (c = PEEK()) != '|' && c != stop) 306 p_ere_exp(p); 307 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 308 309 if (!EAT('|')) 310 break; /* NOTE BREAK OUT */ 311 312 if (first) { 313 INSERT(OCH_, conc); /* offset is wrong */ 314 prevfwd = conc; 315 prevback = conc; 316 first = 0; 317 } 318 ASTERN(OOR1, prevback); 319 prevback = THERE(); 320 AHEAD(prevfwd); /* fix previous offset */ 321 prevfwd = HERE(); 322 EMIT(OOR2, 0); /* offset is very wrong */ 323 } 324 325 if (!first) { /* tail-end fixups */ 326 AHEAD(prevfwd); 327 ASTERN(O_CH, prevback); 328 } 329 330 assert(!MORE() || SEE(stop)); 331 } 332 333 /* 334 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 335 == static void p_ere_exp(register struct parse *p); 336 */ 337 static void 338 p_ere_exp(p) 339 register struct parse *p; 340 { 341 register char c; 342 register sopno pos; 343 register int count; 344 register int count2; 345 register sopno subno; 346 int wascaret = 0; 347 348 assert(MORE()); /* caller should have ensured this */ 349 c = GETNEXT(); 350 351 pos = HERE(); 352 switch (c) { 353 case '(': 354 REQUIRE(MORE(), REG_EPAREN); 355 p->g->nsub++; 356 subno = p->g->nsub; 357 if (subno < NPAREN) 358 p->pbegin[subno] = HERE(); 359 EMIT(OLPAREN, subno); 360 if (!SEE(')')) 361 p_ere(p, ')'); 362 if (subno < NPAREN) { 363 p->pend[subno] = HERE(); 364 assert(p->pend[subno] != 0); 365 } 366 EMIT(ORPAREN, subno); 367 MUSTEAT(')', REG_EPAREN); 368 break; 369 #ifndef POSIX_MISTAKE 370 case ')': /* happens only if no current unmatched ( */ 371 /* 372 * You may ask, why the ifndef? Because I didn't notice 373 * this until slightly too late for 1003.2, and none of the 374 * other 1003.2 regular-expression reviewers noticed it at 375 * all. So an unmatched ) is legal POSIX, at least until 376 * we can get it fixed. 377 */ 378 SETERROR(REG_EPAREN); 379 break; 380 #endif 381 case '^': 382 EMIT(OBOL, 0); 383 p->g->iflags |= USEBOL; 384 p->g->nbol++; 385 wascaret = 1; 386 break; 387 case '$': 388 EMIT(OEOL, 0); 389 p->g->iflags |= USEEOL; 390 p->g->neol++; 391 break; 392 case '|': 393 SETERROR(REG_EMPTY); 394 break; 395 case '*': 396 case '+': 397 case '?': 398 SETERROR(REG_BADRPT); 399 break; 400 case '.': 401 if (p->g->cflags®_NEWLINE) 402 nonnewline(p); 403 else 404 EMIT(OANY, 0); 405 break; 406 case '[': 407 p_bracket(p); 408 break; 409 case '\\': 410 REQUIRE(MORE(), REG_EESCAPE); 411 c = GETNEXT(); 412 ordinary(p, c); 413 break; 414 case '{': /* okay as ordinary except if digit follows */ 415 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 416 /* FALLTHROUGH */ 417 default: 418 ordinary(p, c); 419 break; 420 } 421 422 if (!MORE()) 423 return; 424 c = PEEK(); 425 /* we call { a repetition if followed by a digit */ 426 if (!( c == '*' || c == '+' || c == '?' || 427 (c == '{' && MORE2() && isdigit(PEEK2())) )) 428 return; /* no repetition, we're done */ 429 NEXT(); 430 431 REQUIRE(!wascaret, REG_BADRPT); 432 switch (c) { 433 case '*': /* implemented as +? */ 434 /* this case does not require the (y|) trick, noKLUDGE */ 435 INSERT(OPLUS_, pos); 436 ASTERN(O_PLUS, pos); 437 INSERT(OQUEST_, pos); 438 ASTERN(O_QUEST, pos); 439 break; 440 case '+': 441 INSERT(OPLUS_, pos); 442 ASTERN(O_PLUS, pos); 443 break; 444 case '?': 445 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 446 INSERT(OCH_, pos); /* offset slightly wrong */ 447 ASTERN(OOR1, pos); /* this one's right */ 448 AHEAD(pos); /* fix the OCH_ */ 449 EMIT(OOR2, 0); /* offset very wrong... */ 450 AHEAD(THERE()); /* ...so fix it */ 451 ASTERN(O_CH, THERETHERE()); 452 break; 453 case '{': 454 count = p_count(p); 455 if (EAT(',')) { 456 if (isdigit(PEEK())) { 457 count2 = p_count(p); 458 REQUIRE(count <= count2, REG_BADBR); 459 } else /* single number with comma */ 460 count2 = INFINITY; 461 } else /* just a single number */ 462 count2 = count; 463 repeat(p, pos, count, count2); 464 if (!EAT('}')) { /* error heuristics */ 465 while (MORE() && PEEK() != '}') 466 NEXT(); 467 REQUIRE(MORE(), REG_EBRACE); 468 SETERROR(REG_BADBR); 469 } 470 break; 471 } 472 473 if (!MORE()) 474 return; 475 c = PEEK(); 476 if (!( c == '*' || c == '+' || c == '?' || 477 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 478 return; 479 SETERROR(REG_BADRPT); 480 } 481 482 /* 483 - p_str - string (no metacharacters) "parser" 484 == static void p_str(register struct parse *p); 485 */ 486 static void 487 p_str(p) 488 register struct parse *p; 489 { 490 REQUIRE(MORE(), REG_EMPTY); 491 while (MORE()) 492 ordinary(p, GETNEXT()); 493 } 494 495 /* 496 - p_bre - BRE parser top level, anchoring and concatenation 497 == static void p_bre(register struct parse *p, register int end1, \ 498 == register int end2); 499 * Giving end1 as OUT essentially eliminates the end1/end2 check. 500 * 501 * This implementation is a bit of a kludge, in that a trailing $ is first 502 * taken as an ordinary character and then revised to be an anchor. The 503 * only undesirable side effect is that '$' gets included as a character 504 * category in such cases. This is fairly harmless; not worth fixing. 505 * The amount of lookahead needed to avoid this kludge is excessive. 506 */ 507 static void 508 p_bre(p, end1, end2) 509 register struct parse *p; 510 register int end1; /* first terminating character */ 511 register int end2; /* second terminating character */ 512 { 513 register sopno start = HERE(); 514 register int first = 1; /* first subexpression? */ 515 register int wasdollar = 0; 516 517 if (EAT('^')) { 518 EMIT(OBOL, 0); 519 p->g->iflags |= USEBOL; 520 p->g->nbol++; 521 } 522 while (MORE() && !SEETWO(end1, end2)) { 523 wasdollar = p_simp_re(p, first); 524 first = 0; 525 } 526 if (wasdollar) { /* oops, that was a trailing anchor */ 527 DROP(1); 528 EMIT(OEOL, 0); 529 p->g->iflags |= USEEOL; 530 p->g->neol++; 531 } 532 533 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 534 } 535 536 /* 537 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 538 == static int p_simp_re(register struct parse *p, int starordinary); 539 */ 540 static int /* was the simple RE an unbackslashed $? */ 541 p_simp_re(p, starordinary) 542 register struct parse *p; 543 int starordinary; /* is a leading * an ordinary character? */ 544 { 545 register int c; 546 register int count; 547 register int count2; 548 register sopno pos; 549 register int i; 550 register sopno subno; 551 # define BACKSL (1<<CHAR_BIT) 552 553 pos = HERE(); /* repetion op, if any, covers from here */ 554 555 assert(MORE()); /* caller should have ensured this */ 556 c = GETNEXT(); 557 if (c == '\\') { 558 REQUIRE(MORE(), REG_EESCAPE); 559 c = BACKSL | (unsigned char)GETNEXT(); 560 } 561 switch (c) { 562 case '.': 563 if (p->g->cflags®_NEWLINE) 564 nonnewline(p); 565 else 566 EMIT(OANY, 0); 567 break; 568 case '[': 569 p_bracket(p); 570 break; 571 case BACKSL|'{': 572 SETERROR(REG_BADRPT); 573 break; 574 case BACKSL|'(': 575 p->g->nsub++; 576 subno = p->g->nsub; 577 if (subno < NPAREN) 578 p->pbegin[subno] = HERE(); 579 EMIT(OLPAREN, subno); 580 /* the MORE here is an error heuristic */ 581 if (MORE() && !SEETWO('\\', ')')) 582 p_bre(p, '\\', ')'); 583 if (subno < NPAREN) { 584 p->pend[subno] = HERE(); 585 assert(p->pend[subno] != 0); 586 } 587 EMIT(ORPAREN, subno); 588 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 589 break; 590 case BACKSL|')': /* should not get here -- must be user */ 591 case BACKSL|'}': 592 SETERROR(REG_EPAREN); 593 break; 594 case BACKSL|'1': 595 case BACKSL|'2': 596 case BACKSL|'3': 597 case BACKSL|'4': 598 case BACKSL|'5': 599 case BACKSL|'6': 600 case BACKSL|'7': 601 case BACKSL|'8': 602 case BACKSL|'9': 603 i = (c&~BACKSL) - '0'; 604 assert(i < NPAREN); 605 if (p->pend[i] != 0) { 606 assert(i <= p->g->nsub); 607 EMIT(OBACK_, i); 608 assert(p->pbegin[i] != 0); 609 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 610 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 611 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 612 EMIT(O_BACK, i); 613 } else 614 SETERROR(REG_ESUBREG); 615 p->g->backrefs = 1; 616 break; 617 case '*': 618 REQUIRE(starordinary, REG_BADRPT); 619 /* FALLTHROUGH */ 620 default: 621 ordinary(p, c &~ BACKSL); 622 break; 623 } 624 625 if (EAT('*')) { /* implemented as +? */ 626 /* this case does not require the (y|) trick, noKLUDGE */ 627 INSERT(OPLUS_, pos); 628 ASTERN(O_PLUS, pos); 629 INSERT(OQUEST_, pos); 630 ASTERN(O_QUEST, pos); 631 } else if (EATTWO('\\', '{')) { 632 count = p_count(p); 633 if (EAT(',')) { 634 if (MORE() && isdigit(PEEK())) { 635 count2 = p_count(p); 636 REQUIRE(count <= count2, REG_BADBR); 637 } else /* single number with comma */ 638 count2 = INFINITY; 639 } else /* just a single number */ 640 count2 = count; 641 repeat(p, pos, count, count2); 642 if (!EATTWO('\\', '}')) { /* error heuristics */ 643 while (MORE() && !SEETWO('\\', '}')) 644 NEXT(); 645 REQUIRE(MORE(), REG_EBRACE); 646 SETERROR(REG_BADBR); 647 } 648 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 649 return(1); 650 651 return(0); 652 } 653 654 /* 655 - p_count - parse a repetition count 656 == static int p_count(register struct parse *p); 657 */ 658 static int /* the value */ 659 p_count(p) 660 register struct parse *p; 661 { 662 register int count = 0; 663 register int ndigits = 0; 664 665 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 666 count = count*10 + (GETNEXT() - '0'); 667 ndigits++; 668 } 669 670 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 671 return(count); 672 } 673 674 /* 675 - p_bracket - parse a bracketed character list 676 == static void p_bracket(register struct parse *p); 677 * 678 * Note a significant property of this code: if the allocset() did SETERROR, 679 * no set operations are done. 680 */ 681 static void 682 p_bracket(p) 683 register struct parse *p; 684 { 685 register cset *cs = allocset(p); 686 register int invert = 0; 687 688 /* Dept of Truly Sickening Special-Case Kludges */ 689 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 690 EMIT(OBOW, 0); 691 NEXTn(6); 692 return; 693 } 694 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 695 EMIT(OEOW, 0); 696 NEXTn(6); 697 return; 698 } 699 700 if (EAT('^')) 701 invert++; /* make note to invert set at end */ 702 if (EAT(']')) 703 CHadd(cs, ']'); 704 else if (EAT('-')) 705 CHadd(cs, '-'); 706 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 707 p_b_term(p, cs); 708 if (EAT('-')) 709 CHadd(cs, '-'); 710 MUSTEAT(']', REG_EBRACK); 711 712 if (p->error != 0) /* don't mess things up further */ 713 return; 714 715 if (p->g->cflags®_ICASE) { 716 register int i; 717 register int ci; 718 719 for (i = p->g->csetsize - 1; i >= 0; i--) 720 if (CHIN(cs, i) && isalpha(i)) { 721 ci = othercase(i); 722 if (ci != i) 723 CHadd(cs, ci); 724 } 725 if (cs->multis != NULL) 726 mccase(p, cs); 727 } 728 if (invert) { 729 register int i; 730 731 for (i = p->g->csetsize - 1; i >= 0; i--) 732 if (CHIN(cs, i)) 733 CHsub(cs, i); 734 else 735 CHadd(cs, i); 736 if (p->g->cflags®_NEWLINE) 737 CHsub(cs, '\n'); 738 if (cs->multis != NULL) 739 mcinvert(p, cs); 740 } 741 742 assert(cs->multis == NULL); /* xxx */ 743 744 if (nch(p, cs) == 1) { /* optimize singleton sets */ 745 ordinary(p, firstch(p, cs)); 746 freeset(p, cs); 747 } else 748 EMIT(OANYOF, freezeset(p, cs)); 749 } 750 751 /* 752 - p_b_term - parse one term of a bracketed character list 753 == static void p_b_term(register struct parse *p, register cset *cs); 754 */ 755 static void 756 p_b_term(p, cs) 757 register struct parse *p; 758 register cset *cs; 759 { 760 register char c; 761 register char start, finish; 762 register int i; 763 764 /* classify what we've got */ 765 switch ((MORE()) ? PEEK() : '\0') { 766 case '[': 767 c = (MORE2()) ? PEEK2() : '\0'; 768 break; 769 case '-': 770 SETERROR(REG_ERANGE); 771 return; /* NOTE RETURN */ 772 break; 773 default: 774 c = '\0'; 775 break; 776 } 777 778 switch (c) { 779 case ':': /* character class */ 780 NEXT2(); 781 REQUIRE(MORE(), REG_EBRACK); 782 c = PEEK(); 783 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 784 p_b_cclass(p, cs); 785 REQUIRE(MORE(), REG_EBRACK); 786 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 787 break; 788 case '=': /* equivalence class */ 789 NEXT2(); 790 REQUIRE(MORE(), REG_EBRACK); 791 c = PEEK(); 792 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 793 p_b_eclass(p, cs); 794 REQUIRE(MORE(), REG_EBRACK); 795 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 796 break; 797 default: /* symbol, ordinary character, or range */ 798 /* xxx revision needed for multichar stuff */ 799 start = p_b_symbol(p); 800 if (SEE('-') && MORE2() && PEEK2() != ']') { 801 /* range */ 802 NEXT(); 803 if (EAT('-')) 804 finish = '-'; 805 else 806 finish = p_b_symbol(p); 807 } else 808 finish = start; 809 /* xxx what about signed chars here... */ 810 REQUIRE(start <= finish, REG_ERANGE); 811 for (i = start; i <= finish; i++) 812 CHadd(cs, i); 813 break; 814 } 815 } 816 817 /* 818 - p_b_cclass - parse a character-class name and deal with it 819 == static void p_b_cclass(register struct parse *p, register cset *cs); 820 */ 821 static void 822 p_b_cclass(p, cs) 823 register struct parse *p; 824 register cset *cs; 825 { 826 register char *sp = p->next; 827 register struct cclass *cp; 828 register size_t len; 829 register char *u; 830 register char c; 831 832 while (MORE() && isalpha(PEEK())) 833 NEXT(); 834 len = p->next - sp; 835 for (cp = cclasses; cp->name != NULL; cp++) 836 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 837 break; 838 if (cp->name == NULL) { 839 /* oops, didn't find it */ 840 SETERROR(REG_ECTYPE); 841 return; 842 } 843 844 u = cp->chars; 845 while ((c = *u++) != '\0') 846 CHadd(cs, c); 847 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 848 MCadd(p, cs, u); 849 } 850 851 /* 852 - p_b_eclass - parse an equivalence-class name and deal with it 853 == static void p_b_eclass(register struct parse *p, register cset *cs); 854 * 855 * This implementation is incomplete. xxx 856 */ 857 static void 858 p_b_eclass(p, cs) 859 register struct parse *p; 860 register cset *cs; 861 { 862 register char c; 863 864 c = p_b_coll_elem(p, '='); 865 CHadd(cs, c); 866 } 867 868 /* 869 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 870 == static char p_b_symbol(register struct parse *p); 871 */ 872 static char /* value of symbol */ 873 p_b_symbol(p) 874 register struct parse *p; 875 { 876 register char value; 877 878 REQUIRE(MORE(), REG_EBRACK); 879 if (!EATTWO('[', '.')) 880 return(GETNEXT()); 881 882 /* collating symbol */ 883 value = p_b_coll_elem(p, '.'); 884 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 885 return(value); 886 } 887 888 /* 889 - p_b_coll_elem - parse a collating-element name and look it up 890 == static char p_b_coll_elem(register struct parse *p, int endc); 891 */ 892 static char /* value of collating element */ 893 p_b_coll_elem(p, endc) 894 register struct parse *p; 895 int endc; /* name ended by endc,']' */ 896 { 897 register char *sp = p->next; 898 register struct cname *cp; 899 register int len; 900 901 while (MORE() && !SEETWO(endc, ']')) 902 NEXT(); 903 if (!MORE()) { 904 SETERROR(REG_EBRACK); 905 return(0); 906 } 907 len = p->next - sp; 908 for (cp = cnames; cp->name != NULL; cp++) 909 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 910 return(cp->code); /* known name */ 911 if (len == 1) 912 return(*sp); /* single character */ 913 SETERROR(REG_ECOLLATE); /* neither */ 914 return(0); 915 } 916 917 /* 918 - othercase - return the case counterpart of an alphabetic 919 == static char othercase(int ch); 920 */ 921 static char /* if no counterpart, return ch */ 922 othercase(ch) 923 int ch; 924 { 925 assert(isalpha(ch)); 926 if (isupper(ch)) 927 return(tolower(ch)); 928 else if (islower(ch)) 929 return(toupper(ch)); 930 else /* peculiar, but could happen */ 931 return(ch); 932 } 933 934 /* 935 - bothcases - emit a dualcase version of a two-case character 936 == static void bothcases(register struct parse *p, int ch); 937 * 938 * Boy, is this implementation ever a kludge... 939 */ 940 static void 941 bothcases(p, ch) 942 register struct parse *p; 943 int ch; 944 { 945 register char *oldnext = p->next; 946 register char *oldend = p->end; 947 char bracket[3]; 948 949 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 950 p->next = bracket; 951 p->end = bracket+2; 952 bracket[0] = ch; 953 bracket[1] = ']'; 954 bracket[2] = '\0'; 955 p_bracket(p); 956 assert(p->next == bracket+2); 957 p->next = oldnext; 958 p->end = oldend; 959 } 960 961 /* 962 - ordinary - emit an ordinary character 963 == static void ordinary(register struct parse *p, register int ch); 964 */ 965 static void 966 ordinary(p, ch) 967 register struct parse *p; 968 register int ch; 969 { 970 register cat_t *cap = p->g->categories; 971 972 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 973 bothcases(p, ch); 974 else { 975 EMIT(OCHAR, (unsigned char)ch); 976 if (cap[ch] == 0) 977 cap[ch] = p->g->ncategories++; 978 } 979 } 980 981 /* 982 - nonnewline - emit REG_NEWLINE version of OANY 983 == static void nonnewline(register struct parse *p); 984 * 985 * Boy, is this implementation ever a kludge... 986 */ 987 static void 988 nonnewline(p) 989 register struct parse *p; 990 { 991 register char *oldnext = p->next; 992 register char *oldend = p->end; 993 char bracket[4]; 994 995 p->next = bracket; 996 p->end = bracket+3; 997 bracket[0] = '^'; 998 bracket[1] = '\n'; 999 bracket[2] = ']'; 1000 bracket[3] = '\0'; 1001 p_bracket(p); 1002 assert(p->next == bracket+3); 1003 p->next = oldnext; 1004 p->end = oldend; 1005 } 1006 1007 /* 1008 - repeat - generate code for a bounded repetition, recursively if needed 1009 == static void repeat(register struct parse *p, sopno start, int from, int to); 1010 */ 1011 static void 1012 repeat(p, start, from, to) 1013 register struct parse *p; 1014 sopno start; /* operand from here to end of strip */ 1015 int from; /* repeated from this number */ 1016 int to; /* to this number of times (maybe INFINITY) */ 1017 { 1018 register sopno finish = HERE(); 1019 # define N 2 1020 # define INF 3 1021 # define REP(f, t) ((f)*8 + (t)) 1022 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1023 register sopno copy; 1024 1025 if (p->error != 0) /* head off possible runaway recursion */ 1026 return; 1027 1028 assert(from <= to); 1029 1030 switch (REP(MAP(from), MAP(to))) { 1031 case REP(0, 0): /* must be user doing this */ 1032 DROP(finish-start); /* drop the operand */ 1033 break; 1034 case REP(0, 1): /* as x{1,1}? */ 1035 case REP(0, N): /* as x{1,n}? */ 1036 case REP(0, INF): /* as x{1,}? */ 1037 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1038 INSERT(OCH_, start); /* offset is wrong... */ 1039 repeat(p, start+1, 1, to); 1040 ASTERN(OOR1, start); 1041 AHEAD(start); /* ... fix it */ 1042 EMIT(OOR2, 0); 1043 AHEAD(THERE()); 1044 ASTERN(O_CH, THERETHERE()); 1045 break; 1046 case REP(1, 1): /* trivial case */ 1047 /* done */ 1048 break; 1049 case REP(1, N): /* as x?x{1,n-1} */ 1050 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1051 INSERT(OCH_, start); 1052 ASTERN(OOR1, start); 1053 AHEAD(start); 1054 EMIT(OOR2, 0); /* offset very wrong... */ 1055 AHEAD(THERE()); /* ...so fix it */ 1056 ASTERN(O_CH, THERETHERE()); 1057 copy = dupl(p, start+1, finish+1); 1058 assert(copy == finish+4); 1059 repeat(p, copy, 1, to-1); 1060 break; 1061 case REP(1, INF): /* as x+ */ 1062 INSERT(OPLUS_, start); 1063 ASTERN(O_PLUS, start); 1064 break; 1065 case REP(N, N): /* as xx{m-1,n-1} */ 1066 copy = dupl(p, start, finish); 1067 repeat(p, copy, from-1, to-1); 1068 break; 1069 case REP(N, INF): /* as xx{n-1,INF} */ 1070 copy = dupl(p, start, finish); 1071 repeat(p, copy, from-1, to); 1072 break; 1073 default: /* "can't happen" */ 1074 SETERROR(REG_ASSERT); /* just in case */ 1075 break; 1076 } 1077 } 1078 1079 /* 1080 - seterr - set an error condition 1081 == static int seterr(register struct parse *p, int e); 1082 */ 1083 static int /* useless but makes type checking happy */ 1084 seterr(p, e) 1085 register struct parse *p; 1086 int e; 1087 { 1088 if (p->error == 0) /* keep earliest error condition */ 1089 p->error = e; 1090 p->next = nuls; /* try to bring things to a halt */ 1091 p->end = nuls; 1092 return(0); /* make the return value well-defined */ 1093 } 1094 1095 /* 1096 - allocset - allocate a set of characters for [] 1097 == static cset *allocset(register struct parse *p); 1098 */ 1099 static cset * 1100 allocset(p) 1101 register struct parse *p; 1102 { 1103 register int no = p->g->ncsets++; 1104 register size_t nc; 1105 register size_t nbytes; 1106 register cset *cs; 1107 register size_t css = (size_t)p->g->csetsize; 1108 register int i; 1109 1110 if (no >= p->ncsalloc) { /* need another column of space */ 1111 p->ncsalloc += CHAR_BIT; 1112 nc = p->ncsalloc; 1113 assert(nc % CHAR_BIT == 0); 1114 nbytes = nc / CHAR_BIT * css; 1115 if (p->g->sets == NULL) 1116 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1117 else 1118 p->g->sets = (cset *)realloc((char *)p->g->sets, 1119 nc * sizeof(cset)); 1120 if (p->g->setbits == NULL) 1121 p->g->setbits = (uch *)malloc(nbytes); 1122 else { 1123 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1124 nbytes); 1125 /* xxx this isn't right if setbits is now NULL */ 1126 for (i = 0; i < no; i++) 1127 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1128 } 1129 if (p->g->sets != NULL && p->g->setbits != NULL) 1130 (void) memset((char *)p->g->setbits + (nbytes - css), 1131 0, css); 1132 else { 1133 no = 0; 1134 SETERROR(REG_ESPACE); 1135 /* caller's responsibility not to do set ops */ 1136 } 1137 } 1138 1139 assert(p->g->sets != NULL); /* xxx */ 1140 cs = &p->g->sets[no]; 1141 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1142 cs->mask = 1 << ((no) % CHAR_BIT); 1143 cs->hash = 0; 1144 cs->smultis = 0; 1145 cs->multis = NULL; 1146 1147 return(cs); 1148 } 1149 1150 /* 1151 - freeset - free a now-unused set 1152 == static void freeset(register struct parse *p, register cset *cs); 1153 */ 1154 static void 1155 freeset(p, cs) 1156 register struct parse *p; 1157 register cset *cs; 1158 { 1159 register int i; 1160 register cset *top = &p->g->sets[p->g->ncsets]; 1161 register size_t css = (size_t)p->g->csetsize; 1162 1163 for (i = 0; i < css; i++) 1164 CHsub(cs, i); 1165 if (cs == top-1) /* recover only the easy case */ 1166 p->g->ncsets--; 1167 } 1168 1169 /* 1170 - freezeset - final processing on a set of characters 1171 == static int freezeset(register struct parse *p, register cset *cs); 1172 * 1173 * The main task here is merging identical sets. This is usually a waste 1174 * of time (although the hash code minimizes the overhead), but can win 1175 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1176 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1177 * the same value! 1178 */ 1179 static int /* set number */ 1180 freezeset(p, cs) 1181 register struct parse *p; 1182 register cset *cs; 1183 { 1184 register uch h = cs->hash; 1185 register int i; 1186 register cset *top = &p->g->sets[p->g->ncsets]; 1187 register cset *cs2; 1188 register size_t css = (size_t)p->g->csetsize; 1189 1190 /* look for an earlier one which is the same */ 1191 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1192 if (cs2->hash == h && cs2 != cs) { 1193 /* maybe */ 1194 for (i = 0; i < css; i++) 1195 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1196 break; /* no */ 1197 if (i == css) 1198 break; /* yes */ 1199 } 1200 1201 if (cs2 < top) { /* found one */ 1202 freeset(p, cs); 1203 cs = cs2; 1204 } 1205 1206 return((int)(cs - p->g->sets)); 1207 } 1208 1209 /* 1210 - firstch - return first character in a set (which must have at least one) 1211 == static int firstch(register struct parse *p, register cset *cs); 1212 */ 1213 static int /* character; there is no "none" value */ 1214 firstch(p, cs) 1215 register struct parse *p; 1216 register cset *cs; 1217 { 1218 register int i; 1219 register size_t css = (size_t)p->g->csetsize; 1220 1221 for (i = 0; i < css; i++) 1222 if (CHIN(cs, i)) 1223 return((char)i); 1224 assert(never); 1225 return(0); /* arbitrary */ 1226 } 1227 1228 /* 1229 - nch - number of characters in a set 1230 == static int nch(register struct parse *p, register cset *cs); 1231 */ 1232 static int 1233 nch(p, cs) 1234 register struct parse *p; 1235 register cset *cs; 1236 { 1237 register int i; 1238 register size_t css = (size_t)p->g->csetsize; 1239 register int n = 0; 1240 1241 for (i = 0; i < css; i++) 1242 if (CHIN(cs, i)) 1243 n++; 1244 return(n); 1245 } 1246 1247 /* 1248 - mcadd - add a collating element to a cset 1249 == static void mcadd(register struct parse *p, register cset *cs, \ 1250 == register char *cp); 1251 */ 1252 static void 1253 mcadd(p, cs, cp) 1254 register struct parse *p; 1255 register cset *cs; 1256 register char *cp; 1257 { 1258 register size_t oldend = cs->smultis; 1259 1260 cs->smultis += strlen(cp) + 1; 1261 if (cs->multis == NULL) 1262 cs->multis = malloc(cs->smultis); 1263 else 1264 cs->multis = realloc(cs->multis, cs->smultis); 1265 if (cs->multis == NULL) { 1266 SETERROR(REG_ESPACE); 1267 return; 1268 } 1269 1270 (void) strcpy(cs->multis + oldend - 1, cp); 1271 cs->multis[cs->smultis - 1] = '\0'; 1272 } 1273 1274 #if 0 1275 /* 1276 - mcsub - subtract a collating element from a cset 1277 == static void mcsub(register cset *cs, register char *cp); 1278 */ 1279 static void 1280 mcsub(cs, cp) 1281 register cset *cs; 1282 register char *cp; 1283 { 1284 register char *fp = mcfind(cs, cp); 1285 register size_t len = strlen(fp); 1286 1287 assert(fp != NULL); 1288 (void) memmove(fp, fp + len + 1, 1289 cs->smultis - (fp + len + 1 - cs->multis)); 1290 cs->smultis -= len; 1291 1292 if (cs->smultis == 0) { 1293 free(cs->multis); 1294 cs->multis = NULL; 1295 return; 1296 } 1297 1298 cs->multis = realloc(cs->multis, cs->smultis); 1299 assert(cs->multis != NULL); 1300 } 1301 1302 /* 1303 - mcin - is a collating element in a cset? 1304 == static int mcin(register cset *cs, register char *cp); 1305 */ 1306 static int 1307 mcin(cs, cp) 1308 register cset *cs; 1309 register char *cp; 1310 { 1311 return(mcfind(cs, cp) != NULL); 1312 } 1313 1314 /* 1315 - mcfind - find a collating element in a cset 1316 == static char *mcfind(register cset *cs, register char *cp); 1317 */ 1318 static char * 1319 mcfind(cs, cp) 1320 register cset *cs; 1321 register char *cp; 1322 { 1323 register char *p; 1324 1325 if (cs->multis == NULL) 1326 return(NULL); 1327 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1328 if (strcmp(cp, p) == 0) 1329 return(p); 1330 return(NULL); 1331 } 1332 #endif 1333 1334 /* 1335 - mcinvert - invert the list of collating elements in a cset 1336 == static void mcinvert(register struct parse *p, register cset *cs); 1337 * 1338 * This would have to know the set of possibilities. Implementation 1339 * is deferred. 1340 */ 1341 static void 1342 mcinvert(p, cs) 1343 register struct parse *p; 1344 register cset *cs; 1345 { 1346 assert(cs->multis == NULL); /* xxx */ 1347 } 1348 1349 /* 1350 - mccase - add case counterparts of the list of collating elements in a cset 1351 == static void mccase(register struct parse *p, register cset *cs); 1352 * 1353 * This would have to know the set of possibilities. Implementation 1354 * is deferred. 1355 */ 1356 static void 1357 mccase(p, cs) 1358 register struct parse *p; 1359 register cset *cs; 1360 { 1361 assert(cs->multis == NULL); /* xxx */ 1362 } 1363 1364 /* 1365 - isinsets - is this character in any sets? 1366 == static int isinsets(register struct re_guts *g, int c); 1367 */ 1368 static int /* predicate */ 1369 isinsets(g, c) 1370 register struct re_guts *g; 1371 int c; 1372 { 1373 register uch *col; 1374 register int i; 1375 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1376 register unsigned uc = (unsigned char)c; 1377 1378 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1379 if (col[uc] != 0) 1380 return(1); 1381 return(0); 1382 } 1383 1384 /* 1385 - samesets - are these two characters in exactly the same sets? 1386 == static int samesets(register struct re_guts *g, int c1, int c2); 1387 */ 1388 static int /* predicate */ 1389 samesets(g, c1, c2) 1390 register struct re_guts *g; 1391 int c1; 1392 int c2; 1393 { 1394 register uch *col; 1395 register int i; 1396 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1397 register unsigned uc1 = (unsigned char)c1; 1398 register unsigned uc2 = (unsigned char)c2; 1399 1400 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1401 if (col[uc1] != col[uc2]) 1402 return(0); 1403 return(1); 1404 } 1405 1406 /* 1407 - categorize - sort out character categories 1408 == static void categorize(struct parse *p, register struct re_guts *g); 1409 */ 1410 static void 1411 categorize(p, g) 1412 struct parse *p; 1413 register struct re_guts *g; 1414 { 1415 register cat_t *cats = g->categories; 1416 register int c; 1417 register int c2; 1418 register cat_t cat; 1419 1420 /* avoid making error situations worse */ 1421 if (p->error != 0) 1422 return; 1423 1424 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1425 if (cats[c] == 0 && isinsets(g, c)) { 1426 cat = g->ncategories++; 1427 cats[c] = cat; 1428 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1429 if (cats[c2] == 0 && samesets(g, c, c2)) 1430 cats[c2] = cat; 1431 } 1432 } 1433 1434 /* 1435 - dupl - emit a duplicate of a bunch of sops 1436 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1437 */ 1438 static sopno /* start of duplicate */ 1439 dupl(p, start, finish) 1440 register struct parse *p; 1441 sopno start; /* from here */ 1442 sopno finish; /* to this less one */ 1443 { 1444 register sopno ret = HERE(); 1445 register sopno len = finish - start; 1446 1447 assert(finish >= start); 1448 if (len == 0) 1449 return(ret); 1450 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1451 assert(p->ssize >= p->slen + len); 1452 (void) memcpy((char *)(p->strip + p->slen), 1453 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1454 p->slen += len; 1455 return(ret); 1456 } 1457 1458 /* 1459 - doemit - emit a strip operator 1460 == static void doemit(register struct parse *p, sop op, size_t opnd); 1461 * 1462 * It might seem better to implement this as a macro with a function as 1463 * hard-case backup, but it's just too big and messy unless there are 1464 * some changes to the data structures. Maybe later. 1465 */ 1466 static void 1467 doemit(p, op, opnd) 1468 register struct parse *p; 1469 sop op; 1470 size_t opnd; 1471 { 1472 /* avoid making error situations worse */ 1473 if (p->error != 0) 1474 return; 1475 1476 /* deal with oversize operands ("can't happen", more or less) */ 1477 assert(opnd < 1<<OPSHIFT); 1478 1479 /* deal with undersized strip */ 1480 if (p->slen >= p->ssize) 1481 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1482 assert(p->slen < p->ssize); 1483 1484 /* finally, it's all reduced to the easy case */ 1485 p->strip[p->slen++] = SOP(op, opnd); 1486 } 1487 1488 /* 1489 - doinsert - insert a sop into the strip 1490 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1491 */ 1492 static void 1493 doinsert(p, op, opnd, pos) 1494 register struct parse *p; 1495 sop op; 1496 size_t opnd; 1497 sopno pos; 1498 { 1499 register sopno sn; 1500 register sop s; 1501 register int i; 1502 1503 /* avoid making error situations worse */ 1504 if (p->error != 0) 1505 return; 1506 1507 sn = HERE(); 1508 EMIT(op, opnd); /* do checks, ensure space */ 1509 assert(HERE() == sn+1); 1510 s = p->strip[sn]; 1511 1512 /* adjust paren pointers */ 1513 assert(pos > 0); 1514 for (i = 1; i < NPAREN; i++) { 1515 if (p->pbegin[i] >= pos) { 1516 p->pbegin[i]++; 1517 } 1518 if (p->pend[i] >= pos) { 1519 p->pend[i]++; 1520 } 1521 } 1522 1523 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1524 (HERE()-pos-1)*sizeof(sop)); 1525 p->strip[pos] = s; 1526 } 1527 1528 /* 1529 - dofwd - complete a forward reference 1530 == static void dofwd(register struct parse *p, sopno pos, sop value); 1531 */ 1532 static void 1533 dofwd(p, pos, value) 1534 register struct parse *p; 1535 register sopno pos; 1536 sop value; 1537 { 1538 /* avoid making error situations worse */ 1539 if (p->error != 0) 1540 return; 1541 1542 assert(value < 1<<OPSHIFT); 1543 p->strip[pos] = OP(p->strip[pos]) | value; 1544 } 1545 1546 /* 1547 - enlarge - enlarge the strip 1548 == static void enlarge(register struct parse *p, sopno size); 1549 */ 1550 static void 1551 enlarge(p, size) 1552 register struct parse *p; 1553 register sopno size; 1554 { 1555 register sop *sp; 1556 1557 if (p->ssize >= size) 1558 return; 1559 1560 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1561 if (sp == NULL) { 1562 SETERROR(REG_ESPACE); 1563 return; 1564 } 1565 p->strip = sp; 1566 p->ssize = size; 1567 } 1568 1569 /* 1570 - stripsnug - compact the strip 1571 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1572 */ 1573 static void 1574 stripsnug(p, g) 1575 register struct parse *p; 1576 register struct re_guts *g; 1577 { 1578 g->nstates = p->slen; 1579 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1580 if (g->strip == NULL) { 1581 SETERROR(REG_ESPACE); 1582 g->strip = p->strip; 1583 } 1584 } 1585 1586 /* 1587 - findmust - fill in must and mlen with longest mandatory literal string 1588 == static void findmust(register struct parse *p, register struct re_guts *g); 1589 * 1590 * This algorithm could do fancy things like analyzing the operands of | 1591 * for common subsequences. Someday. This code is simple and finds most 1592 * of the interesting cases. 1593 * 1594 * Note that must and mlen got initialized during setup. 1595 */ 1596 static void 1597 findmust(p, g) 1598 struct parse *p; 1599 register struct re_guts *g; 1600 { 1601 register sop *scan; 1602 sop *start = NULL; 1603 register sop *newstart = NULL; 1604 register sopno newlen; 1605 register sop s; 1606 register char *cp; 1607 register sopno i; 1608 1609 /* avoid making error situations worse */ 1610 if (p->error != 0) 1611 return; 1612 1613 /* find the longest OCHAR sequence in strip */ 1614 newlen = 0; 1615 scan = g->strip + 1; 1616 do { 1617 s = *scan++; 1618 switch (OP(s)) { 1619 case OCHAR: /* sequence member */ 1620 if (newlen == 0) /* new sequence */ 1621 newstart = scan - 1; 1622 newlen++; 1623 break; 1624 case OPLUS_: /* things that don't break one */ 1625 case OLPAREN: 1626 case ORPAREN: 1627 break; 1628 case OQUEST_: /* things that must be skipped */ 1629 case OCH_: 1630 scan--; 1631 do { 1632 scan += OPND(s); 1633 s = *scan; 1634 /* assert() interferes w debug printouts */ 1635 if (OP(s) != O_QUEST && OP(s) != O_CH && 1636 OP(s) != OOR2) { 1637 g->iflags |= BAD; 1638 return; 1639 } 1640 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1641 /* fallthrough */ 1642 default: /* things that break a sequence */ 1643 if (newlen > g->mlen) { /* ends one */ 1644 start = newstart; 1645 g->mlen = newlen; 1646 } 1647 newlen = 0; 1648 break; 1649 } 1650 } while (OP(s) != OEND); 1651 1652 if (g->mlen == 0) /* there isn't one */ 1653 return; 1654 1655 /* turn it into a character string */ 1656 g->must = malloc((size_t)g->mlen + 1); 1657 if (g->must == NULL) { /* argh; just forget it */ 1658 g->mlen = 0; 1659 return; 1660 } 1661 cp = g->must; 1662 scan = start; 1663 for (i = g->mlen; i > 0; i--) { 1664 while (OP(s = *scan++) != OCHAR) 1665 continue; 1666 assert(cp < g->must + g->mlen); 1667 *cp++ = (char)OPND(s); 1668 } 1669 assert(cp == g->must + g->mlen); 1670 *cp++ = '\0'; /* just on general principles */ 1671 } 1672 1673 /* 1674 - pluscount - count + nesting 1675 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1676 */ 1677 static sopno /* nesting depth */ 1678 pluscount(p, g) 1679 struct parse *p; 1680 register struct re_guts *g; 1681 { 1682 register sop *scan; 1683 register sop s; 1684 register sopno plusnest = 0; 1685 register sopno maxnest = 0; 1686 1687 if (p->error != 0) 1688 return(0); /* there may not be an OEND */ 1689 1690 scan = g->strip + 1; 1691 do { 1692 s = *scan++; 1693 switch (OP(s)) { 1694 case OPLUS_: 1695 plusnest++; 1696 break; 1697 case O_PLUS: 1698 if (plusnest > maxnest) 1699 maxnest = plusnest; 1700 plusnest--; 1701 break; 1702 } 1703 } while (OP(s) != OEND); 1704 if (plusnest != 0) 1705 g->iflags |= BAD; 1706 return(maxnest); 1707 } 1708