1 /* $NetBSD: regcomp.c,v 1.8 1997/07/21 14:08:15 jtc Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 #include <sys/cdefs.h> 43 #if defined(LIBC_SCCS) && !defined(lint) 44 #if 0 45 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 46 #else 47 __RCSID("$NetBSD: regcomp.c,v 1.8 1997/07/21 14:08:15 jtc Exp $"); 48 #endif 49 #endif /* LIBC_SCCS and not lint */ 50 51 #include "namespace.h" 52 #include <sys/types.h> 53 #include <stdio.h> 54 #include <string.h> 55 #include <ctype.h> 56 #include <limits.h> 57 #include <stdlib.h> 58 #include <regex.h> 59 60 #ifdef __weak_alias 61 __weak_alias(regcomp,_regcomp); 62 #endif 63 64 #include "utils.h" 65 #include "regex2.h" 66 67 #include "cclass.h" 68 #include "cname.h" 69 70 /* 71 * parse structure, passed up and down to avoid global variables and 72 * other clumsinesses 73 */ 74 struct parse { 75 char *next; /* next character in RE */ 76 char *end; /* end of string (-> NUL normally) */ 77 int error; /* has an error been seen? */ 78 sop *strip; /* malloced strip */ 79 sopno ssize; /* malloced strip size (allocated) */ 80 sopno slen; /* malloced strip length (used) */ 81 int ncsalloc; /* number of csets allocated */ 82 struct re_guts *g; 83 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 84 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 85 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 86 }; 87 88 /* ========= begin header generated by ./mkh ========= */ 89 #ifdef __cplusplus 90 extern "C" { 91 #endif 92 93 /* === regcomp.c === */ 94 static void p_ere __P((struct parse *p, int stop)); 95 static void p_ere_exp __P((struct parse *p)); 96 static void p_str __P((struct parse *p)); 97 static void p_bre __P((struct parse *p, int end1, int end2)); 98 static int p_simp_re __P((struct parse *p, int starordinary)); 99 static int p_count __P((struct parse *p)); 100 static void p_bracket __P((struct parse *p)); 101 static void p_b_term __P((struct parse *p, cset *cs)); 102 static void p_b_cclass __P((struct parse *p, cset *cs)); 103 static void p_b_eclass __P((struct parse *p, cset *cs)); 104 static char p_b_symbol __P((struct parse *p)); 105 static char p_b_coll_elem __P((struct parse *p, int endc)); 106 static char othercase __P((int ch)); 107 static void bothcases __P((struct parse *p, int ch)); 108 static void ordinary __P((struct parse *p, int ch)); 109 static void nonnewline __P((struct parse *p)); 110 static void repeat __P((struct parse *p, sopno start, int from, int to)); 111 static int seterr __P((struct parse *p, int e)); 112 static cset *allocset __P((struct parse *p)); 113 static void freeset __P((struct parse *p, cset *cs)); 114 static int freezeset __P((struct parse *p, cset *cs)); 115 static int firstch __P((struct parse *p, cset *cs)); 116 static int nch __P((struct parse *p, cset *cs)); 117 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 118 #if 0 119 static void mcsub __P((cset *cs, char *cp)); 120 static int mcin __P((cset *cs, char *cp)); 121 static char *mcfind __P((cset *cs, char *cp)); 122 #endif 123 static void mcinvert __P((struct parse *p, cset *cs)); 124 static void mccase __P((struct parse *p, cset *cs)); 125 static int isinsets __P((struct re_guts *g, int c)); 126 static int samesets __P((struct re_guts *g, int c1, int c2)); 127 static void categorize __P((struct parse *p, struct re_guts *g)); 128 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 129 static void doemit __P((struct parse *p, sop op, size_t opnd)); 130 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 131 static void dofwd __P((struct parse *p, sopno pos, sop value)); 132 static void enlarge __P((struct parse *p, sopno size)); 133 static void stripsnug __P((struct parse *p, struct re_guts *g)); 134 static void findmust __P((struct parse *p, struct re_guts *g)); 135 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 136 137 #ifdef __cplusplus 138 } 139 #endif 140 /* ========= end header generated by ./mkh ========= */ 141 142 static char nuls[10]; /* place to point scanner in event of error */ 143 144 /* 145 * macros for use with parse structure 146 * BEWARE: these know that the parse structure is named `p' !!! 147 */ 148 #define PEEK() (*p->next) 149 #define PEEK2() (*(p->next+1)) 150 #define MORE() (p->next < p->end) 151 #define MORE2() (p->next+1 < p->end) 152 #define SEE(c) (MORE() && PEEK() == (c)) 153 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 154 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 155 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 156 #define NEXT() (p->next++) 157 #define NEXT2() (p->next += 2) 158 #define NEXTn(n) (p->next += (n)) 159 #define GETNEXT() (*p->next++) 160 #define SETERROR(e) seterr(p, (e)) 161 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 162 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 163 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 164 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 165 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 166 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 167 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 168 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 169 #define HERE() (p->slen) 170 #define THERE() (p->slen - 1) 171 #define THERETHERE() (p->slen - 2) 172 #define DROP(n) (p->slen -= (n)) 173 174 #ifndef NDEBUG 175 static int never = 0; /* for use in asserts; shuts lint up */ 176 #else 177 #define never 0 /* some <assert.h>s have bugs too */ 178 #endif 179 180 /* 181 - regcomp - interface for parser and compilation 182 = extern int regcomp(regex_t *, const char *, int); 183 = #define REG_BASIC 0000 184 = #define REG_EXTENDED 0001 185 = #define REG_ICASE 0002 186 = #define REG_NOSUB 0004 187 = #define REG_NEWLINE 0010 188 = #define REG_NOSPEC 0020 189 = #define REG_PEND 0040 190 = #define REG_DUMP 0200 191 */ 192 int /* 0 success, otherwise REG_something */ 193 regcomp(preg, pattern, cflags) 194 regex_t *preg; 195 const char *pattern; 196 int cflags; 197 { 198 struct parse pa; 199 register struct re_guts *g; 200 register struct parse *p = &pa; 201 register int i; 202 register size_t len; 203 #ifdef REDEBUG 204 # define GOODFLAGS(f) (f) 205 #else 206 # define GOODFLAGS(f) ((f)&~REG_DUMP) 207 #endif 208 209 cflags = GOODFLAGS(cflags); 210 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 211 return(REG_INVARG); 212 213 if (cflags®_PEND) { 214 if (preg->re_endp < pattern) 215 return(REG_INVARG); 216 len = preg->re_endp - pattern; 217 } else 218 len = strlen((char *)pattern); 219 220 /* do the mallocs early so failure handling is easy */ 221 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 222 (NC-1)*sizeof(cat_t)); 223 if (g == NULL) 224 return(REG_ESPACE); 225 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 226 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 227 p->slen = 0; 228 if (p->strip == NULL) { 229 free((char *)g); 230 return(REG_ESPACE); 231 } 232 233 /* set things up */ 234 p->g = g; 235 p->next = (char *)pattern; /* convenience; we do not modify it */ 236 p->end = p->next + len; 237 p->error = 0; 238 p->ncsalloc = 0; 239 for (i = 0; i < NPAREN; i++) { 240 p->pbegin[i] = 0; 241 p->pend[i] = 0; 242 } 243 g->csetsize = NC; 244 g->sets = NULL; 245 g->setbits = NULL; 246 g->ncsets = 0; 247 g->cflags = cflags; 248 g->iflags = 0; 249 g->nbol = 0; 250 g->neol = 0; 251 g->must = NULL; 252 g->mlen = 0; 253 g->nsub = 0; 254 g->ncategories = 1; /* category 0 is "everything else" */ 255 g->categories = &g->catspace[-(CHAR_MIN)]; 256 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 257 g->backrefs = 0; 258 259 /* do it */ 260 EMIT(OEND, 0); 261 g->firststate = THERE(); 262 if (cflags®_EXTENDED) 263 p_ere(p, OUT); 264 else if (cflags®_NOSPEC) 265 p_str(p); 266 else 267 p_bre(p, OUT, OUT); 268 EMIT(OEND, 0); 269 g->laststate = THERE(); 270 271 /* tidy up loose ends and fill things in */ 272 categorize(p, g); 273 stripsnug(p, g); 274 findmust(p, g); 275 g->nplus = pluscount(p, g); 276 g->magic = MAGIC2; 277 preg->re_nsub = g->nsub; 278 preg->re_g = g; 279 preg->re_magic = MAGIC1; 280 #ifndef REDEBUG 281 /* not debugging, so can't rely on the assert() in regexec() */ 282 if (g->iflags&BAD) 283 SETERROR(REG_ASSERT); 284 #endif 285 286 /* win or lose, we're done */ 287 if (p->error != 0) /* lose */ 288 regfree(preg); 289 return(p->error); 290 } 291 292 /* 293 - p_ere - ERE parser top level, concatenation and alternation 294 == static void p_ere(register struct parse *p, int stop); 295 */ 296 static void 297 p_ere(p, stop) 298 register struct parse *p; 299 int stop; /* character this ERE should end at */ 300 { 301 register char c; 302 register sopno prevback = 0; /* pacify gcc */ 303 register sopno prevfwd = 0; /* pacify gcc */ 304 register sopno conc; 305 register int first = 1; /* is this the first alternative? */ 306 307 for (;;) { 308 /* do a bunch of concatenated expressions */ 309 conc = HERE(); 310 while (MORE() && (c = PEEK()) != '|' && c != stop) 311 p_ere_exp(p); 312 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 313 314 if (!EAT('|')) 315 break; /* NOTE BREAK OUT */ 316 317 if (first) { 318 INSERT(OCH_, conc); /* offset is wrong */ 319 prevfwd = conc; 320 prevback = conc; 321 first = 0; 322 } 323 ASTERN(OOR1, prevback); 324 prevback = THERE(); 325 AHEAD(prevfwd); /* fix previous offset */ 326 prevfwd = HERE(); 327 EMIT(OOR2, 0); /* offset is very wrong */ 328 } 329 330 if (!first) { /* tail-end fixups */ 331 AHEAD(prevfwd); 332 ASTERN(O_CH, prevback); 333 } 334 335 assert(!MORE() || SEE(stop)); 336 } 337 338 /* 339 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 340 == static void p_ere_exp(register struct parse *p); 341 */ 342 static void 343 p_ere_exp(p) 344 register struct parse *p; 345 { 346 register char c; 347 register sopno pos; 348 register int count; 349 register int count2; 350 register sopno subno; 351 int wascaret = 0; 352 353 assert(MORE()); /* caller should have ensured this */ 354 c = GETNEXT(); 355 356 pos = HERE(); 357 switch (c) { 358 case '(': 359 REQUIRE(MORE(), REG_EPAREN); 360 p->g->nsub++; 361 subno = p->g->nsub; 362 if (subno < NPAREN) 363 p->pbegin[subno] = HERE(); 364 EMIT(OLPAREN, subno); 365 if (!SEE(')')) 366 p_ere(p, ')'); 367 if (subno < NPAREN) { 368 p->pend[subno] = HERE(); 369 assert(p->pend[subno] != 0); 370 } 371 EMIT(ORPAREN, subno); 372 MUSTEAT(')', REG_EPAREN); 373 break; 374 #ifndef POSIX_MISTAKE 375 case ')': /* happens only if no current unmatched ( */ 376 /* 377 * You may ask, why the ifndef? Because I didn't notice 378 * this until slightly too late for 1003.2, and none of the 379 * other 1003.2 regular-expression reviewers noticed it at 380 * all. So an unmatched ) is legal POSIX, at least until 381 * we can get it fixed. 382 */ 383 SETERROR(REG_EPAREN); 384 break; 385 #endif 386 case '^': 387 EMIT(OBOL, 0); 388 p->g->iflags |= USEBOL; 389 p->g->nbol++; 390 wascaret = 1; 391 break; 392 case '$': 393 EMIT(OEOL, 0); 394 p->g->iflags |= USEEOL; 395 p->g->neol++; 396 break; 397 case '|': 398 SETERROR(REG_EMPTY); 399 break; 400 case '*': 401 case '+': 402 case '?': 403 SETERROR(REG_BADRPT); 404 break; 405 case '.': 406 if (p->g->cflags®_NEWLINE) 407 nonnewline(p); 408 else 409 EMIT(OANY, 0); 410 break; 411 case '[': 412 p_bracket(p); 413 break; 414 case '\\': 415 REQUIRE(MORE(), REG_EESCAPE); 416 c = GETNEXT(); 417 ordinary(p, c); 418 break; 419 case '{': /* okay as ordinary except if digit follows */ 420 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 421 /* FALLTHROUGH */ 422 default: 423 ordinary(p, c); 424 break; 425 } 426 427 if (!MORE()) 428 return; 429 c = PEEK(); 430 /* we call { a repetition if followed by a digit */ 431 if (!( c == '*' || c == '+' || c == '?' || 432 (c == '{' && MORE2() && isdigit(PEEK2())) )) 433 return; /* no repetition, we're done */ 434 NEXT(); 435 436 REQUIRE(!wascaret, REG_BADRPT); 437 switch (c) { 438 case '*': /* implemented as +? */ 439 /* this case does not require the (y|) trick, noKLUDGE */ 440 INSERT(OPLUS_, pos); 441 ASTERN(O_PLUS, pos); 442 INSERT(OQUEST_, pos); 443 ASTERN(O_QUEST, pos); 444 break; 445 case '+': 446 INSERT(OPLUS_, pos); 447 ASTERN(O_PLUS, pos); 448 break; 449 case '?': 450 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 451 INSERT(OCH_, pos); /* offset slightly wrong */ 452 ASTERN(OOR1, pos); /* this one's right */ 453 AHEAD(pos); /* fix the OCH_ */ 454 EMIT(OOR2, 0); /* offset very wrong... */ 455 AHEAD(THERE()); /* ...so fix it */ 456 ASTERN(O_CH, THERETHERE()); 457 break; 458 case '{': 459 count = p_count(p); 460 if (EAT(',')) { 461 if (isdigit(PEEK())) { 462 count2 = p_count(p); 463 REQUIRE(count <= count2, REG_BADBR); 464 } else /* single number with comma */ 465 count2 = INFINITY; 466 } else /* just a single number */ 467 count2 = count; 468 repeat(p, pos, count, count2); 469 if (!EAT('}')) { /* error heuristics */ 470 while (MORE() && PEEK() != '}') 471 NEXT(); 472 REQUIRE(MORE(), REG_EBRACE); 473 SETERROR(REG_BADBR); 474 } 475 break; 476 } 477 478 if (!MORE()) 479 return; 480 c = PEEK(); 481 if (!( c == '*' || c == '+' || c == '?' || 482 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 483 return; 484 SETERROR(REG_BADRPT); 485 } 486 487 /* 488 - p_str - string (no metacharacters) "parser" 489 == static void p_str(register struct parse *p); 490 */ 491 static void 492 p_str(p) 493 register struct parse *p; 494 { 495 REQUIRE(MORE(), REG_EMPTY); 496 while (MORE()) 497 ordinary(p, GETNEXT()); 498 } 499 500 /* 501 - p_bre - BRE parser top level, anchoring and concatenation 502 == static void p_bre(register struct parse *p, register int end1, \ 503 == register int end2); 504 * Giving end1 as OUT essentially eliminates the end1/end2 check. 505 * 506 * This implementation is a bit of a kludge, in that a trailing $ is first 507 * taken as an ordinary character and then revised to be an anchor. The 508 * only undesirable side effect is that '$' gets included as a character 509 * category in such cases. This is fairly harmless; not worth fixing. 510 * The amount of lookahead needed to avoid this kludge is excessive. 511 */ 512 static void 513 p_bre(p, end1, end2) 514 register struct parse *p; 515 register int end1; /* first terminating character */ 516 register int end2; /* second terminating character */ 517 { 518 register sopno start = HERE(); 519 register int first = 1; /* first subexpression? */ 520 register int wasdollar = 0; 521 522 if (EAT('^')) { 523 EMIT(OBOL, 0); 524 p->g->iflags |= USEBOL; 525 p->g->nbol++; 526 } 527 while (MORE() && !SEETWO(end1, end2)) { 528 wasdollar = p_simp_re(p, first); 529 first = 0; 530 } 531 if (wasdollar) { /* oops, that was a trailing anchor */ 532 DROP(1); 533 EMIT(OEOL, 0); 534 p->g->iflags |= USEEOL; 535 p->g->neol++; 536 } 537 538 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 539 } 540 541 /* 542 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 543 == static int p_simp_re(register struct parse *p, int starordinary); 544 */ 545 static int /* was the simple RE an unbackslashed $? */ 546 p_simp_re(p, starordinary) 547 register struct parse *p; 548 int starordinary; /* is a leading * an ordinary character? */ 549 { 550 register int c; 551 register int count; 552 register int count2; 553 register sopno pos; 554 register int i; 555 register sopno subno; 556 # define BACKSL (1<<CHAR_BIT) 557 558 pos = HERE(); /* repetion op, if any, covers from here */ 559 560 assert(MORE()); /* caller should have ensured this */ 561 c = GETNEXT(); 562 if (c == '\\') { 563 REQUIRE(MORE(), REG_EESCAPE); 564 c = BACKSL | (unsigned char)GETNEXT(); 565 } 566 switch (c) { 567 case '.': 568 if (p->g->cflags®_NEWLINE) 569 nonnewline(p); 570 else 571 EMIT(OANY, 0); 572 break; 573 case '[': 574 p_bracket(p); 575 break; 576 case BACKSL|'{': 577 SETERROR(REG_BADRPT); 578 break; 579 case BACKSL|'(': 580 p->g->nsub++; 581 subno = p->g->nsub; 582 if (subno < NPAREN) 583 p->pbegin[subno] = HERE(); 584 EMIT(OLPAREN, subno); 585 /* the MORE here is an error heuristic */ 586 if (MORE() && !SEETWO('\\', ')')) 587 p_bre(p, '\\', ')'); 588 if (subno < NPAREN) { 589 p->pend[subno] = HERE(); 590 assert(p->pend[subno] != 0); 591 } 592 EMIT(ORPAREN, subno); 593 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 594 break; 595 case BACKSL|')': /* should not get here -- must be user */ 596 case BACKSL|'}': 597 SETERROR(REG_EPAREN); 598 break; 599 case BACKSL|'1': 600 case BACKSL|'2': 601 case BACKSL|'3': 602 case BACKSL|'4': 603 case BACKSL|'5': 604 case BACKSL|'6': 605 case BACKSL|'7': 606 case BACKSL|'8': 607 case BACKSL|'9': 608 i = (c&~BACKSL) - '0'; 609 assert(i < NPAREN); 610 if (p->pend[i] != 0) { 611 assert(i <= p->g->nsub); 612 EMIT(OBACK_, i); 613 assert(p->pbegin[i] != 0); 614 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 615 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 616 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 617 EMIT(O_BACK, i); 618 } else 619 SETERROR(REG_ESUBREG); 620 p->g->backrefs = 1; 621 break; 622 case '*': 623 REQUIRE(starordinary, REG_BADRPT); 624 /* FALLTHROUGH */ 625 default: 626 ordinary(p, c &~ BACKSL); 627 break; 628 } 629 630 if (EAT('*')) { /* implemented as +? */ 631 /* this case does not require the (y|) trick, noKLUDGE */ 632 INSERT(OPLUS_, pos); 633 ASTERN(O_PLUS, pos); 634 INSERT(OQUEST_, pos); 635 ASTERN(O_QUEST, pos); 636 } else if (EATTWO('\\', '{')) { 637 count = p_count(p); 638 if (EAT(',')) { 639 if (MORE() && isdigit(PEEK())) { 640 count2 = p_count(p); 641 REQUIRE(count <= count2, REG_BADBR); 642 } else /* single number with comma */ 643 count2 = INFINITY; 644 } else /* just a single number */ 645 count2 = count; 646 repeat(p, pos, count, count2); 647 if (!EATTWO('\\', '}')) { /* error heuristics */ 648 while (MORE() && !SEETWO('\\', '}')) 649 NEXT(); 650 REQUIRE(MORE(), REG_EBRACE); 651 SETERROR(REG_BADBR); 652 } 653 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 654 return(1); 655 656 return(0); 657 } 658 659 /* 660 - p_count - parse a repetition count 661 == static int p_count(register struct parse *p); 662 */ 663 static int /* the value */ 664 p_count(p) 665 register struct parse *p; 666 { 667 register int count = 0; 668 register int ndigits = 0; 669 670 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 671 count = count*10 + (GETNEXT() - '0'); 672 ndigits++; 673 } 674 675 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 676 return(count); 677 } 678 679 /* 680 - p_bracket - parse a bracketed character list 681 == static void p_bracket(register struct parse *p); 682 * 683 * Note a significant property of this code: if the allocset() did SETERROR, 684 * no set operations are done. 685 */ 686 static void 687 p_bracket(p) 688 register struct parse *p; 689 { 690 register cset *cs = allocset(p); 691 register int invert = 0; 692 693 /* Dept of Truly Sickening Special-Case Kludges */ 694 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 695 EMIT(OBOW, 0); 696 NEXTn(6); 697 return; 698 } 699 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 700 EMIT(OEOW, 0); 701 NEXTn(6); 702 return; 703 } 704 705 if (EAT('^')) 706 invert++; /* make note to invert set at end */ 707 if (EAT(']')) 708 CHadd(cs, ']'); 709 else if (EAT('-')) 710 CHadd(cs, '-'); 711 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 712 p_b_term(p, cs); 713 if (EAT('-')) 714 CHadd(cs, '-'); 715 MUSTEAT(']', REG_EBRACK); 716 717 if (p->error != 0) /* don't mess things up further */ 718 return; 719 720 if (p->g->cflags®_ICASE) { 721 register int i; 722 register int ci; 723 724 for (i = p->g->csetsize - 1; i >= 0; i--) 725 if (CHIN(cs, i) && isalpha(i)) { 726 ci = othercase(i); 727 if (ci != i) 728 CHadd(cs, ci); 729 } 730 if (cs->multis != NULL) 731 mccase(p, cs); 732 } 733 if (invert) { 734 register int i; 735 736 for (i = p->g->csetsize - 1; i >= 0; i--) 737 if (CHIN(cs, i)) 738 CHsub(cs, i); 739 else 740 CHadd(cs, i); 741 if (p->g->cflags®_NEWLINE) 742 CHsub(cs, '\n'); 743 if (cs->multis != NULL) 744 mcinvert(p, cs); 745 } 746 747 assert(cs->multis == NULL); /* xxx */ 748 749 if (nch(p, cs) == 1) { /* optimize singleton sets */ 750 ordinary(p, firstch(p, cs)); 751 freeset(p, cs); 752 } else 753 EMIT(OANYOF, freezeset(p, cs)); 754 } 755 756 /* 757 - p_b_term - parse one term of a bracketed character list 758 == static void p_b_term(register struct parse *p, register cset *cs); 759 */ 760 static void 761 p_b_term(p, cs) 762 register struct parse *p; 763 register cset *cs; 764 { 765 register char c; 766 register char start, finish; 767 register int i; 768 769 /* classify what we've got */ 770 switch ((MORE()) ? PEEK() : '\0') { 771 case '[': 772 c = (MORE2()) ? PEEK2() : '\0'; 773 break; 774 case '-': 775 SETERROR(REG_ERANGE); 776 return; /* NOTE RETURN */ 777 break; 778 default: 779 c = '\0'; 780 break; 781 } 782 783 switch (c) { 784 case ':': /* character class */ 785 NEXT2(); 786 REQUIRE(MORE(), REG_EBRACK); 787 c = PEEK(); 788 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 789 p_b_cclass(p, cs); 790 REQUIRE(MORE(), REG_EBRACK); 791 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 792 break; 793 case '=': /* equivalence class */ 794 NEXT2(); 795 REQUIRE(MORE(), REG_EBRACK); 796 c = PEEK(); 797 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 798 p_b_eclass(p, cs); 799 REQUIRE(MORE(), REG_EBRACK); 800 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 801 break; 802 default: /* symbol, ordinary character, or range */ 803 /* xxx revision needed for multichar stuff */ 804 start = p_b_symbol(p); 805 if (SEE('-') && MORE2() && PEEK2() != ']') { 806 /* range */ 807 NEXT(); 808 if (EAT('-')) 809 finish = '-'; 810 else 811 finish = p_b_symbol(p); 812 } else 813 finish = start; 814 /* xxx what about signed chars here... */ 815 REQUIRE(start <= finish, REG_ERANGE); 816 for (i = start; i <= finish; i++) 817 CHadd(cs, i); 818 break; 819 } 820 } 821 822 /* 823 - p_b_cclass - parse a character-class name and deal with it 824 == static void p_b_cclass(register struct parse *p, register cset *cs); 825 */ 826 static void 827 p_b_cclass(p, cs) 828 register struct parse *p; 829 register cset *cs; 830 { 831 register char *sp = p->next; 832 register struct cclass *cp; 833 register size_t len; 834 register char *u; 835 register char c; 836 837 while (MORE() && isalpha(PEEK())) 838 NEXT(); 839 len = p->next - sp; 840 for (cp = cclasses; cp->name != NULL; cp++) 841 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 842 break; 843 if (cp->name == NULL) { 844 /* oops, didn't find it */ 845 SETERROR(REG_ECTYPE); 846 return; 847 } 848 849 u = cp->chars; 850 while ((c = *u++) != '\0') 851 CHadd(cs, c); 852 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 853 MCadd(p, cs, u); 854 } 855 856 /* 857 - p_b_eclass - parse an equivalence-class name and deal with it 858 == static void p_b_eclass(register struct parse *p, register cset *cs); 859 * 860 * This implementation is incomplete. xxx 861 */ 862 static void 863 p_b_eclass(p, cs) 864 register struct parse *p; 865 register cset *cs; 866 { 867 register char c; 868 869 c = p_b_coll_elem(p, '='); 870 CHadd(cs, c); 871 } 872 873 /* 874 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 875 == static char p_b_symbol(register struct parse *p); 876 */ 877 static char /* value of symbol */ 878 p_b_symbol(p) 879 register struct parse *p; 880 { 881 register char value; 882 883 REQUIRE(MORE(), REG_EBRACK); 884 if (!EATTWO('[', '.')) 885 return(GETNEXT()); 886 887 /* collating symbol */ 888 value = p_b_coll_elem(p, '.'); 889 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 890 return(value); 891 } 892 893 /* 894 - p_b_coll_elem - parse a collating-element name and look it up 895 == static char p_b_coll_elem(register struct parse *p, int endc); 896 */ 897 static char /* value of collating element */ 898 p_b_coll_elem(p, endc) 899 register struct parse *p; 900 int endc; /* name ended by endc,']' */ 901 { 902 register char *sp = p->next; 903 register struct cname *cp; 904 register int len; 905 906 while (MORE() && !SEETWO(endc, ']')) 907 NEXT(); 908 if (!MORE()) { 909 SETERROR(REG_EBRACK); 910 return(0); 911 } 912 len = p->next - sp; 913 for (cp = cnames; cp->name != NULL; cp++) 914 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 915 return(cp->code); /* known name */ 916 if (len == 1) 917 return(*sp); /* single character */ 918 SETERROR(REG_ECOLLATE); /* neither */ 919 return(0); 920 } 921 922 /* 923 - othercase - return the case counterpart of an alphabetic 924 == static char othercase(int ch); 925 */ 926 static char /* if no counterpart, return ch */ 927 othercase(ch) 928 int ch; 929 { 930 assert(isalpha(ch)); 931 if (isupper(ch)) 932 return(tolower(ch)); 933 else if (islower(ch)) 934 return(toupper(ch)); 935 else /* peculiar, but could happen */ 936 return(ch); 937 } 938 939 /* 940 - bothcases - emit a dualcase version of a two-case character 941 == static void bothcases(register struct parse *p, int ch); 942 * 943 * Boy, is this implementation ever a kludge... 944 */ 945 static void 946 bothcases(p, ch) 947 register struct parse *p; 948 int ch; 949 { 950 register char *oldnext = p->next; 951 register char *oldend = p->end; 952 char bracket[3]; 953 954 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 955 p->next = bracket; 956 p->end = bracket+2; 957 bracket[0] = ch; 958 bracket[1] = ']'; 959 bracket[2] = '\0'; 960 p_bracket(p); 961 assert(p->next == bracket+2); 962 p->next = oldnext; 963 p->end = oldend; 964 } 965 966 /* 967 - ordinary - emit an ordinary character 968 == static void ordinary(register struct parse *p, register int ch); 969 */ 970 static void 971 ordinary(p, ch) 972 register struct parse *p; 973 register int ch; 974 { 975 register cat_t *cap = p->g->categories; 976 977 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 978 bothcases(p, ch); 979 else { 980 EMIT(OCHAR, (unsigned char)ch); 981 if (cap[ch] == 0) 982 cap[ch] = p->g->ncategories++; 983 } 984 } 985 986 /* 987 - nonnewline - emit REG_NEWLINE version of OANY 988 == static void nonnewline(register struct parse *p); 989 * 990 * Boy, is this implementation ever a kludge... 991 */ 992 static void 993 nonnewline(p) 994 register struct parse *p; 995 { 996 register char *oldnext = p->next; 997 register char *oldend = p->end; 998 char bracket[4]; 999 1000 p->next = bracket; 1001 p->end = bracket+3; 1002 bracket[0] = '^'; 1003 bracket[1] = '\n'; 1004 bracket[2] = ']'; 1005 bracket[3] = '\0'; 1006 p_bracket(p); 1007 assert(p->next == bracket+3); 1008 p->next = oldnext; 1009 p->end = oldend; 1010 } 1011 1012 /* 1013 - repeat - generate code for a bounded repetition, recursively if needed 1014 == static void repeat(register struct parse *p, sopno start, int from, int to); 1015 */ 1016 static void 1017 repeat(p, start, from, to) 1018 register struct parse *p; 1019 sopno start; /* operand from here to end of strip */ 1020 int from; /* repeated from this number */ 1021 int to; /* to this number of times (maybe INFINITY) */ 1022 { 1023 register sopno finish = HERE(); 1024 # define N 2 1025 # define INF 3 1026 # define REP(f, t) ((f)*8 + (t)) 1027 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1028 register sopno copy; 1029 1030 if (p->error != 0) /* head off possible runaway recursion */ 1031 return; 1032 1033 assert(from <= to); 1034 1035 switch (REP(MAP(from), MAP(to))) { 1036 case REP(0, 0): /* must be user doing this */ 1037 DROP(finish-start); /* drop the operand */ 1038 break; 1039 case REP(0, 1): /* as x{1,1}? */ 1040 case REP(0, N): /* as x{1,n}? */ 1041 case REP(0, INF): /* as x{1,}? */ 1042 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1043 INSERT(OCH_, start); /* offset is wrong... */ 1044 repeat(p, start+1, 1, to); 1045 ASTERN(OOR1, start); 1046 AHEAD(start); /* ... fix it */ 1047 EMIT(OOR2, 0); 1048 AHEAD(THERE()); 1049 ASTERN(O_CH, THERETHERE()); 1050 break; 1051 case REP(1, 1): /* trivial case */ 1052 /* done */ 1053 break; 1054 case REP(1, N): /* as x?x{1,n-1} */ 1055 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1056 INSERT(OCH_, start); 1057 ASTERN(OOR1, start); 1058 AHEAD(start); 1059 EMIT(OOR2, 0); /* offset very wrong... */ 1060 AHEAD(THERE()); /* ...so fix it */ 1061 ASTERN(O_CH, THERETHERE()); 1062 copy = dupl(p, start+1, finish+1); 1063 assert(copy == finish+4); 1064 repeat(p, copy, 1, to-1); 1065 break; 1066 case REP(1, INF): /* as x+ */ 1067 INSERT(OPLUS_, start); 1068 ASTERN(O_PLUS, start); 1069 break; 1070 case REP(N, N): /* as xx{m-1,n-1} */ 1071 copy = dupl(p, start, finish); 1072 repeat(p, copy, from-1, to-1); 1073 break; 1074 case REP(N, INF): /* as xx{n-1,INF} */ 1075 copy = dupl(p, start, finish); 1076 repeat(p, copy, from-1, to); 1077 break; 1078 default: /* "can't happen" */ 1079 SETERROR(REG_ASSERT); /* just in case */ 1080 break; 1081 } 1082 } 1083 1084 /* 1085 - seterr - set an error condition 1086 == static int seterr(register struct parse *p, int e); 1087 */ 1088 static int /* useless but makes type checking happy */ 1089 seterr(p, e) 1090 register struct parse *p; 1091 int e; 1092 { 1093 if (p->error == 0) /* keep earliest error condition */ 1094 p->error = e; 1095 p->next = nuls; /* try to bring things to a halt */ 1096 p->end = nuls; 1097 return(0); /* make the return value well-defined */ 1098 } 1099 1100 /* 1101 - allocset - allocate a set of characters for [] 1102 == static cset *allocset(register struct parse *p); 1103 */ 1104 static cset * 1105 allocset(p) 1106 register struct parse *p; 1107 { 1108 register int no = p->g->ncsets++; 1109 register size_t nc; 1110 register size_t nbytes; 1111 register cset *cs; 1112 register size_t css = (size_t)p->g->csetsize; 1113 register int i; 1114 1115 if (no >= p->ncsalloc) { /* need another column of space */ 1116 p->ncsalloc += CHAR_BIT; 1117 nc = p->ncsalloc; 1118 assert(nc % CHAR_BIT == 0); 1119 nbytes = nc / CHAR_BIT * css; 1120 if (p->g->sets == NULL) 1121 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1122 else 1123 p->g->sets = (cset *)realloc((char *)p->g->sets, 1124 nc * sizeof(cset)); 1125 if (p->g->setbits == NULL) 1126 p->g->setbits = (uch *)malloc(nbytes); 1127 else { 1128 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1129 nbytes); 1130 /* xxx this isn't right if setbits is now NULL */ 1131 for (i = 0; i < no; i++) 1132 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1133 } 1134 if (p->g->sets != NULL && p->g->setbits != NULL) 1135 (void) memset((char *)p->g->setbits + (nbytes - css), 1136 0, css); 1137 else { 1138 no = 0; 1139 SETERROR(REG_ESPACE); 1140 /* caller's responsibility not to do set ops */ 1141 } 1142 } 1143 1144 assert(p->g->sets != NULL); /* xxx */ 1145 cs = &p->g->sets[no]; 1146 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1147 cs->mask = 1 << ((no) % CHAR_BIT); 1148 cs->hash = 0; 1149 cs->smultis = 0; 1150 cs->multis = NULL; 1151 1152 return(cs); 1153 } 1154 1155 /* 1156 - freeset - free a now-unused set 1157 == static void freeset(register struct parse *p, register cset *cs); 1158 */ 1159 static void 1160 freeset(p, cs) 1161 register struct parse *p; 1162 register cset *cs; 1163 { 1164 register int i; 1165 register cset *top = &p->g->sets[p->g->ncsets]; 1166 register size_t css = (size_t)p->g->csetsize; 1167 1168 for (i = 0; i < css; i++) 1169 CHsub(cs, i); 1170 if (cs == top-1) /* recover only the easy case */ 1171 p->g->ncsets--; 1172 } 1173 1174 /* 1175 - freezeset - final processing on a set of characters 1176 == static int freezeset(register struct parse *p, register cset *cs); 1177 * 1178 * The main task here is merging identical sets. This is usually a waste 1179 * of time (although the hash code minimizes the overhead), but can win 1180 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1181 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1182 * the same value! 1183 */ 1184 static int /* set number */ 1185 freezeset(p, cs) 1186 register struct parse *p; 1187 register cset *cs; 1188 { 1189 register uch h = cs->hash; 1190 register int i; 1191 register cset *top = &p->g->sets[p->g->ncsets]; 1192 register cset *cs2; 1193 register size_t css = (size_t)p->g->csetsize; 1194 1195 /* look for an earlier one which is the same */ 1196 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1197 if (cs2->hash == h && cs2 != cs) { 1198 /* maybe */ 1199 for (i = 0; i < css; i++) 1200 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1201 break; /* no */ 1202 if (i == css) 1203 break; /* yes */ 1204 } 1205 1206 if (cs2 < top) { /* found one */ 1207 freeset(p, cs); 1208 cs = cs2; 1209 } 1210 1211 return((int)(cs - p->g->sets)); 1212 } 1213 1214 /* 1215 - firstch - return first character in a set (which must have at least one) 1216 == static int firstch(register struct parse *p, register cset *cs); 1217 */ 1218 static int /* character; there is no "none" value */ 1219 firstch(p, cs) 1220 register struct parse *p; 1221 register cset *cs; 1222 { 1223 register int i; 1224 register size_t css = (size_t)p->g->csetsize; 1225 1226 for (i = 0; i < css; i++) 1227 if (CHIN(cs, i)) 1228 return((char)i); 1229 assert(never); 1230 return(0); /* arbitrary */ 1231 } 1232 1233 /* 1234 - nch - number of characters in a set 1235 == static int nch(register struct parse *p, register cset *cs); 1236 */ 1237 static int 1238 nch(p, cs) 1239 register struct parse *p; 1240 register cset *cs; 1241 { 1242 register int i; 1243 register size_t css = (size_t)p->g->csetsize; 1244 register int n = 0; 1245 1246 for (i = 0; i < css; i++) 1247 if (CHIN(cs, i)) 1248 n++; 1249 return(n); 1250 } 1251 1252 /* 1253 - mcadd - add a collating element to a cset 1254 == static void mcadd(register struct parse *p, register cset *cs, \ 1255 == register char *cp); 1256 */ 1257 static void 1258 mcadd(p, cs, cp) 1259 register struct parse *p; 1260 register cset *cs; 1261 register char *cp; 1262 { 1263 register size_t oldend = cs->smultis; 1264 1265 cs->smultis += strlen(cp) + 1; 1266 if (cs->multis == NULL) 1267 cs->multis = malloc(cs->smultis); 1268 else 1269 cs->multis = realloc(cs->multis, cs->smultis); 1270 if (cs->multis == NULL) { 1271 SETERROR(REG_ESPACE); 1272 return; 1273 } 1274 1275 (void) strcpy(cs->multis + oldend - 1, cp); 1276 cs->multis[cs->smultis - 1] = '\0'; 1277 } 1278 1279 #if 0 1280 /* 1281 - mcsub - subtract a collating element from a cset 1282 == static void mcsub(register cset *cs, register char *cp); 1283 */ 1284 static void 1285 mcsub(cs, cp) 1286 register cset *cs; 1287 register char *cp; 1288 { 1289 register char *fp = mcfind(cs, cp); 1290 register size_t len = strlen(fp); 1291 1292 assert(fp != NULL); 1293 (void) memmove(fp, fp + len + 1, 1294 cs->smultis - (fp + len + 1 - cs->multis)); 1295 cs->smultis -= len; 1296 1297 if (cs->smultis == 0) { 1298 free(cs->multis); 1299 cs->multis = NULL; 1300 return; 1301 } 1302 1303 cs->multis = realloc(cs->multis, cs->smultis); 1304 assert(cs->multis != NULL); 1305 } 1306 1307 /* 1308 - mcin - is a collating element in a cset? 1309 == static int mcin(register cset *cs, register char *cp); 1310 */ 1311 static int 1312 mcin(cs, cp) 1313 register cset *cs; 1314 register char *cp; 1315 { 1316 return(mcfind(cs, cp) != NULL); 1317 } 1318 1319 /* 1320 - mcfind - find a collating element in a cset 1321 == static char *mcfind(register cset *cs, register char *cp); 1322 */ 1323 static char * 1324 mcfind(cs, cp) 1325 register cset *cs; 1326 register char *cp; 1327 { 1328 register char *p; 1329 1330 if (cs->multis == NULL) 1331 return(NULL); 1332 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1333 if (strcmp(cp, p) == 0) 1334 return(p); 1335 return(NULL); 1336 } 1337 #endif 1338 1339 /* 1340 - mcinvert - invert the list of collating elements in a cset 1341 == static void mcinvert(register struct parse *p, register cset *cs); 1342 * 1343 * This would have to know the set of possibilities. Implementation 1344 * is deferred. 1345 */ 1346 static void 1347 mcinvert(p, cs) 1348 register struct parse *p; 1349 register cset *cs; 1350 { 1351 assert(cs->multis == NULL); /* xxx */ 1352 } 1353 1354 /* 1355 - mccase - add case counterparts of the list of collating elements in a cset 1356 == static void mccase(register struct parse *p, register cset *cs); 1357 * 1358 * This would have to know the set of possibilities. Implementation 1359 * is deferred. 1360 */ 1361 static void 1362 mccase(p, cs) 1363 register struct parse *p; 1364 register cset *cs; 1365 { 1366 assert(cs->multis == NULL); /* xxx */ 1367 } 1368 1369 /* 1370 - isinsets - is this character in any sets? 1371 == static int isinsets(register struct re_guts *g, int c); 1372 */ 1373 static int /* predicate */ 1374 isinsets(g, c) 1375 register struct re_guts *g; 1376 int c; 1377 { 1378 register uch *col; 1379 register int i; 1380 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1381 register unsigned uc = (unsigned char)c; 1382 1383 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1384 if (col[uc] != 0) 1385 return(1); 1386 return(0); 1387 } 1388 1389 /* 1390 - samesets - are these two characters in exactly the same sets? 1391 == static int samesets(register struct re_guts *g, int c1, int c2); 1392 */ 1393 static int /* predicate */ 1394 samesets(g, c1, c2) 1395 register struct re_guts *g; 1396 int c1; 1397 int c2; 1398 { 1399 register uch *col; 1400 register int i; 1401 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1402 register unsigned uc1 = (unsigned char)c1; 1403 register unsigned uc2 = (unsigned char)c2; 1404 1405 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1406 if (col[uc1] != col[uc2]) 1407 return(0); 1408 return(1); 1409 } 1410 1411 /* 1412 - categorize - sort out character categories 1413 == static void categorize(struct parse *p, register struct re_guts *g); 1414 */ 1415 static void 1416 categorize(p, g) 1417 struct parse *p; 1418 register struct re_guts *g; 1419 { 1420 register cat_t *cats = g->categories; 1421 register int c; 1422 register int c2; 1423 register cat_t cat; 1424 1425 /* avoid making error situations worse */ 1426 if (p->error != 0) 1427 return; 1428 1429 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1430 if (cats[c] == 0 && isinsets(g, c)) { 1431 cat = g->ncategories++; 1432 cats[c] = cat; 1433 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1434 if (cats[c2] == 0 && samesets(g, c, c2)) 1435 cats[c2] = cat; 1436 } 1437 } 1438 1439 /* 1440 - dupl - emit a duplicate of a bunch of sops 1441 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1442 */ 1443 static sopno /* start of duplicate */ 1444 dupl(p, start, finish) 1445 register struct parse *p; 1446 sopno start; /* from here */ 1447 sopno finish; /* to this less one */ 1448 { 1449 register sopno ret = HERE(); 1450 register sopno len = finish - start; 1451 1452 assert(finish >= start); 1453 if (len == 0) 1454 return(ret); 1455 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1456 assert(p->ssize >= p->slen + len); 1457 (void) memcpy((char *)(p->strip + p->slen), 1458 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1459 p->slen += len; 1460 return(ret); 1461 } 1462 1463 /* 1464 - doemit - emit a strip operator 1465 == static void doemit(register struct parse *p, sop op, size_t opnd); 1466 * 1467 * It might seem better to implement this as a macro with a function as 1468 * hard-case backup, but it's just too big and messy unless there are 1469 * some changes to the data structures. Maybe later. 1470 */ 1471 static void 1472 doemit(p, op, opnd) 1473 register struct parse *p; 1474 sop op; 1475 size_t opnd; 1476 { 1477 /* avoid making error situations worse */ 1478 if (p->error != 0) 1479 return; 1480 1481 /* deal with oversize operands ("can't happen", more or less) */ 1482 assert(opnd < 1<<OPSHIFT); 1483 1484 /* deal with undersized strip */ 1485 if (p->slen >= p->ssize) 1486 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1487 assert(p->slen < p->ssize); 1488 1489 /* finally, it's all reduced to the easy case */ 1490 p->strip[p->slen++] = SOP(op, opnd); 1491 } 1492 1493 /* 1494 - doinsert - insert a sop into the strip 1495 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1496 */ 1497 static void 1498 doinsert(p, op, opnd, pos) 1499 register struct parse *p; 1500 sop op; 1501 size_t opnd; 1502 sopno pos; 1503 { 1504 register sopno sn; 1505 register sop s; 1506 register int i; 1507 1508 /* avoid making error situations worse */ 1509 if (p->error != 0) 1510 return; 1511 1512 sn = HERE(); 1513 EMIT(op, opnd); /* do checks, ensure space */ 1514 assert(HERE() == sn+1); 1515 s = p->strip[sn]; 1516 1517 /* adjust paren pointers */ 1518 assert(pos > 0); 1519 for (i = 1; i < NPAREN; i++) { 1520 if (p->pbegin[i] >= pos) { 1521 p->pbegin[i]++; 1522 } 1523 if (p->pend[i] >= pos) { 1524 p->pend[i]++; 1525 } 1526 } 1527 1528 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1529 (HERE()-pos-1)*sizeof(sop)); 1530 p->strip[pos] = s; 1531 } 1532 1533 /* 1534 - dofwd - complete a forward reference 1535 == static void dofwd(register struct parse *p, sopno pos, sop value); 1536 */ 1537 static void 1538 dofwd(p, pos, value) 1539 register struct parse *p; 1540 register sopno pos; 1541 sop value; 1542 { 1543 /* avoid making error situations worse */ 1544 if (p->error != 0) 1545 return; 1546 1547 assert(value < 1<<OPSHIFT); 1548 p->strip[pos] = OP(p->strip[pos]) | value; 1549 } 1550 1551 /* 1552 - enlarge - enlarge the strip 1553 == static void enlarge(register struct parse *p, sopno size); 1554 */ 1555 static void 1556 enlarge(p, size) 1557 register struct parse *p; 1558 register sopno size; 1559 { 1560 register sop *sp; 1561 1562 if (p->ssize >= size) 1563 return; 1564 1565 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1566 if (sp == NULL) { 1567 SETERROR(REG_ESPACE); 1568 return; 1569 } 1570 p->strip = sp; 1571 p->ssize = size; 1572 } 1573 1574 /* 1575 - stripsnug - compact the strip 1576 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1577 */ 1578 static void 1579 stripsnug(p, g) 1580 register struct parse *p; 1581 register struct re_guts *g; 1582 { 1583 g->nstates = p->slen; 1584 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1585 if (g->strip == NULL) { 1586 SETERROR(REG_ESPACE); 1587 g->strip = p->strip; 1588 } 1589 } 1590 1591 /* 1592 - findmust - fill in must and mlen with longest mandatory literal string 1593 == static void findmust(register struct parse *p, register struct re_guts *g); 1594 * 1595 * This algorithm could do fancy things like analyzing the operands of | 1596 * for common subsequences. Someday. This code is simple and finds most 1597 * of the interesting cases. 1598 * 1599 * Note that must and mlen got initialized during setup. 1600 */ 1601 static void 1602 findmust(p, g) 1603 struct parse *p; 1604 register struct re_guts *g; 1605 { 1606 register sop *scan; 1607 sop *start = NULL; 1608 register sop *newstart = NULL; 1609 register sopno newlen; 1610 register sop s; 1611 register char *cp; 1612 register sopno i; 1613 1614 /* avoid making error situations worse */ 1615 if (p->error != 0) 1616 return; 1617 1618 /* find the longest OCHAR sequence in strip */ 1619 newlen = 0; 1620 scan = g->strip + 1; 1621 do { 1622 s = *scan++; 1623 switch (OP(s)) { 1624 case OCHAR: /* sequence member */ 1625 if (newlen == 0) /* new sequence */ 1626 newstart = scan - 1; 1627 newlen++; 1628 break; 1629 case OPLUS_: /* things that don't break one */ 1630 case OLPAREN: 1631 case ORPAREN: 1632 break; 1633 case OQUEST_: /* things that must be skipped */ 1634 case OCH_: 1635 scan--; 1636 do { 1637 scan += OPND(s); 1638 s = *scan; 1639 /* assert() interferes w debug printouts */ 1640 if (OP(s) != O_QUEST && OP(s) != O_CH && 1641 OP(s) != OOR2) { 1642 g->iflags |= BAD; 1643 return; 1644 } 1645 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1646 /* fallthrough */ 1647 default: /* things that break a sequence */ 1648 if (newlen > g->mlen) { /* ends one */ 1649 start = newstart; 1650 g->mlen = newlen; 1651 } 1652 newlen = 0; 1653 break; 1654 } 1655 } while (OP(s) != OEND); 1656 1657 if (g->mlen == 0) /* there isn't one */ 1658 return; 1659 1660 /* turn it into a character string */ 1661 g->must = malloc((size_t)g->mlen + 1); 1662 if (g->must == NULL) { /* argh; just forget it */ 1663 g->mlen = 0; 1664 return; 1665 } 1666 cp = g->must; 1667 scan = start; 1668 for (i = g->mlen; i > 0; i--) { 1669 while (OP(s = *scan++) != OCHAR) 1670 continue; 1671 assert(cp < g->must + g->mlen); 1672 *cp++ = (char)OPND(s); 1673 } 1674 assert(cp == g->must + g->mlen); 1675 *cp++ = '\0'; /* just on general principles */ 1676 } 1677 1678 /* 1679 - pluscount - count + nesting 1680 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1681 */ 1682 static sopno /* nesting depth */ 1683 pluscount(p, g) 1684 struct parse *p; 1685 register struct re_guts *g; 1686 { 1687 register sop *scan; 1688 register sop s; 1689 register sopno plusnest = 0; 1690 register sopno maxnest = 0; 1691 1692 if (p->error != 0) 1693 return(0); /* there may not be an OEND */ 1694 1695 scan = g->strip + 1; 1696 do { 1697 s = *scan++; 1698 switch (OP(s)) { 1699 case OPLUS_: 1700 plusnest++; 1701 break; 1702 case O_PLUS: 1703 if (plusnest > maxnest) 1704 maxnest = plusnest; 1705 plusnest--; 1706 break; 1707 } 1708 } while (OP(s) != OEND); 1709 if (plusnest != 0) 1710 g->iflags |= BAD; 1711 return(maxnest); 1712 } 1713