1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #if defined(LIBC_SCCS) && !defined(lint) 41 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42 #endif /* LIBC_SCCS and not lint */ 43 44 #include <sys/types.h> 45 #include <stdio.h> 46 #include <string.h> 47 #ifdef __minix_vmd 48 #include <bsd/asciictype.h> 49 #else 50 #include <ctype.h> 51 #endif 52 #include <limits.h> 53 #include <stdlib.h> 54 #include <regex.h> 55 56 #include "utils.h" 57 #include "regex2.h" 58 59 #include "cclass.h" 60 #include "cname.h" 61 62 /* 63 * parse structure, passed up and down to avoid global variables and 64 * other clumsinesses 65 */ 66 struct parse { 67 char *next; /* next character in RE */ 68 char *end; /* end of string (-> NUL normally) */ 69 int error; /* has an error been seen? */ 70 sop *strip; /* malloced strip */ 71 sopno ssize; /* malloced strip size (allocated) */ 72 sopno slen; /* malloced strip length (used) */ 73 int ncsalloc; /* number of csets allocated */ 74 struct re_guts *g; 75 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 76 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 77 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 78 }; 79 80 /* ========= begin header generated by ./mkh ========= */ 81 #ifdef __cplusplus 82 extern "C" { 83 #endif 84 85 /* === regcomp.c === */ 86 static void p_ere(struct parse *p, int stop); 87 static void p_ere_exp(struct parse *p); 88 static void p_str(struct parse *p); 89 static void p_bre(struct parse *p, int end1, int end2); 90 static int p_simp_re(struct parse *p, int starordinary); 91 static int p_count(struct parse *p); 92 static void p_bracket(struct parse *p); 93 static void p_b_term(struct parse *p, cset *cs); 94 static void p_b_cclass(struct parse *p, cset *cs); 95 static void p_b_eclass(struct parse *p, cset *cs); 96 static char p_b_symbol(struct parse *p); 97 static char p_b_coll_elem(struct parse *p, int endc); 98 static char othercase(int ch); 99 static void bothcases(struct parse *p, int ch); 100 static void ordinary(struct parse *p, int ch); 101 static void nonnewline(struct parse *p); 102 static void repeat(struct parse *p, sopno start, int from, int to); 103 static int seterr(struct parse *p, int e); 104 static cset *allocset(struct parse *p); 105 static void freeset(struct parse *p, cset *cs); 106 static int freezeset(struct parse *p, cset *cs); 107 static int firstch(struct parse *p, cset *cs); 108 static int nch(struct parse *p, cset *cs); 109 static void mcadd(struct parse *p, cset *cs, char *cp); 110 static void mcsub(cset *cs, char *cp); 111 static int mcin(cset *cs, char *cp); 112 static char *mcfind(cset *cs, char *cp); 113 static void mcinvert(struct parse *p, cset *cs); 114 static void mccase(struct parse *p, cset *cs); 115 static int isinsets(struct re_guts *g, int c); 116 static int samesets(struct re_guts *g, int c1, int c2); 117 static void categorize(struct parse *p, struct re_guts *g); 118 static sopno dupl(struct parse *p, sopno start, sopno finish); 119 static void doemit(struct parse *p, sop op, size_t opnd); 120 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 121 static void dofwd(struct parse *p, sopno pos, sop value); 122 static void enlarge(struct parse *p, sopno size); 123 static void stripsnug(struct parse *p, struct re_guts *g); 124 static void findmust(struct parse *p, struct re_guts *g); 125 static sopno pluscount(struct parse *p, struct re_guts *g); 126 127 #ifdef __cplusplus 128 } 129 #endif 130 /* ========= end header generated by ./mkh ========= */ 131 132 static char nuls[10]; /* place to point scanner in event of error */ 133 134 /* 135 * macros for use with parse structure 136 * BEWARE: these know that the parse structure is named `p' !!! 137 */ 138 #define PEEK() (*p->next) 139 #define PEEK2() (*(p->next+1)) 140 #define MORE() (p->next < p->end) 141 #define MORE2() (p->next+1 < p->end) 142 #define SEE(c) (MORE() && PEEK() == (c)) 143 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 144 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 145 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 146 #define NEXT() (p->next++) 147 #define NEXT2() (p->next += 2) 148 #define NEXTn(n) (p->next += (n)) 149 #define GETNEXT() (*p->next++) 150 #define SETERROR(e) seterr(p, (e)) 151 #define REQUIRE(co, e) ((co) || SETERROR(e)) 152 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 153 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 154 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 155 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 156 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 157 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 158 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 159 #define HERE() (p->slen) 160 #define THERE() (p->slen - 1) 161 #define THERETHERE() (p->slen - 2) 162 #define DROP(n) (p->slen -= (n)) 163 164 #ifndef NDEBUG 165 static int never = 0; /* for use in asserts; shuts lint up */ 166 #else 167 #define never 0 /* some <assert.h>s have bugs too */ 168 #endif 169 170 /* 171 - regcomp - interface for parser and compilation 172 = extern int regcomp(regex_t *, const char *, int); 173 = #define REG_BASIC 0000 174 = #define REG_EXTENDED 0001 175 = #define REG_ICASE 0002 176 = #define REG_NOSUB 0004 177 = #define REG_NEWLINE 0010 178 = #define REG_NOSPEC 0020 179 = #define REG_PEND 0040 180 = #define REG_DUMP 0200 181 */ 182 int /* 0 success, otherwise REG_something */ 183 regcomp(preg, pattern, cflags) 184 regex_t *preg; 185 const char *pattern; 186 int cflags; 187 { 188 struct parse pa; 189 register struct re_guts *g; 190 register struct parse *p = &pa; 191 register int i; 192 register size_t len; 193 #ifdef REDEBUG 194 # define GOODFLAGS(f) (f) 195 #else 196 # define GOODFLAGS(f) ((f)&~REG_DUMP) 197 #endif 198 199 cflags = GOODFLAGS(cflags); 200 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 201 return(REG_INVARG); 202 203 if (cflags®_PEND) { 204 if (preg->re_endp < pattern) 205 return(REG_INVARG); 206 len = preg->re_endp - pattern; 207 } else 208 len = strlen((char *)pattern); 209 210 /* do the mallocs early so failure handling is easy */ 211 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 212 (NC-1)*sizeof(cat_t)); 213 if (g == NULL) 214 return(REG_ESPACE); 215 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 216 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 217 p->slen = 0; 218 if (p->strip == NULL) { 219 free((char *)g); 220 return(REG_ESPACE); 221 } 222 223 /* set things up */ 224 p->g = g; 225 p->next = (char *)pattern; /* convenience; we do not modify it */ 226 p->end = p->next + len; 227 p->error = 0; 228 p->ncsalloc = 0; 229 for (i = 0; i < NPAREN; i++) { 230 p->pbegin[i] = 0; 231 p->pend[i] = 0; 232 } 233 g->csetsize = NC; 234 g->sets = NULL; 235 g->setbits = NULL; 236 g->ncsets = 0; 237 g->cflags = cflags; 238 g->iflags = 0; 239 g->nbol = 0; 240 g->neol = 0; 241 g->must = NULL; 242 g->mlen = 0; 243 g->nsub = 0; 244 g->ncategories = 1; /* category 0 is "everything else" */ 245 g->categories = &g->catspace[-(CHAR_MIN)]; 246 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 247 g->backrefs = 0; 248 249 /* do it */ 250 EMIT(OEND, 0); 251 g->firststate = THERE(); 252 if (cflags®_EXTENDED) 253 p_ere(p, OUT); 254 else if (cflags®_NOSPEC) 255 p_str(p); 256 else 257 p_bre(p, OUT, OUT); 258 EMIT(OEND, 0); 259 g->laststate = THERE(); 260 261 /* tidy up loose ends and fill things in */ 262 categorize(p, g); 263 stripsnug(p, g); 264 findmust(p, g); 265 g->nplus = pluscount(p, g); 266 g->magic = MAGIC2; 267 preg->re_nsub = g->nsub; 268 preg->re_g = g; 269 preg->re_magic = MAGIC1; 270 #ifndef REDEBUG 271 /* not debugging, so can't rely on the assert() in regexec() */ 272 if (g->iflags&BAD) 273 SETERROR(REG_ASSERT); 274 #endif 275 276 /* win or lose, we're done */ 277 if (p->error != 0) /* lose */ 278 regfree(preg); 279 return(p->error); 280 } 281 282 /* 283 - p_ere - ERE parser top level, concatenation and alternation 284 == static void p_ere(register struct parse *p, int stop); 285 */ 286 static void 287 p_ere(p, stop) 288 register struct parse *p; 289 int stop; /* character this ERE should end at */ 290 { 291 register char c; 292 register sopno prevback; 293 register sopno prevfwd; 294 register sopno conc; 295 register int first = 1; /* is this the first alternative? */ 296 297 for (;;) { 298 /* do a bunch of concatenated expressions */ 299 conc = HERE(); 300 while (MORE() && (c = PEEK()) != '|' && c != stop) 301 p_ere_exp(p); 302 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 303 304 if (!EAT('|')) 305 break; /* NOTE BREAK OUT */ 306 307 if (first) { 308 INSERT(OCH_, conc); /* offset is wrong */ 309 prevfwd = conc; 310 prevback = conc; 311 first = 0; 312 } 313 ASTERN(OOR1, prevback); 314 prevback = THERE(); 315 AHEAD(prevfwd); /* fix previous offset */ 316 prevfwd = HERE(); 317 EMIT(OOR2, 0); /* offset is very wrong */ 318 } 319 320 if (!first) { /* tail-end fixups */ 321 AHEAD(prevfwd); 322 ASTERN(O_CH, prevback); 323 } 324 325 assert(!MORE() || SEE(stop)); 326 } 327 328 /* 329 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 330 == static void p_ere_exp(register struct parse *p); 331 */ 332 static void 333 p_ere_exp(p) 334 register struct parse *p; 335 { 336 register char c; 337 register sopno pos; 338 register int count; 339 register int count2; 340 register sopno subno; 341 int wascaret = 0; 342 343 assert(MORE()); /* caller should have ensured this */ 344 c = GETNEXT(); 345 346 pos = HERE(); 347 switch (c) { 348 case '(': 349 REQUIRE(MORE(), REG_EPAREN); 350 p->g->nsub++; 351 subno = p->g->nsub; 352 if (subno < NPAREN) 353 p->pbegin[subno] = HERE(); 354 EMIT(OLPAREN, subno); 355 if (!SEE(')')) 356 p_ere(p, ')'); 357 if (subno < NPAREN) { 358 p->pend[subno] = HERE(); 359 assert(p->pend[subno] != 0); 360 } 361 EMIT(ORPAREN, subno); 362 MUSTEAT(')', REG_EPAREN); 363 break; 364 #ifndef POSIX_MISTAKE 365 case ')': /* happens only if no current unmatched ( */ 366 /* 367 * You may ask, why the ifndef? Because I didn't notice 368 * this until slightly too late for 1003.2, and none of the 369 * other 1003.2 regular-expression reviewers noticed it at 370 * all. So an unmatched ) is legal POSIX, at least until 371 * we can get it fixed. 372 */ 373 SETERROR(REG_EPAREN); 374 break; 375 #endif 376 case '^': 377 EMIT(OBOL, 0); 378 p->g->iflags |= USEBOL; 379 p->g->nbol++; 380 wascaret = 1; 381 break; 382 case '$': 383 EMIT(OEOL, 0); 384 p->g->iflags |= USEEOL; 385 p->g->neol++; 386 break; 387 case '|': 388 SETERROR(REG_EMPTY); 389 break; 390 case '*': 391 case '+': 392 case '?': 393 SETERROR(REG_BADRPT); 394 break; 395 case '.': 396 if (p->g->cflags®_NEWLINE) 397 nonnewline(p); 398 else 399 EMIT(OANY, 0); 400 break; 401 case '[': 402 p_bracket(p); 403 break; 404 case '\\': 405 REQUIRE(MORE(), REG_EESCAPE); 406 c = GETNEXT(); 407 ordinary(p, c); 408 break; 409 case '{': /* okay as ordinary except if digit follows */ 410 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 411 /* FALLTHROUGH */ 412 default: 413 ordinary(p, c); 414 break; 415 } 416 417 if (!MORE()) 418 return; 419 c = PEEK(); 420 /* we call { a repetition if followed by a digit */ 421 if (!( c == '*' || c == '+' || c == '?' || 422 (c == '{' && MORE2() && isdigit(PEEK2())) )) 423 return; /* no repetition, we're done */ 424 NEXT(); 425 426 REQUIRE(!wascaret, REG_BADRPT); 427 switch (c) { 428 case '*': /* implemented as +? */ 429 /* this case does not require the (y|) trick, noKLUDGE */ 430 INSERT(OPLUS_, pos); 431 ASTERN(O_PLUS, pos); 432 INSERT(OQUEST_, pos); 433 ASTERN(O_QUEST, pos); 434 break; 435 case '+': 436 INSERT(OPLUS_, pos); 437 ASTERN(O_PLUS, pos); 438 break; 439 case '?': 440 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 441 INSERT(OCH_, pos); /* offset slightly wrong */ 442 ASTERN(OOR1, pos); /* this one's right */ 443 AHEAD(pos); /* fix the OCH_ */ 444 EMIT(OOR2, 0); /* offset very wrong... */ 445 AHEAD(THERE()); /* ...so fix it */ 446 ASTERN(O_CH, THERETHERE()); 447 break; 448 case '{': 449 count = p_count(p); 450 if (EAT(',')) { 451 if (isdigit(PEEK())) { 452 count2 = p_count(p); 453 REQUIRE(count <= count2, REG_BADBR); 454 } else /* single number with comma */ 455 count2 = INFINITY; 456 } else /* just a single number */ 457 count2 = count; 458 repeat(p, pos, count, count2); 459 if (!EAT('}')) { /* error heuristics */ 460 while (MORE() && PEEK() != '}') 461 NEXT(); 462 REQUIRE(MORE(), REG_EBRACE); 463 SETERROR(REG_BADBR); 464 } 465 break; 466 } 467 468 if (!MORE()) 469 return; 470 c = PEEK(); 471 if (!( c == '*' || c == '+' || c == '?' || 472 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 473 return; 474 SETERROR(REG_BADRPT); 475 } 476 477 /* 478 - p_str - string (no metacharacters) "parser" 479 == static void p_str(register struct parse *p); 480 */ 481 static void 482 p_str(p) 483 register struct parse *p; 484 { 485 REQUIRE(MORE(), REG_EMPTY); 486 while (MORE()) 487 ordinary(p, GETNEXT()); 488 } 489 490 /* 491 - p_bre - BRE parser top level, anchoring and concatenation 492 == static void p_bre(register struct parse *p, register int end1, \ 493 == register int end2); 494 * Giving end1 as OUT essentially eliminates the end1/end2 check. 495 * 496 * This implementation is a bit of a kludge, in that a trailing $ is first 497 * taken as an ordinary character and then revised to be an anchor. The 498 * only undesirable side effect is that '$' gets included as a character 499 * category in such cases. This is fairly harmless; not worth fixing. 500 * The amount of lookahead needed to avoid this kludge is excessive. 501 */ 502 static void 503 p_bre(p, end1, end2) 504 register struct parse *p; 505 register int end1; /* first terminating character */ 506 register int end2; /* second terminating character */ 507 { 508 register sopno start = HERE(); 509 register int first = 1; /* first subexpression? */ 510 register int wasdollar = 0; 511 512 if (EAT('^')) { 513 EMIT(OBOL, 0); 514 p->g->iflags |= USEBOL; 515 p->g->nbol++; 516 } 517 while (MORE() && !SEETWO(end1, end2)) { 518 wasdollar = p_simp_re(p, first); 519 first = 0; 520 } 521 if (wasdollar) { /* oops, that was a trailing anchor */ 522 DROP(1); 523 EMIT(OEOL, 0); 524 p->g->iflags |= USEEOL; 525 p->g->neol++; 526 } 527 528 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 529 } 530 531 /* 532 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 533 == static int p_simp_re(register struct parse *p, int starordinary); 534 */ 535 static int /* was the simple RE an unbackslashed $? */ 536 p_simp_re(p, starordinary) 537 register struct parse *p; 538 int starordinary; /* is a leading * an ordinary character? */ 539 { 540 register int c; 541 register int count; 542 register int count2; 543 register sopno pos; 544 register int i; 545 register sopno subno; 546 # define BACKSL (1<<CHAR_BIT) 547 548 pos = HERE(); /* repetion op, if any, covers from here */ 549 550 assert(MORE()); /* caller should have ensured this */ 551 c = GETNEXT(); 552 if (c == '\\') { 553 REQUIRE(MORE(), REG_EESCAPE); 554 c = BACKSL | (unsigned char)GETNEXT(); 555 } 556 switch (c) { 557 case '.': 558 if (p->g->cflags®_NEWLINE) 559 nonnewline(p); 560 else 561 EMIT(OANY, 0); 562 break; 563 case '[': 564 p_bracket(p); 565 break; 566 case BACKSL|'{': 567 SETERROR(REG_BADRPT); 568 break; 569 case BACKSL|'(': 570 p->g->nsub++; 571 subno = p->g->nsub; 572 if (subno < NPAREN) 573 p->pbegin[subno] = HERE(); 574 EMIT(OLPAREN, subno); 575 /* the MORE here is an error heuristic */ 576 if (MORE() && !SEETWO('\\', ')')) 577 p_bre(p, '\\', ')'); 578 if (subno < NPAREN) { 579 p->pend[subno] = HERE(); 580 assert(p->pend[subno] != 0); 581 } 582 EMIT(ORPAREN, subno); 583 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 584 break; 585 case BACKSL|')': /* should not get here -- must be user */ 586 case BACKSL|'}': 587 SETERROR(REG_EPAREN); 588 break; 589 case BACKSL|'1': 590 case BACKSL|'2': 591 case BACKSL|'3': 592 case BACKSL|'4': 593 case BACKSL|'5': 594 case BACKSL|'6': 595 case BACKSL|'7': 596 case BACKSL|'8': 597 case BACKSL|'9': 598 i = (c&~BACKSL) - '0'; 599 assert(i < NPAREN); 600 if (p->pend[i] != 0) { 601 assert(i <= p->g->nsub); 602 EMIT(OBACK_, i); 603 assert(p->pbegin[i] != 0); 604 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 605 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 606 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 607 EMIT(O_BACK, i); 608 } else 609 SETERROR(REG_ESUBREG); 610 p->g->backrefs = 1; 611 break; 612 case '*': 613 REQUIRE(starordinary, REG_BADRPT); 614 /* FALLTHROUGH */ 615 default: 616 ordinary(p, c &~ BACKSL); 617 break; 618 } 619 620 if (EAT('*')) { /* implemented as +? */ 621 /* this case does not require the (y|) trick, noKLUDGE */ 622 INSERT(OPLUS_, pos); 623 ASTERN(O_PLUS, pos); 624 INSERT(OQUEST_, pos); 625 ASTERN(O_QUEST, pos); 626 } else if (EATTWO('\\', '{')) { 627 count = p_count(p); 628 if (EAT(',')) { 629 if (MORE() && isdigit(PEEK())) { 630 count2 = p_count(p); 631 REQUIRE(count <= count2, REG_BADBR); 632 } else /* single number with comma */ 633 count2 = INFINITY; 634 } else /* just a single number */ 635 count2 = count; 636 repeat(p, pos, count, count2); 637 if (!EATTWO('\\', '}')) { /* error heuristics */ 638 while (MORE() && !SEETWO('\\', '}')) 639 NEXT(); 640 REQUIRE(MORE(), REG_EBRACE); 641 SETERROR(REG_BADBR); 642 } 643 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 644 return(1); 645 646 return(0); 647 } 648 649 /* 650 - p_count - parse a repetition count 651 == static int p_count(register struct parse *p); 652 */ 653 static int /* the value */ 654 p_count(p) 655 register struct parse *p; 656 { 657 register int count = 0; 658 register int ndigits = 0; 659 660 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 661 count = count*10 + (GETNEXT() - '0'); 662 ndigits++; 663 } 664 665 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 666 return(count); 667 } 668 669 /* 670 - p_bracket - parse a bracketed character list 671 == static void p_bracket(register struct parse *p); 672 * 673 * Note a significant property of this code: if the allocset() did SETERROR, 674 * no set operations are done. 675 */ 676 static void 677 p_bracket(p) 678 register struct parse *p; 679 { 680 register char c; 681 register cset *cs = allocset(p); 682 register int invert = 0; 683 684 /* Dept of Truly Sickening Special-Case Kludges */ 685 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 686 EMIT(OBOW, 0); 687 NEXTn(6); 688 return; 689 } 690 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 691 EMIT(OEOW, 0); 692 NEXTn(6); 693 return; 694 } 695 696 if (EAT('^')) 697 invert++; /* make note to invert set at end */ 698 if (EAT(']')) 699 CHadd(cs, ']'); 700 else if (EAT('-')) 701 CHadd(cs, '-'); 702 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 703 p_b_term(p, cs); 704 if (EAT('-')) 705 CHadd(cs, '-'); 706 MUSTEAT(']', REG_EBRACK); 707 708 if (p->error != 0) /* don't mess things up further */ 709 return; 710 711 if (p->g->cflags®_ICASE) { 712 register int i; 713 register int ci; 714 715 for (i = p->g->csetsize - 1; i >= 0; i--) 716 if (CHIN(cs, i) && isalpha(i)) { 717 ci = othercase(i); 718 if (ci != i) 719 CHadd(cs, ci); 720 } 721 if (cs->multis != NULL) 722 mccase(p, cs); 723 } 724 if (invert) { 725 register int i; 726 727 for (i = p->g->csetsize - 1; i >= 0; i--) 728 if (CHIN(cs, i)) 729 CHsub(cs, i); 730 else 731 CHadd(cs, i); 732 if (p->g->cflags®_NEWLINE) 733 CHsub(cs, '\n'); 734 if (cs->multis != NULL) 735 mcinvert(p, cs); 736 } 737 738 assert(cs->multis == NULL); /* xxx */ 739 740 if (nch(p, cs) == 1) { /* optimize singleton sets */ 741 ordinary(p, firstch(p, cs)); 742 freeset(p, cs); 743 } else 744 EMIT(OANYOF, freezeset(p, cs)); 745 } 746 747 /* 748 - p_b_term - parse one term of a bracketed character list 749 == static void p_b_term(register struct parse *p, register cset *cs); 750 */ 751 static void 752 p_b_term(p, cs) 753 register struct parse *p; 754 register cset *cs; 755 { 756 register char c; 757 register char start, finish; 758 register int i; 759 760 /* classify what we've got */ 761 switch ((MORE()) ? PEEK() : '\0') { 762 case '[': 763 c = (MORE2()) ? PEEK2() : '\0'; 764 break; 765 case '-': 766 SETERROR(REG_ERANGE); 767 return; /* NOTE RETURN */ 768 break; 769 default: 770 c = '\0'; 771 break; 772 } 773 774 switch (c) { 775 case ':': /* character class */ 776 NEXT2(); 777 REQUIRE(MORE(), REG_EBRACK); 778 c = PEEK(); 779 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 780 p_b_cclass(p, cs); 781 REQUIRE(MORE(), REG_EBRACK); 782 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 783 break; 784 case '=': /* equivalence class */ 785 NEXT2(); 786 REQUIRE(MORE(), REG_EBRACK); 787 c = PEEK(); 788 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 789 p_b_eclass(p, cs); 790 REQUIRE(MORE(), REG_EBRACK); 791 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 792 break; 793 default: /* symbol, ordinary character, or range */ 794 /* xxx revision needed for multichar stuff */ 795 start = p_b_symbol(p); 796 if (SEE('-') && MORE2() && PEEK2() != ']') { 797 /* range */ 798 NEXT(); 799 if (EAT('-')) 800 finish = '-'; 801 else 802 finish = p_b_symbol(p); 803 } else 804 finish = start; 805 /* xxx what about signed chars here... */ 806 REQUIRE(start <= finish, REG_ERANGE); 807 for (i = start; i <= finish; i++) 808 CHadd(cs, i); 809 break; 810 } 811 } 812 813 /* 814 - p_b_cclass - parse a character-class name and deal with it 815 == static void p_b_cclass(register struct parse *p, register cset *cs); 816 */ 817 static void 818 p_b_cclass(p, cs) 819 register struct parse *p; 820 register cset *cs; 821 { 822 register char *sp = p->next; 823 register struct cclass *cp; 824 register size_t len; 825 register char *u; 826 register char c; 827 828 while (MORE() && isalpha(PEEK())) 829 NEXT(); 830 len = p->next - sp; 831 for (cp = cclasses; cp->name != NULL; cp++) 832 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 833 break; 834 if (cp->name == NULL) { 835 /* oops, didn't find it */ 836 SETERROR(REG_ECTYPE); 837 return; 838 } 839 840 u = cp->chars; 841 while ((c = *u++) != '\0') 842 CHadd(cs, c); 843 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 844 MCadd(p, cs, u); 845 } 846 847 /* 848 - p_b_eclass - parse an equivalence-class name and deal with it 849 == static void p_b_eclass(register struct parse *p, register cset *cs); 850 * 851 * This implementation is incomplete. xxx 852 */ 853 static void 854 p_b_eclass(p, cs) 855 register struct parse *p; 856 register cset *cs; 857 { 858 register char c; 859 860 c = p_b_coll_elem(p, '='); 861 CHadd(cs, c); 862 } 863 864 /* 865 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 866 == static char p_b_symbol(register struct parse *p); 867 */ 868 static char /* value of symbol */ 869 p_b_symbol(p) 870 register struct parse *p; 871 { 872 register char value; 873 874 REQUIRE(MORE(), REG_EBRACK); 875 if (!EATTWO('[', '.')) 876 return(GETNEXT()); 877 878 /* collating symbol */ 879 value = p_b_coll_elem(p, '.'); 880 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 881 return(value); 882 } 883 884 /* 885 - p_b_coll_elem - parse a collating-element name and look it up 886 == static char p_b_coll_elem(register struct parse *p, int endc); 887 */ 888 static char /* value of collating element */ 889 p_b_coll_elem(p, endc) 890 register struct parse *p; 891 int endc; /* name ended by endc,']' */ 892 { 893 register char *sp = p->next; 894 register struct cname *cp; 895 register int len; 896 register char c; 897 898 while (MORE() && !SEETWO(endc, ']')) 899 NEXT(); 900 if (!MORE()) { 901 SETERROR(REG_EBRACK); 902 return(0); 903 } 904 len = p->next - sp; 905 for (cp = cnames; cp->name != NULL; cp++) 906 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 907 return(cp->code); /* known name */ 908 if (len == 1) 909 return(*sp); /* single character */ 910 SETERROR(REG_ECOLLATE); /* neither */ 911 return(0); 912 } 913 914 /* 915 - othercase - return the case counterpart of an alphabetic 916 == static char othercase(int ch); 917 */ 918 static char /* if no counterpart, return ch */ 919 othercase(ch) 920 int ch; 921 { 922 assert(isalpha(ch)); 923 if (isupper(ch)) 924 return(tolower(ch)); 925 else if (islower(ch)) 926 return(toupper(ch)); 927 else /* peculiar, but could happen */ 928 return(ch); 929 } 930 931 /* 932 - bothcases - emit a dualcase version of a two-case character 933 == static void bothcases(register struct parse *p, int ch); 934 * 935 * Boy, is this implementation ever a kludge... 936 */ 937 static void 938 bothcases(p, ch) 939 register struct parse *p; 940 int ch; 941 { 942 register char *oldnext = p->next; 943 register char *oldend = p->end; 944 char bracket[3]; 945 946 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 947 p->next = bracket; 948 p->end = bracket+2; 949 bracket[0] = ch; 950 bracket[1] = ']'; 951 bracket[2] = '\0'; 952 p_bracket(p); 953 assert(p->next == bracket+2); 954 p->next = oldnext; 955 p->end = oldend; 956 } 957 958 /* 959 - ordinary - emit an ordinary character 960 == static void ordinary(register struct parse *p, register int ch); 961 */ 962 static void 963 ordinary(p, ch) 964 register struct parse *p; 965 register int ch; 966 { 967 register cat_t *cap = p->g->categories; 968 969 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 970 bothcases(p, ch); 971 else { 972 EMIT(OCHAR, (unsigned char)ch); 973 if (cap[ch] == 0) 974 cap[ch] = p->g->ncategories++; 975 } 976 } 977 978 /* 979 - nonnewline - emit REG_NEWLINE version of OANY 980 == static void nonnewline(register struct parse *p); 981 * 982 * Boy, is this implementation ever a kludge... 983 */ 984 static void 985 nonnewline(p) 986 register struct parse *p; 987 { 988 register char *oldnext = p->next; 989 register char *oldend = p->end; 990 char bracket[4]; 991 992 p->next = bracket; 993 p->end = bracket+3; 994 bracket[0] = '^'; 995 bracket[1] = '\n'; 996 bracket[2] = ']'; 997 bracket[3] = '\0'; 998 p_bracket(p); 999 assert(p->next == bracket+3); 1000 p->next = oldnext; 1001 p->end = oldend; 1002 } 1003 1004 /* 1005 - repeat - generate code for a bounded repetition, recursively if needed 1006 == static void repeat(register struct parse *p, sopno start, int from, int to); 1007 */ 1008 static void 1009 repeat(p, start, from, to) 1010 register struct parse *p; 1011 sopno start; /* operand from here to end of strip */ 1012 int from; /* repeated from this number */ 1013 int to; /* to this number of times (maybe INFINITY) */ 1014 { 1015 register sopno finish = HERE(); 1016 # define N 2 1017 # define INF 3 1018 # define REP(f, t) ((f)*8 + (t)) 1019 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1020 register sopno copy; 1021 1022 if (p->error != 0) /* head off possible runaway recursion */ 1023 return; 1024 1025 assert(from <= to); 1026 1027 switch (REP(MAP(from), MAP(to))) { 1028 case REP(0, 0): /* must be user doing this */ 1029 DROP(finish-start); /* drop the operand */ 1030 break; 1031 case REP(0, 1): /* as x{1,1}? */ 1032 case REP(0, N): /* as x{1,n}? */ 1033 case REP(0, INF): /* as x{1,}? */ 1034 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1035 INSERT(OCH_, start); /* offset is wrong... */ 1036 repeat(p, start+1, 1, to); 1037 ASTERN(OOR1, start); 1038 AHEAD(start); /* ... fix it */ 1039 EMIT(OOR2, 0); 1040 AHEAD(THERE()); 1041 ASTERN(O_CH, THERETHERE()); 1042 break; 1043 case REP(1, 1): /* trivial case */ 1044 /* done */ 1045 break; 1046 case REP(1, N): /* as x?x{1,n-1} */ 1047 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1048 INSERT(OCH_, start); 1049 ASTERN(OOR1, start); 1050 AHEAD(start); 1051 EMIT(OOR2, 0); /* offset very wrong... */ 1052 AHEAD(THERE()); /* ...so fix it */ 1053 ASTERN(O_CH, THERETHERE()); 1054 copy = dupl(p, start+1, finish+1); 1055 assert(copy == finish+4); 1056 repeat(p, copy, 1, to-1); 1057 break; 1058 case REP(1, INF): /* as x+ */ 1059 INSERT(OPLUS_, start); 1060 ASTERN(O_PLUS, start); 1061 break; 1062 case REP(N, N): /* as xx{m-1,n-1} */ 1063 copy = dupl(p, start, finish); 1064 repeat(p, copy, from-1, to-1); 1065 break; 1066 case REP(N, INF): /* as xx{n-1,INF} */ 1067 copy = dupl(p, start, finish); 1068 repeat(p, copy, from-1, to); 1069 break; 1070 default: /* "can't happen" */ 1071 SETERROR(REG_ASSERT); /* just in case */ 1072 break; 1073 } 1074 } 1075 1076 /* 1077 - seterr - set an error condition 1078 == static int seterr(register struct parse *p, int e); 1079 */ 1080 static int /* useless but makes type checking happy */ 1081 seterr(p, e) 1082 register struct parse *p; 1083 int e; 1084 { 1085 if (p->error == 0) /* keep earliest error condition */ 1086 p->error = e; 1087 p->next = nuls; /* try to bring things to a halt */ 1088 p->end = nuls; 1089 return(0); /* make the return value well-defined */ 1090 } 1091 1092 /* 1093 - allocset - allocate a set of characters for [] 1094 == static cset *allocset(register struct parse *p); 1095 */ 1096 static cset * 1097 allocset(p) 1098 register struct parse *p; 1099 { 1100 register int no = p->g->ncsets++; 1101 register size_t nc; 1102 register size_t nbytes; 1103 register cset *cs; 1104 register size_t css = (size_t)p->g->csetsize; 1105 register int i; 1106 1107 if (no >= p->ncsalloc) { /* need another column of space */ 1108 p->ncsalloc += CHAR_BIT; 1109 nc = p->ncsalloc; 1110 assert(nc % CHAR_BIT == 0); 1111 nbytes = nc / CHAR_BIT * css; 1112 if (p->g->sets == NULL) 1113 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1114 else 1115 p->g->sets = (cset *)realloc((char *)p->g->sets, 1116 nc * sizeof(cset)); 1117 if (p->g->setbits == NULL) 1118 p->g->setbits = (uch *)malloc(nbytes); 1119 else { 1120 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1121 nbytes); 1122 /* xxx this isn't right if setbits is now NULL */ 1123 for (i = 0; i < no; i++) 1124 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1125 } 1126 if (p->g->sets != NULL && p->g->setbits != NULL) 1127 (void) memset((char *)p->g->setbits + (nbytes - css), 1128 0, css); 1129 else { 1130 no = 0; 1131 SETERROR(REG_ESPACE); 1132 /* caller's responsibility not to do set ops */ 1133 } 1134 } 1135 1136 assert(p->g->sets != NULL); /* xxx */ 1137 cs = &p->g->sets[no]; 1138 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1139 cs->mask = 1 << ((no) % CHAR_BIT); 1140 cs->hash = 0; 1141 cs->smultis = 0; 1142 cs->multis = NULL; 1143 1144 return(cs); 1145 } 1146 1147 /* 1148 - freeset - free a now-unused set 1149 == static void freeset(register struct parse *p, register cset *cs); 1150 */ 1151 static void 1152 freeset(p, cs) 1153 register struct parse *p; 1154 register cset *cs; 1155 { 1156 register int i; 1157 register cset *top = &p->g->sets[p->g->ncsets]; 1158 register size_t css = (size_t)p->g->csetsize; 1159 1160 for (i = 0; i < css; i++) 1161 CHsub(cs, i); 1162 if (cs == top-1) /* recover only the easy case */ 1163 p->g->ncsets--; 1164 } 1165 1166 /* 1167 - freezeset - final processing on a set of characters 1168 == static int freezeset(register struct parse *p, register cset *cs); 1169 * 1170 * The main task here is merging identical sets. This is usually a waste 1171 * of time (although the hash code minimizes the overhead), but can win 1172 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1173 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1174 * the same value! 1175 */ 1176 static int /* set number */ 1177 freezeset(p, cs) 1178 register struct parse *p; 1179 register cset *cs; 1180 { 1181 register uch h = cs->hash; 1182 register int i; 1183 register cset *top = &p->g->sets[p->g->ncsets]; 1184 register cset *cs2; 1185 register size_t css = (size_t)p->g->csetsize; 1186 1187 /* look for an earlier one which is the same */ 1188 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1189 if (cs2->hash == h && cs2 != cs) { 1190 /* maybe */ 1191 for (i = 0; i < css; i++) 1192 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1193 break; /* no */ 1194 if (i == css) 1195 break; /* yes */ 1196 } 1197 1198 if (cs2 < top) { /* found one */ 1199 freeset(p, cs); 1200 cs = cs2; 1201 } 1202 1203 return((int)(cs - p->g->sets)); 1204 } 1205 1206 /* 1207 - firstch - return first character in a set (which must have at least one) 1208 == static int firstch(register struct parse *p, register cset *cs); 1209 */ 1210 static int /* character; there is no "none" value */ 1211 firstch(p, cs) 1212 register struct parse *p; 1213 register cset *cs; 1214 { 1215 register int i; 1216 register size_t css = (size_t)p->g->csetsize; 1217 1218 for (i = 0; i < css; i++) 1219 if (CHIN(cs, i)) 1220 return((char)i); 1221 assert(never); 1222 return(0); /* arbitrary */ 1223 } 1224 1225 /* 1226 - nch - number of characters in a set 1227 == static int nch(register struct parse *p, register cset *cs); 1228 */ 1229 static int 1230 nch(p, cs) 1231 register struct parse *p; 1232 register cset *cs; 1233 { 1234 register int i; 1235 register size_t css = (size_t)p->g->csetsize; 1236 register int n = 0; 1237 1238 for (i = 0; i < css; i++) 1239 if (CHIN(cs, i)) 1240 n++; 1241 return(n); 1242 } 1243 1244 /* 1245 - mcadd - add a collating element to a cset 1246 == static void mcadd(register struct parse *p, register cset *cs, \ 1247 == register char *cp); 1248 */ 1249 static void 1250 mcadd(p, cs, cp) 1251 register struct parse *p; 1252 register cset *cs; 1253 register char *cp; 1254 { 1255 register size_t oldend = cs->smultis; 1256 1257 cs->smultis += strlen(cp) + 1; 1258 if (cs->multis == NULL) 1259 cs->multis = malloc(cs->smultis); 1260 else 1261 cs->multis = realloc(cs->multis, cs->smultis); 1262 if (cs->multis == NULL) { 1263 SETERROR(REG_ESPACE); 1264 return; 1265 } 1266 1267 (void) strcpy(cs->multis + oldend - 1, cp); 1268 cs->multis[cs->smultis - 1] = '\0'; 1269 } 1270 1271 /* 1272 - mcsub - subtract a collating element from a cset 1273 == static void mcsub(register cset *cs, register char *cp); 1274 */ 1275 static void 1276 mcsub(cs, cp) 1277 register cset *cs; 1278 register char *cp; 1279 { 1280 register char *fp = mcfind(cs, cp); 1281 register size_t len = strlen(fp); 1282 1283 assert(fp != NULL); 1284 (void) memmove(fp, fp + len + 1, 1285 cs->smultis - (fp + len + 1 - cs->multis)); 1286 cs->smultis -= len; 1287 1288 if (cs->smultis == 0) { 1289 free(cs->multis); 1290 cs->multis = NULL; 1291 return; 1292 } 1293 1294 cs->multis = realloc(cs->multis, cs->smultis); 1295 assert(cs->multis != NULL); 1296 } 1297 1298 /* 1299 - mcin - is a collating element in a cset? 1300 == static int mcin(register cset *cs, register char *cp); 1301 */ 1302 static int 1303 mcin(cs, cp) 1304 register cset *cs; 1305 register char *cp; 1306 { 1307 return(mcfind(cs, cp) != NULL); 1308 } 1309 1310 /* 1311 - mcfind - find a collating element in a cset 1312 == static char *mcfind(register cset *cs, register char *cp); 1313 */ 1314 static char * 1315 mcfind(cs, cp) 1316 register cset *cs; 1317 register char *cp; 1318 { 1319 register char *p; 1320 1321 if (cs->multis == NULL) 1322 return(NULL); 1323 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1324 if (strcmp(cp, p) == 0) 1325 return(p); 1326 return(NULL); 1327 } 1328 1329 /* 1330 - mcinvert - invert the list of collating elements in a cset 1331 == static void mcinvert(register struct parse *p, register cset *cs); 1332 * 1333 * This would have to know the set of possibilities. Implementation 1334 * is deferred. 1335 */ 1336 static void 1337 mcinvert(p, cs) 1338 register struct parse *p; 1339 register cset *cs; 1340 { 1341 assert(cs->multis == NULL); /* xxx */ 1342 } 1343 1344 /* 1345 - mccase - add case counterparts of the list of collating elements in a cset 1346 == static void mccase(register struct parse *p, register cset *cs); 1347 * 1348 * This would have to know the set of possibilities. Implementation 1349 * is deferred. 1350 */ 1351 static void 1352 mccase(p, cs) 1353 register struct parse *p; 1354 register cset *cs; 1355 { 1356 assert(cs->multis == NULL); /* xxx */ 1357 } 1358 1359 /* 1360 - isinsets - is this character in any sets? 1361 == static int isinsets(register struct re_guts *g, int c); 1362 */ 1363 static int /* predicate */ 1364 isinsets(g, c) 1365 register struct re_guts *g; 1366 int c; 1367 { 1368 register uch *col; 1369 register int i; 1370 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1371 register unsigned uc = (unsigned char)c; 1372 1373 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1374 if (col[uc] != 0) 1375 return(1); 1376 return(0); 1377 } 1378 1379 /* 1380 - samesets - are these two characters in exactly the same sets? 1381 == static int samesets(register struct re_guts *g, int c1, int c2); 1382 */ 1383 static int /* predicate */ 1384 samesets(g, c1, c2) 1385 register struct re_guts *g; 1386 int c1; 1387 int c2; 1388 { 1389 register uch *col; 1390 register int i; 1391 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1392 register unsigned uc1 = (unsigned char)c1; 1393 register unsigned uc2 = (unsigned char)c2; 1394 1395 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1396 if (col[uc1] != col[uc2]) 1397 return(0); 1398 return(1); 1399 } 1400 1401 /* 1402 - categorize - sort out character categories 1403 == static void categorize(struct parse *p, register struct re_guts *g); 1404 */ 1405 static void 1406 categorize(p, g) 1407 struct parse *p; 1408 register struct re_guts *g; 1409 { 1410 register cat_t *cats = g->categories; 1411 register int c; 1412 register int c2; 1413 register cat_t cat; 1414 1415 /* avoid making error situations worse */ 1416 if (p->error != 0) 1417 return; 1418 1419 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1420 if (cats[c] == 0 && isinsets(g, c)) { 1421 cat = g->ncategories++; 1422 cats[c] = cat; 1423 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1424 if (cats[c2] == 0 && samesets(g, c, c2)) 1425 cats[c2] = cat; 1426 } 1427 } 1428 1429 /* 1430 - dupl - emit a duplicate of a bunch of sops 1431 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1432 */ 1433 static sopno /* start of duplicate */ 1434 dupl(p, start, finish) 1435 register struct parse *p; 1436 sopno start; /* from here */ 1437 sopno finish; /* to this less one */ 1438 { 1439 register sopno ret = HERE(); 1440 register sopno len = finish - start; 1441 1442 assert(finish >= start); 1443 if (len == 0) 1444 return(ret); 1445 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1446 assert(p->ssize >= p->slen + len); 1447 (void) memcpy((char *)(p->strip + p->slen), 1448 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1449 p->slen += len; 1450 return(ret); 1451 } 1452 1453 /* 1454 - doemit - emit a strip operator 1455 == static void doemit(register struct parse *p, sop op, size_t opnd); 1456 * 1457 * It might seem better to implement this as a macro with a function as 1458 * hard-case backup, but it's just too big and messy unless there are 1459 * some changes to the data structures. Maybe later. 1460 */ 1461 static void 1462 doemit(p, op, opnd) 1463 register struct parse *p; 1464 sop op; 1465 size_t opnd; 1466 { 1467 /* avoid making error situations worse */ 1468 if (p->error != 0) 1469 return; 1470 1471 /* deal with oversize operands ("can't happen", more or less) */ 1472 assert(opnd < 1<<OPSHIFT); 1473 1474 /* deal with undersized strip */ 1475 if (p->slen >= p->ssize) 1476 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1477 assert(p->slen < p->ssize); 1478 1479 /* finally, it's all reduced to the easy case */ 1480 p->strip[p->slen++] = SOP(op, opnd); 1481 } 1482 1483 /* 1484 - doinsert - insert a sop into the strip 1485 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1486 */ 1487 static void 1488 doinsert(p, op, opnd, pos) 1489 register struct parse *p; 1490 sop op; 1491 size_t opnd; 1492 sopno pos; 1493 { 1494 register sopno sn; 1495 register sop s; 1496 register int i; 1497 1498 /* avoid making error situations worse */ 1499 if (p->error != 0) 1500 return; 1501 1502 sn = HERE(); 1503 EMIT(op, opnd); /* do checks, ensure space */ 1504 assert(HERE() == sn+1); 1505 s = p->strip[sn]; 1506 1507 /* adjust paren pointers */ 1508 assert(pos > 0); 1509 for (i = 1; i < NPAREN; i++) { 1510 if (p->pbegin[i] >= pos) { 1511 p->pbegin[i]++; 1512 } 1513 if (p->pend[i] >= pos) { 1514 p->pend[i]++; 1515 } 1516 } 1517 1518 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1519 (HERE()-pos-1)*sizeof(sop)); 1520 p->strip[pos] = s; 1521 } 1522 1523 /* 1524 - dofwd - complete a forward reference 1525 == static void dofwd(register struct parse *p, sopno pos, sop value); 1526 */ 1527 static void 1528 dofwd(p, pos, value) 1529 register struct parse *p; 1530 register sopno pos; 1531 sop value; 1532 { 1533 /* avoid making error situations worse */ 1534 if (p->error != 0) 1535 return; 1536 1537 assert(value < 1<<OPSHIFT); 1538 p->strip[pos] = OP(p->strip[pos]) | value; 1539 } 1540 1541 /* 1542 - enlarge - enlarge the strip 1543 == static void enlarge(register struct parse *p, sopno size); 1544 */ 1545 static void 1546 enlarge(p, size) 1547 register struct parse *p; 1548 register sopno size; 1549 { 1550 register sop *sp; 1551 1552 if (p->ssize >= size) 1553 return; 1554 1555 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1556 if (sp == NULL) { 1557 SETERROR(REG_ESPACE); 1558 return; 1559 } 1560 p->strip = sp; 1561 p->ssize = size; 1562 } 1563 1564 /* 1565 - stripsnug - compact the strip 1566 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1567 */ 1568 static void 1569 stripsnug(p, g) 1570 register struct parse *p; 1571 register struct re_guts *g; 1572 { 1573 g->nstates = p->slen; 1574 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1575 if (g->strip == NULL) { 1576 SETERROR(REG_ESPACE); 1577 g->strip = p->strip; 1578 } 1579 } 1580 1581 /* 1582 - findmust - fill in must and mlen with longest mandatory literal string 1583 == static void findmust(register struct parse *p, register struct re_guts *g); 1584 * 1585 * This algorithm could do fancy things like analyzing the operands of | 1586 * for common subsequences. Someday. This code is simple and finds most 1587 * of the interesting cases. 1588 * 1589 * Note that must and mlen got initialized during setup. 1590 */ 1591 static void 1592 findmust(p, g) 1593 struct parse *p; 1594 register struct re_guts *g; 1595 { 1596 register sop *scan; 1597 sop *start; 1598 register sop *newstart; 1599 register sopno newlen; 1600 register sop s; 1601 register char *cp; 1602 register sopno i; 1603 1604 /* avoid making error situations worse */ 1605 if (p->error != 0) 1606 return; 1607 1608 /* find the longest OCHAR sequence in strip */ 1609 newlen = 0; 1610 scan = g->strip + 1; 1611 do { 1612 s = *scan++; 1613 switch (OP(s)) { 1614 case OCHAR: /* sequence member */ 1615 if (newlen == 0) /* new sequence */ 1616 newstart = scan - 1; 1617 newlen++; 1618 break; 1619 case OPLUS_: /* things that don't break one */ 1620 case OLPAREN: 1621 case ORPAREN: 1622 break; 1623 case OQUEST_: /* things that must be skipped */ 1624 case OCH_: 1625 scan--; 1626 do { 1627 scan += OPND(s); 1628 s = *scan; 1629 /* assert() interferes w debug printouts */ 1630 if (OP(s) != O_QUEST && OP(s) != O_CH && 1631 OP(s) != OOR2) { 1632 g->iflags |= BAD; 1633 return; 1634 } 1635 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1636 /* fallthrough */ 1637 default: /* things that break a sequence */ 1638 if (newlen > g->mlen) { /* ends one */ 1639 start = newstart; 1640 g->mlen = newlen; 1641 } 1642 newlen = 0; 1643 break; 1644 } 1645 } while (OP(s) != OEND); 1646 1647 if (g->mlen == 0) /* there isn't one */ 1648 return; 1649 1650 /* turn it into a character string */ 1651 g->must = malloc((size_t)g->mlen + 1); 1652 if (g->must == NULL) { /* argh; just forget it */ 1653 g->mlen = 0; 1654 return; 1655 } 1656 cp = g->must; 1657 scan = start; 1658 for (i = g->mlen; i > 0; i--) { 1659 while (OP(s = *scan++) != OCHAR) 1660 continue; 1661 assert(cp < g->must + g->mlen); 1662 *cp++ = (char)OPND(s); 1663 } 1664 assert(cp == g->must + g->mlen); 1665 *cp++ = '\0'; /* just on general principles */ 1666 } 1667 1668 /* 1669 - pluscount - count + nesting 1670 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1671 */ 1672 static sopno /* nesting depth */ 1673 pluscount(p, g) 1674 struct parse *p; 1675 register struct re_guts *g; 1676 { 1677 register sop *scan; 1678 register sop s; 1679 register sopno plusnest = 0; 1680 register sopno maxnest = 0; 1681 1682 if (p->error != 0) 1683 return(0); /* there may not be an OEND */ 1684 1685 scan = g->strip + 1; 1686 do { 1687 s = *scan++; 1688 switch (OP(s)) { 1689 case OPLUS_: 1690 plusnest++; 1691 break; 1692 case O_PLUS: 1693 if (plusnest > maxnest) 1694 maxnest = plusnest; 1695 plusnest--; 1696 break; 1697 } 1698 } while (OP(s) != OEND); 1699 if (plusnest != 0) 1700 g->iflags |= BAD; 1701 return(maxnest); 1702 } 1703 1704 /* 1705 * $PchId: regcomp.c,v 1.2 1996/03/12 19:10:15 philip Exp $ 1706 */ 1707