1 /* $NetBSD: regcomp.c,v 1.16 2000/01/22 22:19:17 mycroft Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 #include <sys/cdefs.h> 43 #if defined(LIBC_SCCS) && !defined(lint) 44 #if 0 45 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 46 #else 47 __RCSID("$NetBSD: regcomp.c,v 1.16 2000/01/22 22:19:17 mycroft Exp $"); 48 #endif 49 #endif /* LIBC_SCCS and not lint */ 50 51 #include "namespace.h" 52 #include <sys/types.h> 53 54 #include <assert.h> 55 #include <ctype.h> 56 #include <limits.h> 57 #include <regex.h> 58 #include <stdio.h> 59 #include <stdlib.h> 60 #include <string.h> 61 62 #ifdef __weak_alias 63 __weak_alias(regcomp,_regcomp) 64 #endif 65 66 #include "utils.h" 67 #include "regex2.h" 68 69 #include "cclass.h" 70 #include "cname.h" 71 72 /* 73 * parse structure, passed up and down to avoid global variables and 74 * other clumsinesses 75 */ 76 struct parse { 77 char *next; /* next character in RE */ 78 char *end; /* end of string (-> NUL normally) */ 79 int error; /* has an error been seen? */ 80 sop *strip; /* malloced strip */ 81 sopno ssize; /* malloced strip size (allocated) */ 82 sopno slen; /* malloced strip length (used) */ 83 int ncsalloc; /* number of csets allocated */ 84 struct re_guts *g; 85 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 86 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 87 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 88 }; 89 90 /* ========= begin header generated by ./mkh ========= */ 91 #ifdef __cplusplus 92 extern "C" { 93 #endif 94 95 /* === regcomp.c === */ 96 static void p_ere __P((struct parse *p, int stop)); 97 static void p_ere_exp __P((struct parse *p)); 98 static void p_str __P((struct parse *p)); 99 static void p_bre __P((struct parse *p, int end1, int end2)); 100 static int p_simp_re __P((struct parse *p, int starordinary)); 101 static int p_count __P((struct parse *p)); 102 static void p_bracket __P((struct parse *p)); 103 static void p_b_term __P((struct parse *p, cset *cs)); 104 static void p_b_cclass __P((struct parse *p, cset *cs)); 105 static void p_b_eclass __P((struct parse *p, cset *cs)); 106 static char p_b_symbol __P((struct parse *p)); 107 static char p_b_coll_elem __P((struct parse *p, int endc)); 108 static char othercase __P((int ch)); 109 static void bothcases __P((struct parse *p, int ch)); 110 static void ordinary __P((struct parse *p, int ch)); 111 static void nonnewline __P((struct parse *p)); 112 static void repeat __P((struct parse *p, sopno start, int from, int to)); 113 static int seterr __P((struct parse *p, int e)); 114 static cset *allocset __P((struct parse *p)); 115 static void freeset __P((struct parse *p, cset *cs)); 116 static int freezeset __P((struct parse *p, cset *cs)); 117 static int firstch __P((struct parse *p, cset *cs)); 118 static int nch __P((struct parse *p, cset *cs)); 119 static void mcadd __P((struct parse *p, cset *cs, const char *cp)); 120 #if 0 121 static void mcsub __P((cset *cs, char *cp)); 122 static int mcin __P((cset *cs, char *cp)); 123 static char *mcfind __P((cset *cs, char *cp)); 124 #endif 125 static void mcinvert __P((struct parse *p, cset *cs)); 126 static void mccase __P((struct parse *p, cset *cs)); 127 static int isinsets __P((struct re_guts *g, int c)); 128 static int samesets __P((struct re_guts *g, int c1, int c2)); 129 static void categorize __P((struct parse *p, struct re_guts *g)); 130 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 131 static void doemit __P((struct parse *p, sop op, sopno opnd)); 132 static void doinsert __P((struct parse *p, sop op, sopno opnd, sopno pos)); 133 static void dofwd __P((struct parse *p, sopno pos, sopno value)); 134 static void enlarge __P((struct parse *p, sopno size)); 135 static void stripsnug __P((struct parse *p, struct re_guts *g)); 136 static void findmust __P((struct parse *p, struct re_guts *g)); 137 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 138 139 #ifdef __cplusplus 140 } 141 #endif 142 /* ========= end header generated by ./mkh ========= */ 143 144 static char nuls[10]; /* place to point scanner in event of error */ 145 146 /* 147 * macros for use with parse structure 148 * BEWARE: these know that the parse structure is named `p' !!! 149 */ 150 #define PEEK() (*p->next) 151 #define PEEK2() (*(p->next+1)) 152 #define MORE() (p->next < p->end) 153 #define MORE2() (p->next+1 < p->end) 154 #define SEE(c) (MORE() && PEEK() == (c)) 155 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 156 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 157 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 158 #define NEXT() (p->next++) 159 #define NEXT2() (p->next += 2) 160 #define NEXTn(n) (p->next += (n)) 161 #define GETNEXT() (*p->next++) 162 #define SETERROR(e) seterr(p, (e)) 163 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 164 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 165 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 166 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 167 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 168 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 169 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 170 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 171 #define HERE() (p->slen) 172 #define THERE() (p->slen - 1) 173 #define THERETHERE() (p->slen - 2) 174 #define DROP(n) (p->slen -= (n)) 175 176 #ifndef NDEBUG 177 static int never = 0; /* for use in asserts; shuts lint up */ 178 #else 179 #define never 0 /* some <assert.h>s have bugs too */ 180 #endif 181 182 /* 183 - regcomp - interface for parser and compilation 184 = extern int regcomp(regex_t *, const char *, int); 185 = #define REG_BASIC 0000 186 = #define REG_EXTENDED 0001 187 = #define REG_ICASE 0002 188 = #define REG_NOSUB 0004 189 = #define REG_NEWLINE 0010 190 = #define REG_NOSPEC 0020 191 = #define REG_PEND 0040 192 = #define REG_DUMP 0200 193 */ 194 int /* 0 success, otherwise REG_something */ 195 regcomp(preg, pattern, cflags) 196 regex_t *preg; 197 const char *pattern; 198 int cflags; 199 { 200 struct parse pa; 201 struct re_guts *g; 202 struct parse *p = &pa; 203 int i; 204 size_t len; 205 #ifdef REDEBUG 206 # define GOODFLAGS(f) (f) 207 #else 208 # define GOODFLAGS(f) ((f)&~REG_DUMP) 209 #endif 210 211 _DIAGASSERT(preg != NULL); 212 _DIAGASSERT(pattern != NULL); 213 214 cflags = GOODFLAGS(cflags); 215 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 216 return(REG_INVARG); 217 218 if (cflags®_PEND) { 219 if (preg->re_endp < pattern) 220 return(REG_INVARG); 221 len = preg->re_endp - pattern; 222 } else 223 len = strlen(pattern); 224 225 /* do the mallocs early so failure handling is easy */ 226 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 227 (NC-1)*sizeof(cat_t)); 228 if (g == NULL) 229 return(REG_ESPACE); 230 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 231 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 232 p->slen = 0; 233 if (p->strip == NULL) { 234 free(g); 235 return(REG_ESPACE); 236 } 237 238 /* set things up */ 239 p->g = g; 240 /* LINTED convenience; we do not modify it */ 241 p->next = (char *)pattern; 242 p->end = p->next + len; 243 p->error = 0; 244 p->ncsalloc = 0; 245 for (i = 0; i < NPAREN; i++) { 246 p->pbegin[i] = 0; 247 p->pend[i] = 0; 248 } 249 g->csetsize = NC; 250 g->sets = NULL; 251 g->setbits = NULL; 252 g->ncsets = 0; 253 g->cflags = cflags; 254 g->iflags = 0; 255 g->nbol = 0; 256 g->neol = 0; 257 g->must = NULL; 258 g->mlen = 0; 259 g->nsub = 0; 260 g->ncategories = 1; /* category 0 is "everything else" */ 261 g->categories = &g->catspace[-(CHAR_MIN)]; 262 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 263 g->backrefs = 0; 264 265 /* do it */ 266 EMIT(OEND, 0); 267 g->firststate = THERE(); 268 if (cflags®_EXTENDED) 269 p_ere(p, OUT); 270 else if (cflags®_NOSPEC) 271 p_str(p); 272 else 273 p_bre(p, OUT, OUT); 274 EMIT(OEND, 0); 275 g->laststate = THERE(); 276 277 /* tidy up loose ends and fill things in */ 278 categorize(p, g); 279 stripsnug(p, g); 280 findmust(p, g); 281 g->nplus = pluscount(p, g); 282 g->magic = MAGIC2; 283 preg->re_nsub = g->nsub; 284 preg->re_g = g; 285 preg->re_magic = MAGIC1; 286 #ifndef REDEBUG 287 /* not debugging, so can't rely on the assert() in regexec() */ 288 if (g->iflags&BAD) 289 SETERROR(REG_ASSERT); 290 #endif 291 292 /* win or lose, we're done */ 293 if (p->error != 0) /* lose */ 294 regfree(preg); 295 return(p->error); 296 } 297 298 /* 299 - p_ere - ERE parser top level, concatenation and alternation 300 == static void p_ere(struct parse *p, int stop); 301 */ 302 static void 303 p_ere(p, stop) 304 struct parse *p; 305 int stop; /* character this ERE should end at */ 306 { 307 char c; 308 sopno prevback = 0; /* pacify gcc */ 309 sopno prevfwd = 0; /* pacify gcc */ 310 sopno conc; 311 int first = 1; /* is this the first alternative? */ 312 313 _DIAGASSERT(p != NULL); 314 315 for (;;) { 316 /* do a bunch of concatenated expressions */ 317 conc = HERE(); 318 while (MORE() && (c = PEEK()) != '|' && c != stop) 319 p_ere_exp(p); 320 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 321 322 if (!EAT('|')) 323 break; /* NOTE BREAK OUT */ 324 325 if (first) { 326 INSERT(OCH_, conc); /* offset is wrong */ 327 prevfwd = conc; 328 prevback = conc; 329 first = 0; 330 } 331 ASTERN(OOR1, prevback); 332 prevback = THERE(); 333 AHEAD(prevfwd); /* fix previous offset */ 334 prevfwd = HERE(); 335 EMIT(OOR2, 0); /* offset is very wrong */ 336 } 337 338 if (!first) { /* tail-end fixups */ 339 AHEAD(prevfwd); 340 ASTERN(O_CH, prevback); 341 } 342 343 assert(!MORE() || SEE(stop)); 344 } 345 346 /* 347 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 348 == static void p_ere_exp(struct parse *p); 349 */ 350 static void 351 p_ere_exp(p) 352 struct parse *p; 353 { 354 char c; 355 sopno pos; 356 int count; 357 int count2; 358 sopno subno; 359 int wascaret = 0; 360 361 _DIAGASSERT(p != NULL); 362 363 assert(MORE()); /* caller should have ensured this */ 364 c = GETNEXT(); 365 366 pos = HERE(); 367 switch (c) { 368 case '(': 369 REQUIRE(MORE(), REG_EPAREN); 370 p->g->nsub++; 371 subno = p->g->nsub; 372 if (subno < NPAREN) 373 p->pbegin[subno] = HERE(); 374 EMIT(OLPAREN, subno); 375 if (!SEE(')')) 376 p_ere(p, ')'); 377 if (subno < NPAREN) { 378 p->pend[subno] = HERE(); 379 assert(p->pend[subno] != 0); 380 } 381 EMIT(ORPAREN, subno); 382 MUSTEAT(')', REG_EPAREN); 383 break; 384 #ifndef POSIX_MISTAKE 385 case ')': /* happens only if no current unmatched ( */ 386 /* 387 * You may ask, why the ifndef? Because I didn't notice 388 * this until slightly too late for 1003.2, and none of the 389 * other 1003.2 regular-expression reviewers noticed it at 390 * all. So an unmatched ) is legal POSIX, at least until 391 * we can get it fixed. 392 */ 393 SETERROR(REG_EPAREN); 394 break; 395 #endif 396 case '^': 397 EMIT(OBOL, 0); 398 p->g->iflags |= USEBOL; 399 p->g->nbol++; 400 wascaret = 1; 401 break; 402 case '$': 403 EMIT(OEOL, 0); 404 p->g->iflags |= USEEOL; 405 p->g->neol++; 406 break; 407 case '|': 408 SETERROR(REG_EMPTY); 409 break; 410 case '*': 411 case '+': 412 case '?': 413 SETERROR(REG_BADRPT); 414 break; 415 case '.': 416 if (p->g->cflags®_NEWLINE) 417 nonnewline(p); 418 else 419 EMIT(OANY, 0); 420 break; 421 case '[': 422 p_bracket(p); 423 break; 424 case '\\': 425 REQUIRE(MORE(), REG_EESCAPE); 426 c = GETNEXT(); 427 ordinary(p, c); 428 break; 429 case '{': /* okay as ordinary except if digit follows */ 430 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 431 /* FALLTHROUGH */ 432 default: 433 ordinary(p, c); 434 break; 435 } 436 437 if (!MORE()) 438 return; 439 c = PEEK(); 440 /* we call { a repetition if followed by a digit */ 441 if (!( c == '*' || c == '+' || c == '?' || 442 (c == '{' && MORE2() && isdigit(PEEK2())) )) 443 return; /* no repetition, we're done */ 444 NEXT(); 445 446 REQUIRE(!wascaret, REG_BADRPT); 447 switch (c) { 448 case '*': /* implemented as +? */ 449 /* this case does not require the (y|) trick, noKLUDGE */ 450 INSERT(OPLUS_, pos); 451 ASTERN(O_PLUS, pos); 452 INSERT(OQUEST_, pos); 453 ASTERN(O_QUEST, pos); 454 break; 455 case '+': 456 INSERT(OPLUS_, pos); 457 ASTERN(O_PLUS, pos); 458 break; 459 case '?': 460 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 461 INSERT(OCH_, pos); /* offset slightly wrong */ 462 ASTERN(OOR1, pos); /* this one's right */ 463 AHEAD(pos); /* fix the OCH_ */ 464 EMIT(OOR2, 0); /* offset very wrong... */ 465 AHEAD(THERE()); /* ...so fix it */ 466 ASTERN(O_CH, THERETHERE()); 467 break; 468 case '{': 469 count = p_count(p); 470 if (EAT(',')) { 471 if (isdigit(PEEK())) { 472 count2 = p_count(p); 473 REQUIRE(count <= count2, REG_BADBR); 474 } else /* single number with comma */ 475 count2 = INFINITY; 476 } else /* just a single number */ 477 count2 = count; 478 repeat(p, pos, count, count2); 479 if (!EAT('}')) { /* error heuristics */ 480 while (MORE() && PEEK() != '}') 481 NEXT(); 482 REQUIRE(MORE(), REG_EBRACE); 483 SETERROR(REG_BADBR); 484 } 485 break; 486 } 487 488 if (!MORE()) 489 return; 490 c = PEEK(); 491 if (!( c == '*' || c == '+' || c == '?' || 492 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 493 return; 494 SETERROR(REG_BADRPT); 495 } 496 497 /* 498 - p_str - string (no metacharacters) "parser" 499 == static void p_str(struct parse *p); 500 */ 501 static void 502 p_str(p) 503 struct parse *p; 504 { 505 506 _DIAGASSERT(p != NULL); 507 508 REQUIRE(MORE(), REG_EMPTY); 509 while (MORE()) 510 ordinary(p, GETNEXT()); 511 } 512 513 /* 514 - p_bre - BRE parser top level, anchoring and concatenation 515 == static void p_bre(struct parse *p, int end1, \ 516 == int end2); 517 * Giving end1 as OUT essentially eliminates the end1/end2 check. 518 * 519 * This implementation is a bit of a kludge, in that a trailing $ is first 520 * taken as an ordinary character and then revised to be an anchor. The 521 * only undesirable side effect is that '$' gets included as a character 522 * category in such cases. This is fairly harmless; not worth fixing. 523 * The amount of lookahead needed to avoid this kludge is excessive. 524 */ 525 static void 526 p_bre(p, end1, end2) 527 struct parse *p; 528 int end1; /* first terminating character */ 529 int end2; /* second terminating character */ 530 { 531 sopno start; 532 int first = 1; /* first subexpression? */ 533 int wasdollar = 0; 534 535 _DIAGASSERT(p != NULL); 536 537 start = HERE(); 538 539 if (EAT('^')) { 540 EMIT(OBOL, 0); 541 p->g->iflags |= USEBOL; 542 p->g->nbol++; 543 } 544 while (MORE() && !SEETWO(end1, end2)) { 545 wasdollar = p_simp_re(p, first); 546 first = 0; 547 } 548 if (wasdollar) { /* oops, that was a trailing anchor */ 549 DROP(1); 550 EMIT(OEOL, 0); 551 p->g->iflags |= USEEOL; 552 p->g->neol++; 553 } 554 555 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 556 } 557 558 /* 559 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 560 == static int p_simp_re(struct parse *p, int starordinary); 561 */ 562 static int /* was the simple RE an unbackslashed $? */ 563 p_simp_re(p, starordinary) 564 struct parse *p; 565 int starordinary; /* is a leading * an ordinary character? */ 566 { 567 int c; 568 int count; 569 int count2; 570 sopno pos; 571 int i; 572 sopno subno; 573 # define BACKSL (1<<CHAR_BIT) 574 575 _DIAGASSERT(p != NULL); 576 577 pos = HERE(); /* repetion op, if any, covers from here */ 578 579 assert(MORE()); /* caller should have ensured this */ 580 c = GETNEXT(); 581 if (c == '\\') { 582 REQUIRE(MORE(), REG_EESCAPE); 583 c = BACKSL | (unsigned char)GETNEXT(); 584 } 585 switch (c) { 586 case '.': 587 if (p->g->cflags®_NEWLINE) 588 nonnewline(p); 589 else 590 EMIT(OANY, 0); 591 break; 592 case '[': 593 p_bracket(p); 594 break; 595 case BACKSL|'{': 596 SETERROR(REG_BADRPT); 597 break; 598 case BACKSL|'(': 599 p->g->nsub++; 600 subno = p->g->nsub; 601 if (subno < NPAREN) 602 p->pbegin[subno] = HERE(); 603 EMIT(OLPAREN, subno); 604 /* the MORE here is an error heuristic */ 605 if (MORE() && !SEETWO('\\', ')')) 606 p_bre(p, '\\', ')'); 607 if (subno < NPAREN) { 608 p->pend[subno] = HERE(); 609 assert(p->pend[subno] != 0); 610 } 611 EMIT(ORPAREN, subno); 612 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 613 break; 614 case BACKSL|')': /* should not get here -- must be user */ 615 case BACKSL|'}': 616 SETERROR(REG_EPAREN); 617 break; 618 case BACKSL|'1': 619 case BACKSL|'2': 620 case BACKSL|'3': 621 case BACKSL|'4': 622 case BACKSL|'5': 623 case BACKSL|'6': 624 case BACKSL|'7': 625 case BACKSL|'8': 626 case BACKSL|'9': 627 i = (c&~BACKSL) - '0'; 628 assert(i < NPAREN); 629 if (p->pend[i] != 0) { 630 assert(i <= p->g->nsub); 631 EMIT(OBACK_, i); 632 assert(p->pbegin[i] != 0); 633 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 634 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 635 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 636 EMIT(O_BACK, i); 637 } else 638 SETERROR(REG_ESUBREG); 639 p->g->backrefs = 1; 640 break; 641 case '*': 642 REQUIRE(starordinary, REG_BADRPT); 643 /* FALLTHROUGH */ 644 default: 645 ordinary(p, c &~ BACKSL); 646 break; 647 } 648 649 if (EAT('*')) { /* implemented as +? */ 650 /* this case does not require the (y|) trick, noKLUDGE */ 651 INSERT(OPLUS_, pos); 652 ASTERN(O_PLUS, pos); 653 INSERT(OQUEST_, pos); 654 ASTERN(O_QUEST, pos); 655 } else if (EATTWO('\\', '{')) { 656 count = p_count(p); 657 if (EAT(',')) { 658 if (MORE() && isdigit(PEEK())) { 659 count2 = p_count(p); 660 REQUIRE(count <= count2, REG_BADBR); 661 } else /* single number with comma */ 662 count2 = INFINITY; 663 } else /* just a single number */ 664 count2 = count; 665 repeat(p, pos, count, count2); 666 if (!EATTWO('\\', '}')) { /* error heuristics */ 667 while (MORE() && !SEETWO('\\', '}')) 668 NEXT(); 669 REQUIRE(MORE(), REG_EBRACE); 670 SETERROR(REG_BADBR); 671 } 672 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 673 return(1); 674 675 return(0); 676 } 677 678 /* 679 - p_count - parse a repetition count 680 == static int p_count(struct parse *p); 681 */ 682 static int /* the value */ 683 p_count(p) 684 struct parse *p; 685 { 686 int count = 0; 687 int ndigits = 0; 688 689 _DIAGASSERT(p != NULL); 690 691 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 692 count = count*10 + (GETNEXT() - '0'); 693 ndigits++; 694 } 695 696 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 697 return(count); 698 } 699 700 /* 701 - p_bracket - parse a bracketed character list 702 == static void p_bracket(struct parse *p); 703 * 704 * Note a significant property of this code: if the allocset() did SETERROR, 705 * no set operations are done. 706 */ 707 static void 708 p_bracket(p) 709 struct parse *p; 710 { 711 cset *cs; 712 int invert = 0; 713 714 _DIAGASSERT(p != NULL); 715 716 cs = allocset(p); 717 718 /* Dept of Truly Sickening Special-Case Kludges */ 719 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 720 (size_t)6) == 0) { 721 EMIT(OBOW, 0); 722 NEXTn(6); 723 return; 724 } 725 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 726 (size_t)6) == 0) { 727 EMIT(OEOW, 0); 728 NEXTn(6); 729 return; 730 } 731 732 if (EAT('^')) 733 invert++; /* make note to invert set at end */ 734 if (EAT(']')) 735 CHadd(cs, ']'); 736 else if (EAT('-')) 737 CHadd(cs, '-'); 738 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 739 p_b_term(p, cs); 740 if (EAT('-')) 741 CHadd(cs, '-'); 742 MUSTEAT(']', REG_EBRACK); 743 744 if (p->error != 0) /* don't mess things up further */ 745 return; 746 747 if (p->g->cflags®_ICASE) { 748 int i; 749 int ci; 750 751 for (i = p->g->csetsize - 1; i >= 0; i--) 752 if (CHIN(cs, i) && isalpha(i)) { 753 ci = othercase(i); 754 if (ci != i) 755 CHadd(cs, ci); 756 } 757 if (cs->multis != NULL) 758 mccase(p, cs); 759 } 760 if (invert) { 761 int i; 762 763 for (i = p->g->csetsize - 1; i >= 0; i--) 764 if (CHIN(cs, i)) 765 CHsub(cs, i); 766 else 767 CHadd(cs, i); 768 if (p->g->cflags®_NEWLINE) 769 CHsub(cs, '\n'); 770 if (cs->multis != NULL) 771 mcinvert(p, cs); 772 } 773 774 assert(cs->multis == NULL); /* xxx */ 775 776 if (nch(p, cs) == 1) { /* optimize singleton sets */ 777 ordinary(p, firstch(p, cs)); 778 freeset(p, cs); 779 } else 780 EMIT(OANYOF, freezeset(p, cs)); 781 } 782 783 /* 784 - p_b_term - parse one term of a bracketed character list 785 == static void p_b_term(struct parse *p, cset *cs); 786 */ 787 static void 788 p_b_term(p, cs) 789 struct parse *p; 790 cset *cs; 791 { 792 char c; 793 char start, finish; 794 int i; 795 796 _DIAGASSERT(p != NULL); 797 _DIAGASSERT(cs != NULL); 798 799 /* classify what we've got */ 800 switch ((MORE()) ? PEEK() : '\0') { 801 case '[': 802 c = (MORE2()) ? PEEK2() : '\0'; 803 break; 804 805 case '-': 806 SETERROR(REG_ERANGE); 807 return; /* NOTE RETURN */ 808 809 default: 810 c = '\0'; 811 break; 812 } 813 814 switch (c) { 815 case ':': /* character class */ 816 NEXT2(); 817 REQUIRE(MORE(), REG_EBRACK); 818 c = PEEK(); 819 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 820 p_b_cclass(p, cs); 821 REQUIRE(MORE(), REG_EBRACK); 822 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 823 break; 824 case '=': /* equivalence class */ 825 NEXT2(); 826 REQUIRE(MORE(), REG_EBRACK); 827 c = PEEK(); 828 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 829 p_b_eclass(p, cs); 830 REQUIRE(MORE(), REG_EBRACK); 831 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 832 break; 833 default: /* symbol, ordinary character, or range */ 834 /* xxx revision needed for multichar stuff */ 835 start = p_b_symbol(p); 836 if (SEE('-') && MORE2() && PEEK2() != ']') { 837 /* range */ 838 NEXT(); 839 if (EAT('-')) 840 finish = '-'; 841 else 842 finish = p_b_symbol(p); 843 } else 844 finish = start; 845 /* xxx what about signed chars here... */ 846 REQUIRE(start <= finish, REG_ERANGE); 847 for (i = start; i <= finish; i++) 848 CHadd(cs, i); 849 break; 850 } 851 } 852 853 /* 854 - p_b_cclass - parse a character-class name and deal with it 855 == static void p_b_cclass(struct parse *p, cset *cs); 856 */ 857 static void 858 p_b_cclass(p, cs) 859 struct parse *p; 860 cset *cs; 861 { 862 char *sp; 863 const struct cclass *cp; 864 size_t len; 865 const char *u; 866 char c; 867 868 _DIAGASSERT(p != NULL); 869 _DIAGASSERT(cs != NULL); 870 871 sp = p->next; 872 873 while (MORE() && isalpha(PEEK())) 874 NEXT(); 875 len = p->next - sp; 876 for (cp = cclasses; cp->name != NULL; cp++) 877 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 878 break; 879 if (cp->name == NULL) { 880 /* oops, didn't find it */ 881 SETERROR(REG_ECTYPE); 882 return; 883 } 884 885 u = cp->chars; 886 while ((c = *u++) != '\0') 887 CHadd(cs, c); 888 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 889 MCadd(p, cs, u); 890 } 891 892 /* 893 - p_b_eclass - parse an equivalence-class name and deal with it 894 == static void p_b_eclass(struct parse *p, cset *cs); 895 * 896 * This implementation is incomplete. xxx 897 */ 898 static void 899 p_b_eclass(p, cs) 900 struct parse *p; 901 cset *cs; 902 { 903 char c; 904 905 _DIAGASSERT(p != NULL); 906 _DIAGASSERT(cs != NULL); 907 908 c = p_b_coll_elem(p, '='); 909 CHadd(cs, c); 910 } 911 912 /* 913 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 914 == static char p_b_symbol(struct parse *p); 915 */ 916 static char /* value of symbol */ 917 p_b_symbol(p) 918 struct parse *p; 919 { 920 char value; 921 922 _DIAGASSERT(p != NULL); 923 924 REQUIRE(MORE(), REG_EBRACK); 925 if (!EATTWO('[', '.')) 926 return(GETNEXT()); 927 928 /* collating symbol */ 929 value = p_b_coll_elem(p, '.'); 930 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 931 return(value); 932 } 933 934 /* 935 - p_b_coll_elem - parse a collating-element name and look it up 936 == static char p_b_coll_elem(struct parse *p, int endc); 937 */ 938 static char /* value of collating element */ 939 p_b_coll_elem(p, endc) 940 struct parse *p; 941 int endc; /* name ended by endc,']' */ 942 { 943 char *sp; 944 const struct cname *cp; 945 size_t len; 946 947 _DIAGASSERT(p != NULL); 948 949 sp = p->next; 950 951 while (MORE() && !SEETWO(endc, ']')) 952 NEXT(); 953 if (!MORE()) { 954 SETERROR(REG_EBRACK); 955 return(0); 956 } 957 len = p->next - sp; 958 for (cp = cnames; cp->name != NULL; cp++) 959 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 960 return(cp->code); /* known name */ 961 if (len == 1) 962 return(*sp); /* single character */ 963 SETERROR(REG_ECOLLATE); /* neither */ 964 return(0); 965 } 966 967 /* 968 - othercase - return the case counterpart of an alphabetic 969 == static char othercase(int ch); 970 */ 971 static char /* if no counterpart, return ch */ 972 othercase(ch) 973 int ch; 974 { 975 assert(isalpha(ch)); 976 if (isupper(ch)) 977 return(tolower(ch)); 978 else if (islower(ch)) 979 return(toupper(ch)); 980 else /* peculiar, but could happen */ 981 return(ch); 982 } 983 984 /* 985 - bothcases - emit a dualcase version of a two-case character 986 == static void bothcases(struct parse *p, int ch); 987 * 988 * Boy, is this implementation ever a kludge... 989 */ 990 static void 991 bothcases(p, ch) 992 struct parse *p; 993 int ch; 994 { 995 char *oldnext; 996 char *oldend; 997 char bracket[3]; 998 999 _DIAGASSERT(p != NULL); 1000 1001 oldnext = p->next; 1002 oldend = p->end; 1003 1004 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1005 p->next = bracket; 1006 p->end = bracket+2; 1007 bracket[0] = ch; 1008 bracket[1] = ']'; 1009 bracket[2] = '\0'; 1010 p_bracket(p); 1011 assert(p->next == bracket+2); 1012 p->next = oldnext; 1013 p->end = oldend; 1014 } 1015 1016 /* 1017 - ordinary - emit an ordinary character 1018 == static void ordinary(struct parse *p, int ch); 1019 */ 1020 static void 1021 ordinary(p, ch) 1022 struct parse *p; 1023 int ch; 1024 { 1025 cat_t *cap; 1026 1027 _DIAGASSERT(p != NULL); 1028 1029 cap = p->g->categories; 1030 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 1031 bothcases(p, ch); 1032 else { 1033 EMIT(OCHAR, (unsigned char)ch); 1034 if (cap[ch] == 0) 1035 cap[ch] = p->g->ncategories++; 1036 } 1037 } 1038 1039 /* 1040 - nonnewline - emit REG_NEWLINE version of OANY 1041 == static void nonnewline(struct parse *p); 1042 * 1043 * Boy, is this implementation ever a kludge... 1044 */ 1045 static void 1046 nonnewline(p) 1047 struct parse *p; 1048 { 1049 char *oldnext; 1050 char *oldend; 1051 char bracket[4]; 1052 1053 _DIAGASSERT(p != NULL); 1054 1055 oldnext = p->next; 1056 oldend = p->end; 1057 1058 p->next = bracket; 1059 p->end = bracket+3; 1060 bracket[0] = '^'; 1061 bracket[1] = '\n'; 1062 bracket[2] = ']'; 1063 bracket[3] = '\0'; 1064 p_bracket(p); 1065 assert(p->next == bracket+3); 1066 p->next = oldnext; 1067 p->end = oldend; 1068 } 1069 1070 /* 1071 - repeat - generate code for a bounded repetition, recursively if needed 1072 == static void repeat(struct parse *p, sopno start, int from, int to); 1073 */ 1074 static void 1075 repeat(p, start, from, to) 1076 struct parse *p; 1077 sopno start; /* operand from here to end of strip */ 1078 int from; /* repeated from this number */ 1079 int to; /* to this number of times (maybe INFINITY) */ 1080 { 1081 sopno finish; 1082 # define N 2 1083 # define INF 3 1084 # define REP(f, t) ((f)*8 + (t)) 1085 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1086 sopno copy; 1087 1088 _DIAGASSERT(p != NULL); 1089 1090 finish = HERE(); 1091 1092 if (p->error != 0) /* head off possible runaway recursion */ 1093 return; 1094 1095 assert(from <= to); 1096 1097 switch (REP(MAP(from), MAP(to))) { 1098 case REP(0, 0): /* must be user doing this */ 1099 DROP(finish-start); /* drop the operand */ 1100 break; 1101 case REP(0, 1): /* as x{1,1}? */ 1102 case REP(0, N): /* as x{1,n}? */ 1103 case REP(0, INF): /* as x{1,}? */ 1104 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1105 INSERT(OCH_, start); /* offset is wrong... */ 1106 repeat(p, start+1, 1, to); 1107 ASTERN(OOR1, start); 1108 AHEAD(start); /* ... fix it */ 1109 EMIT(OOR2, 0); 1110 AHEAD(THERE()); 1111 ASTERN(O_CH, THERETHERE()); 1112 break; 1113 case REP(1, 1): /* trivial case */ 1114 /* done */ 1115 break; 1116 case REP(1, N): /* as x?x{1,n-1} */ 1117 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1118 INSERT(OCH_, start); 1119 ASTERN(OOR1, start); 1120 AHEAD(start); 1121 EMIT(OOR2, 0); /* offset very wrong... */ 1122 AHEAD(THERE()); /* ...so fix it */ 1123 ASTERN(O_CH, THERETHERE()); 1124 copy = dupl(p, start+1, finish+1); 1125 assert(copy == finish+4); 1126 repeat(p, copy, 1, to-1); 1127 break; 1128 case REP(1, INF): /* as x+ */ 1129 INSERT(OPLUS_, start); 1130 ASTERN(O_PLUS, start); 1131 break; 1132 case REP(N, N): /* as xx{m-1,n-1} */ 1133 copy = dupl(p, start, finish); 1134 repeat(p, copy, from-1, to-1); 1135 break; 1136 case REP(N, INF): /* as xx{n-1,INF} */ 1137 copy = dupl(p, start, finish); 1138 repeat(p, copy, from-1, to); 1139 break; 1140 default: /* "can't happen" */ 1141 SETERROR(REG_ASSERT); /* just in case */ 1142 break; 1143 } 1144 } 1145 1146 /* 1147 - seterr - set an error condition 1148 == static int seterr(struct parse *p, int e); 1149 */ 1150 static int /* useless but makes type checking happy */ 1151 seterr(p, e) 1152 struct parse *p; 1153 int e; 1154 { 1155 1156 _DIAGASSERT(p != NULL); 1157 1158 if (p->error == 0) /* keep earliest error condition */ 1159 p->error = e; 1160 p->next = nuls; /* try to bring things to a halt */ 1161 p->end = nuls; 1162 return(0); /* make the return value well-defined */ 1163 } 1164 1165 /* 1166 - allocset - allocate a set of characters for [] 1167 == static cset *allocset(struct parse *p); 1168 */ 1169 static cset * 1170 allocset(p) 1171 struct parse *p; 1172 { 1173 int no; 1174 size_t nc; 1175 size_t nbytes; 1176 cset *cs; 1177 size_t css; 1178 int i; 1179 1180 _DIAGASSERT(p != NULL); 1181 1182 no = p->g->ncsets++; 1183 css = (size_t)p->g->csetsize; 1184 if (no >= p->ncsalloc) { /* need another column of space */ 1185 p->ncsalloc += CHAR_BIT; 1186 nc = p->ncsalloc; 1187 assert(nc % CHAR_BIT == 0); 1188 nbytes = nc / CHAR_BIT * css; 1189 if (p->g->sets == NULL) 1190 p->g->sets = malloc(nc * sizeof(cset)); 1191 else 1192 p->g->sets = realloc(p->g->sets, nc * sizeof(cset)); 1193 if (p->g->setbits == NULL) 1194 p->g->setbits = malloc(nbytes); 1195 else { 1196 p->g->setbits = realloc(p->g->setbits, nbytes); 1197 /* xxx this isn't right if setbits is now NULL */ 1198 for (i = 0; i < no; i++) 1199 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1200 } 1201 if (p->g->sets != NULL && p->g->setbits != NULL) 1202 (void) memset((char *)p->g->setbits + (nbytes - css), 1203 0, css); 1204 else { 1205 no = 0; 1206 SETERROR(REG_ESPACE); 1207 /* caller's responsibility not to do set ops */ 1208 } 1209 } 1210 1211 assert(p->g->sets != NULL); /* xxx */ 1212 cs = &p->g->sets[no]; 1213 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1214 cs->mask = 1 << ((no) % CHAR_BIT); 1215 cs->hash = 0; 1216 cs->smultis = 0; 1217 cs->multis = NULL; 1218 1219 return(cs); 1220 } 1221 1222 /* 1223 - freeset - free a now-unused set 1224 == static void freeset(struct parse *p, cset *cs); 1225 */ 1226 static void 1227 freeset(p, cs) 1228 struct parse *p; 1229 cset *cs; 1230 { 1231 int i; 1232 cset *top; 1233 size_t css; 1234 1235 _DIAGASSERT(p != NULL); 1236 _DIAGASSERT(cs != NULL); 1237 1238 top = &p->g->sets[p->g->ncsets]; 1239 css = (size_t)p->g->csetsize; 1240 1241 for (i = 0; i < css; i++) 1242 CHsub(cs, i); 1243 if (cs == top-1) /* recover only the easy case */ 1244 p->g->ncsets--; 1245 } 1246 1247 /* 1248 - freezeset - final processing on a set of characters 1249 == static int freezeset(struct parse *p, cset *cs); 1250 * 1251 * The main task here is merging identical sets. This is usually a waste 1252 * of time (although the hash code minimizes the overhead), but can win 1253 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1254 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1255 * the same value! 1256 */ 1257 static int /* set number */ 1258 freezeset(p, cs) 1259 struct parse *p; 1260 cset *cs; 1261 { 1262 uch h; 1263 int i; 1264 cset *top; 1265 cset *cs2; 1266 size_t css; 1267 1268 _DIAGASSERT(p != NULL); 1269 _DIAGASSERT(cs != NULL); 1270 1271 h = cs->hash; 1272 top = &p->g->sets[p->g->ncsets]; 1273 css = (size_t)p->g->csetsize; 1274 1275 /* look for an earlier one which is the same */ 1276 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1277 if (cs2->hash == h && cs2 != cs) { 1278 /* maybe */ 1279 for (i = 0; i < css; i++) 1280 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1281 break; /* no */ 1282 if (i == css) 1283 break; /* yes */ 1284 } 1285 1286 if (cs2 < top) { /* found one */ 1287 freeset(p, cs); 1288 cs = cs2; 1289 } 1290 1291 return((int)(cs - p->g->sets)); 1292 } 1293 1294 /* 1295 - firstch - return first character in a set (which must have at least one) 1296 == static int firstch(struct parse *p, cset *cs); 1297 */ 1298 static int /* character; there is no "none" value */ 1299 firstch(p, cs) 1300 struct parse *p; 1301 cset *cs; 1302 { 1303 int i; 1304 size_t css; 1305 1306 _DIAGASSERT(p != NULL); 1307 _DIAGASSERT(cs != NULL); 1308 1309 css = (size_t)p->g->csetsize; 1310 1311 for (i = 0; i < css; i++) 1312 if (CHIN(cs, i)) 1313 return((char)i); 1314 assert(never); 1315 return(0); /* arbitrary */ 1316 } 1317 1318 /* 1319 - nch - number of characters in a set 1320 == static int nch(struct parse *p, cset *cs); 1321 */ 1322 static int 1323 nch(p, cs) 1324 struct parse *p; 1325 cset *cs; 1326 { 1327 int i; 1328 size_t css; 1329 int n = 0; 1330 1331 _DIAGASSERT(p != NULL); 1332 _DIAGASSERT(cs != NULL); 1333 1334 css = (size_t)p->g->csetsize; 1335 1336 for (i = 0; i < css; i++) 1337 if (CHIN(cs, i)) 1338 n++; 1339 return(n); 1340 } 1341 1342 /* 1343 - mcadd - add a collating element to a cset 1344 == static void mcadd(struct parse *p, cset *cs, \ 1345 == char *cp); 1346 */ 1347 static void 1348 mcadd(p, cs, cp) 1349 struct parse *p; 1350 cset *cs; 1351 const char *cp; 1352 { 1353 size_t oldend; 1354 1355 _DIAGASSERT(p != NULL); 1356 _DIAGASSERT(cs != NULL); 1357 _DIAGASSERT(cp != NULL); 1358 1359 oldend = cs->smultis; 1360 1361 cs->smultis += strlen(cp) + 1; 1362 if (cs->multis == NULL) 1363 cs->multis = malloc(cs->smultis); 1364 else 1365 cs->multis = realloc(cs->multis, cs->smultis); 1366 if (cs->multis == NULL) { 1367 SETERROR(REG_ESPACE); 1368 return; 1369 } 1370 1371 (void) strcpy(cs->multis + oldend - 1, cp); 1372 cs->multis[cs->smultis - 1] = '\0'; 1373 } 1374 1375 #if 0 1376 /* 1377 - mcsub - subtract a collating element from a cset 1378 == static void mcsub(cset *cs, char *cp); 1379 */ 1380 static void 1381 mcsub(cs, cp) 1382 cset *cs; 1383 char *cp; 1384 { 1385 char *fp; 1386 size_t len; 1387 1388 _DIAGASSERT(cs != NULL); 1389 _DIAGASSERT(cp != NULL); 1390 1391 fp = mcfind(cs, cp); 1392 len = strlen(fp); 1393 1394 assert(fp != NULL); 1395 (void) memmove(fp, fp + len + 1, 1396 cs->smultis - (fp + len + 1 - cs->multis)); 1397 cs->smultis -= len; 1398 1399 if (cs->smultis == 0) { 1400 free(cs->multis); 1401 cs->multis = NULL; 1402 return; 1403 } 1404 1405 cs->multis = realloc(cs->multis, cs->smultis); 1406 assert(cs->multis != NULL); 1407 } 1408 1409 /* 1410 - mcin - is a collating element in a cset? 1411 == static int mcin(cset *cs, char *cp); 1412 */ 1413 static int 1414 mcin(cs, cp) 1415 cset *cs; 1416 char *cp; 1417 { 1418 1419 _DIAGASSERT(cs != NULL); 1420 _DIAGASSERT(cp != NULL); 1421 1422 return(mcfind(cs, cp) != NULL); 1423 } 1424 1425 /* 1426 - mcfind - find a collating element in a cset 1427 == static char *mcfind(cset *cs, char *cp); 1428 */ 1429 static char * 1430 mcfind(cs, cp) 1431 cset *cs; 1432 char *cp; 1433 { 1434 char *p; 1435 1436 _DIAGASSERT(cs != NULL); 1437 _DIAGASSERT(cp != NULL); 1438 1439 if (cs->multis == NULL) 1440 return(NULL); 1441 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1442 if (strcmp(cp, p) == 0) 1443 return(p); 1444 return(NULL); 1445 } 1446 #endif 1447 1448 /* 1449 - mcinvert - invert the list of collating elements in a cset 1450 == static void mcinvert(struct parse *p, cset *cs); 1451 * 1452 * This would have to know the set of possibilities. Implementation 1453 * is deferred. 1454 */ 1455 /* ARGSUSED */ 1456 static void 1457 mcinvert(p, cs) 1458 struct parse *p; 1459 cset *cs; 1460 { 1461 1462 _DIAGASSERT(p != NULL); 1463 _DIAGASSERT(cs != NULL); 1464 1465 assert(cs->multis == NULL); /* xxx */ 1466 } 1467 1468 /* 1469 - mccase - add case counterparts of the list of collating elements in a cset 1470 == static void mccase(struct parse *p, cset *cs); 1471 * 1472 * This would have to know the set of possibilities. Implementation 1473 * is deferred. 1474 */ 1475 /* ARGSUSED */ 1476 static void 1477 mccase(p, cs) 1478 struct parse *p; 1479 cset *cs; 1480 { 1481 1482 _DIAGASSERT(p != NULL); 1483 _DIAGASSERT(cs != NULL); 1484 1485 assert(cs->multis == NULL); /* xxx */ 1486 } 1487 1488 /* 1489 - isinsets - is this character in any sets? 1490 == static int isinsets(struct re_guts *g, int c); 1491 */ 1492 static int /* predicate */ 1493 isinsets(g, c) 1494 struct re_guts *g; 1495 int c; 1496 { 1497 uch *col; 1498 int i; 1499 int ncols; 1500 unsigned uc = (unsigned char)c; 1501 1502 _DIAGASSERT(g != NULL); 1503 1504 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1505 1506 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1507 if (col[uc] != 0) 1508 return(1); 1509 return(0); 1510 } 1511 1512 /* 1513 - samesets - are these two characters in exactly the same sets? 1514 == static int samesets(struct re_guts *g, int c1, int c2); 1515 */ 1516 static int /* predicate */ 1517 samesets(g, c1, c2) 1518 struct re_guts *g; 1519 int c1; 1520 int c2; 1521 { 1522 uch *col; 1523 int i; 1524 int ncols; 1525 unsigned uc1 = (unsigned char)c1; 1526 unsigned uc2 = (unsigned char)c2; 1527 1528 _DIAGASSERT(g != NULL); 1529 1530 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1531 1532 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1533 if (col[uc1] != col[uc2]) 1534 return(0); 1535 return(1); 1536 } 1537 1538 /* 1539 - categorize - sort out character categories 1540 == static void categorize(struct parse *p, struct re_guts *g); 1541 */ 1542 static void 1543 categorize(p, g) 1544 struct parse *p; 1545 struct re_guts *g; 1546 { 1547 cat_t *cats; 1548 int c; 1549 int c2; 1550 cat_t cat; 1551 1552 _DIAGASSERT(p != NULL); 1553 _DIAGASSERT(g != NULL); 1554 1555 cats = g->categories; 1556 1557 /* avoid making error situations worse */ 1558 if (p->error != 0) 1559 return; 1560 1561 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1562 if (cats[c] == 0 && isinsets(g, c)) { 1563 cat = g->ncategories++; 1564 cats[c] = cat; 1565 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1566 if (cats[c2] == 0 && samesets(g, c, c2)) 1567 cats[c2] = cat; 1568 } 1569 } 1570 1571 /* 1572 - dupl - emit a duplicate of a bunch of sops 1573 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1574 */ 1575 static sopno /* start of duplicate */ 1576 dupl(p, start, finish) 1577 struct parse *p; 1578 sopno start; /* from here */ 1579 sopno finish; /* to this less one */ 1580 { 1581 sopno ret; 1582 sopno len = finish - start; 1583 1584 _DIAGASSERT(p != NULL); 1585 1586 ret = HERE(); 1587 1588 assert(finish >= start); 1589 if (len == 0) 1590 return(ret); 1591 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1592 assert(p->ssize >= p->slen + len); 1593 (void)memcpy(p->strip + p->slen, p->strip + start, 1594 (size_t)len * sizeof(sop)); 1595 p->slen += len; 1596 return(ret); 1597 } 1598 1599 /* 1600 - doemit - emit a strip operator 1601 == static void doemit(struct parse *p, sop op, size_t opnd); 1602 * 1603 * It might seem better to implement this as a macro with a function as 1604 * hard-case backup, but it's just too big and messy unless there are 1605 * some changes to the data structures. Maybe later. 1606 */ 1607 static void 1608 doemit(p, op, opnd) 1609 struct parse *p; 1610 sop op; 1611 sopno opnd; 1612 { 1613 1614 _DIAGASSERT(p != NULL); 1615 1616 /* avoid making error situations worse */ 1617 if (p->error != 0) 1618 return; 1619 1620 /* deal with oversize operands ("can't happen", more or less) */ 1621 assert(opnd < 1<<OPSHIFT); 1622 1623 /* deal with undersized strip */ 1624 if (p->slen >= p->ssize) 1625 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1626 assert(p->slen < p->ssize); 1627 1628 /* finally, it's all reduced to the easy case */ 1629 p->strip[p->slen++] = SOP(op, opnd); 1630 } 1631 1632 /* 1633 - doinsert - insert a sop into the strip 1634 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1635 */ 1636 static void 1637 doinsert(p, op, opnd, pos) 1638 struct parse *p; 1639 sop op; 1640 sopno opnd; 1641 sopno pos; 1642 { 1643 sopno sn; 1644 sop s; 1645 int i; 1646 1647 _DIAGASSERT(p != NULL); 1648 1649 /* avoid making error situations worse */ 1650 if (p->error != 0) 1651 return; 1652 1653 sn = HERE(); 1654 EMIT(op, opnd); /* do checks, ensure space */ 1655 assert(HERE() == sn+1); 1656 s = p->strip[sn]; 1657 1658 /* adjust paren pointers */ 1659 assert(pos > 0); 1660 for (i = 1; i < NPAREN; i++) { 1661 if (p->pbegin[i] >= pos) { 1662 p->pbegin[i]++; 1663 } 1664 if (p->pend[i] >= pos) { 1665 p->pend[i]++; 1666 } 1667 } 1668 1669 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1670 p->strip[pos] = s; 1671 } 1672 1673 /* 1674 - dofwd - complete a forward reference 1675 == static void dofwd(struct parse *p, sopno pos, sop value); 1676 */ 1677 static void 1678 dofwd(p, pos, value) 1679 struct parse *p; 1680 sopno pos; 1681 sopno value; 1682 { 1683 1684 _DIAGASSERT(p != NULL); 1685 1686 /* avoid making error situations worse */ 1687 if (p->error != 0) 1688 return; 1689 1690 assert(value < 1<<OPSHIFT); 1691 p->strip[pos] = OP(p->strip[pos]) | value; 1692 } 1693 1694 /* 1695 - enlarge - enlarge the strip 1696 == static void enlarge(struct parse *p, sopno size); 1697 */ 1698 static void 1699 enlarge(p, size) 1700 struct parse *p; 1701 sopno size; 1702 { 1703 sop *sp; 1704 1705 _DIAGASSERT(p != NULL); 1706 1707 if (p->ssize >= size) 1708 return; 1709 1710 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1711 if (sp == NULL) { 1712 SETERROR(REG_ESPACE); 1713 return; 1714 } 1715 p->strip = sp; 1716 p->ssize = size; 1717 } 1718 1719 /* 1720 - stripsnug - compact the strip 1721 == static void stripsnug(struct parse *p, struct re_guts *g); 1722 */ 1723 static void 1724 stripsnug(p, g) 1725 struct parse *p; 1726 struct re_guts *g; 1727 { 1728 1729 _DIAGASSERT(p != NULL); 1730 _DIAGASSERT(g != NULL); 1731 1732 g->nstates = p->slen; 1733 g->strip = realloc(p->strip, p->slen * sizeof(sop)); 1734 if (g->strip == NULL) { 1735 SETERROR(REG_ESPACE); 1736 g->strip = p->strip; 1737 } 1738 } 1739 1740 /* 1741 - findmust - fill in must and mlen with longest mandatory literal string 1742 == static void findmust(struct parse *p, struct re_guts *g); 1743 * 1744 * This algorithm could do fancy things like analyzing the operands of | 1745 * for common subsequences. Someday. This code is simple and finds most 1746 * of the interesting cases. 1747 * 1748 * Note that must and mlen got initialized during setup. 1749 */ 1750 static void 1751 findmust(p, g) 1752 struct parse *p; 1753 struct re_guts *g; 1754 { 1755 sop *scan; 1756 sop *start = NULL; 1757 sop *newstart = NULL; 1758 sopno newlen; 1759 sop s; 1760 char *cp; 1761 sopno i; 1762 1763 _DIAGASSERT(p != NULL); 1764 _DIAGASSERT(g != NULL); 1765 1766 /* avoid making error situations worse */ 1767 if (p->error != 0) 1768 return; 1769 1770 /* find the longest OCHAR sequence in strip */ 1771 newlen = 0; 1772 scan = g->strip + 1; 1773 do { 1774 s = *scan++; 1775 switch (OP(s)) { 1776 case OCHAR: /* sequence member */ 1777 if (newlen == 0) /* new sequence */ 1778 newstart = scan - 1; 1779 newlen++; 1780 break; 1781 case OPLUS_: /* things that don't break one */ 1782 case OLPAREN: 1783 case ORPAREN: 1784 break; 1785 case OQUEST_: /* things that must be skipped */ 1786 case OCH_: 1787 scan--; 1788 do { 1789 scan += OPND(s); 1790 s = *scan; 1791 /* assert() interferes w debug printouts */ 1792 if (OP(s) != O_QUEST && OP(s) != O_CH && 1793 OP(s) != OOR2) { 1794 g->iflags |= BAD; 1795 return; 1796 } 1797 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1798 /* FALLTHROUGH */ 1799 default: /* things that break a sequence */ 1800 if (newlen > g->mlen) { /* ends one */ 1801 start = newstart; 1802 g->mlen = newlen; 1803 } 1804 newlen = 0; 1805 break; 1806 } 1807 } while (OP(s) != OEND); 1808 1809 if (g->mlen == 0) /* there isn't one */ 1810 return; 1811 1812 /* turn it into a character string */ 1813 g->must = malloc((size_t)g->mlen + 1); 1814 if (g->must == NULL) { /* argh; just forget it */ 1815 g->mlen = 0; 1816 return; 1817 } 1818 cp = g->must; 1819 scan = start; 1820 for (i = g->mlen; i > 0; i--) { 1821 while (OP(s = *scan++) != OCHAR) 1822 continue; 1823 assert(cp < g->must + g->mlen); 1824 *cp++ = (char)OPND(s); 1825 } 1826 assert(cp == g->must + g->mlen); 1827 *cp++ = '\0'; /* just on general principles */ 1828 } 1829 1830 /* 1831 - pluscount - count + nesting 1832 == static sopno pluscount(struct parse *p, struct re_guts *g); 1833 */ 1834 static sopno /* nesting depth */ 1835 pluscount(p, g) 1836 struct parse *p; 1837 struct re_guts *g; 1838 { 1839 sop *scan; 1840 sop s; 1841 sopno plusnest = 0; 1842 sopno maxnest = 0; 1843 1844 _DIAGASSERT(p != NULL); 1845 _DIAGASSERT(g != NULL); 1846 1847 if (p->error != 0) 1848 return(0); /* there may not be an OEND */ 1849 1850 scan = g->strip + 1; 1851 do { 1852 s = *scan++; 1853 switch (OP(s)) { 1854 case OPLUS_: 1855 plusnest++; 1856 break; 1857 case O_PLUS: 1858 if (plusnest > maxnest) 1859 maxnest = plusnest; 1860 plusnest--; 1861 break; 1862 } 1863 } while (OP(s) != OEND); 1864 if (plusnest != 0) 1865 g->iflags |= BAD; 1866 return(maxnest); 1867 } 1868