1 /* $NetBSD: regcomp.c,v 1.2 2013/11/22 15:52:06 christos Exp $ */ 2 /*- 3 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer of the University of Toronto. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)regcomp.c 8.4 (Berkeley) 3/19/94 39 */ 40 41 #if defined(LIBC_SCCS) && !defined(lint) 42 static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94"; 43 #endif /* LIBC_SCCS and not lint */ 44 45 #include <sys/types.h> 46 #include <stdio.h> 47 #include <string.h> 48 #include <ctype.h> 49 #include <limits.h> 50 #include <stdlib.h> 51 #include <regex.h> 52 53 #include "utils.h" 54 #include "regex2.h" 55 56 #include "cclass.h" 57 #include "cname.h" 58 59 /* 60 * parse structure, passed up and down to avoid global variables and 61 * other clumsinesses 62 */ 63 struct parse { 64 RCHAR_T *next; /* next character in RE */ 65 RCHAR_T *end; /* end of string (-> NUL normally) */ 66 int error; /* has an error been seen? */ 67 sop *strip; /* malloced strip */ 68 RCHAR_T *stripdata; /* malloced stripdata */ 69 sopno ssize; /* malloced strip size (allocated) */ 70 sopno slen; /* malloced strip length (used) */ 71 int ncsalloc; /* number of csets allocated */ 72 struct re_guts *g; 73 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 74 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 75 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 76 }; 77 78 /* ========= begin header generated by ./mkh ========= */ 79 #ifdef __cplusplus 80 extern "C" { 81 #endif 82 83 /* === regcomp.c === */ 84 static void p_ere __P((struct parse *p, int stop, size_t reclimit)); 85 static void p_ere_exp __P((struct parse *p, size_t reclimit)); 86 static void p_str __P((struct parse *p)); 87 static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit)); 88 static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit)); 89 static int p_count __P((struct parse *p)); 90 static void p_bracket __P((struct parse *p)); 91 static void p_b_term __P((struct parse *p, cset *cs)); 92 static void p_b_cclass __P((struct parse *p, cset *cs)); 93 static void p_b_eclass __P((struct parse *p, cset *cs)); 94 static char p_b_symbol __P((struct parse *p)); 95 static char p_b_coll_elem __P((struct parse *p, int endc)); 96 static char othercase __P((int ch)); 97 static void bothcases __P((struct parse *p, int ch)); 98 static void ordinary __P((struct parse *p, int ch)); 99 static void nonnewline __P((struct parse *p)); 100 static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit)); 101 static int seterr __P((struct parse *p, int e)); 102 static cset *allocset __P((struct parse *p)); 103 static void freeset __P((struct parse *p, cset *cs)); 104 static int freezeset __P((struct parse *p, cset *cs)); 105 static int firstch __P((struct parse *p, cset *cs)); 106 static int nch __P((struct parse *p, cset *cs)); 107 static void mcadd __P((struct parse *p, cset *cs, const char *cp)); 108 #ifdef notdef 109 static void mcsub __P((cset *cs, char *cp)); 110 static int mcin __P((cset *cs, char *cp)); 111 static char *mcfind __P((cset *cs, char *cp)); 112 #endif 113 static void mcinvert __P((struct parse *p, cset *cs)); 114 static void mccase __P((struct parse *p, cset *cs)); 115 #ifdef notdef 116 static int isinsets __P((struct re_guts *g, int c)); 117 static int samesets __P((struct re_guts *g, int c1, int c2)); 118 #endif 119 static void categorize __P((struct parse *p, struct re_guts *g)); 120 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 121 static void doemit __P((struct parse *p, sop op, size_t opnd)); 122 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 123 static void dofwd __P((struct parse *p, sopno pos, sop value)); 124 static int enlarge __P((struct parse *p, sopno size)); 125 static void stripsnug __P((struct parse *p, struct re_guts *g)); 126 static void findmust __P((struct parse *p, struct re_guts *g)); 127 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 128 129 #ifdef __cplusplus 130 } 131 #endif 132 /* ========= end header generated by ./mkh ========= */ 133 134 static RCHAR_T nuls[10]; /* place to point scanner in event of error */ 135 136 /* 137 * macros for use with parse structure 138 * BEWARE: these know that the parse structure is named `p' !!! 139 */ 140 #define PEEK() (*p->next) 141 #define PEEK2() (*(p->next+1)) 142 #define MORE() (p->next < p->end) 143 #define MORE2() (p->next+1 < p->end) 144 #define SEE(c) (MORE() && PEEK() == (c)) 145 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 146 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 147 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 148 #define NEXT() (p->next++) 149 #define NEXT2() (p->next += 2) 150 #define NEXTn(n) (p->next += (n)) 151 #define GETNEXT() (*p->next++) 152 #define SETERROR(e) seterr(p, (e)) 153 #define REQUIRE(co, e) ((co) || SETERROR(e)) 154 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 155 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 156 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 157 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 158 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 159 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 160 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 161 #define HERE() (p->slen) 162 #define THERE() (p->slen - 1) 163 #define THERETHERE() (p->slen - 2) 164 #define DROP(n) (p->slen -= (n)) 165 166 #ifndef NDEBUG 167 static int never = 0; /* for use in asserts; shuts lint up */ 168 #else 169 #define never 0 /* some <assert.h>s have bugs too */ 170 #endif 171 172 #define MEMLIMIT 0x8000000 173 #define MEMSIZE(p) \ 174 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 175 (p)->ncsalloc * sizeof(cset) + \ 176 (p)->ssize * sizeof(sop)) 177 #define RECLIMIT 256 178 179 /* 180 - regcomp - interface for parser and compilation 181 = extern int regcomp(regex_t *, const RCHAR_T *, int); 182 = #define REG_BASIC 0000 183 = #define REG_EXTENDED 0001 184 = #define REG_ICASE 0002 185 = #define REG_NOSUB 0004 186 = #define REG_NEWLINE 0010 187 = #define REG_NOSPEC 0020 188 = #define REG_PEND 0040 189 = #define REG_DUMP 0200 190 */ 191 int /* 0 success, otherwise REG_something */ 192 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags) 193 { 194 struct parse pa; 195 register struct re_guts *g; 196 register struct parse *p = &pa; 197 register int i; 198 register size_t len; 199 #ifdef REDEBUG 200 # define GOODFLAGS(f) (f) 201 #else 202 # define GOODFLAGS(f) ((f)&~REG_DUMP) 203 #endif 204 205 cflags = GOODFLAGS(cflags); 206 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 207 return(REG_INVARG); 208 209 if (cflags®_PEND) { 210 if (preg->re_endp < pattern) 211 return(REG_INVARG); 212 len = preg->re_endp - pattern; 213 } else 214 len = STRLEN(pattern); 215 216 /* do the mallocs early so failure handling is easy */ 217 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 218 (NC-1)*sizeof(cat_t)); 219 if (g == NULL) 220 return(REG_ESPACE); 221 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 222 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 223 if (p->strip == NULL) { 224 free((char *)g); 225 return(REG_ESPACE); 226 } 227 p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T)); 228 if (p->stripdata == NULL) { 229 free((char *)p->strip); 230 free((char *)g); 231 return(REG_ESPACE); 232 } 233 p->slen = 0; 234 235 /* set things up */ 236 p->g = g; 237 p->next = (RCHAR_T *)__UNCONST(pattern); /* convenience; we do not modify it */ 238 p->end = p->next + len; 239 p->error = 0; 240 p->ncsalloc = 0; 241 for (i = 0; i < NPAREN; i++) { 242 p->pbegin[i] = 0; 243 p->pend[i] = 0; 244 } 245 g->csetsize = NC; 246 g->sets = NULL; 247 g->setbits = NULL; 248 g->ncsets = 0; 249 g->cflags = cflags; 250 g->iflags = 0; 251 g->nbol = 0; 252 g->neol = 0; 253 g->must = NULL; 254 g->mlen = 0; 255 g->nsub = 0; 256 #if 0 257 g->ncategories = 1; /* category 0 is "everything else" */ 258 g->categories = &g->catspace[-(CHAR_MIN)]; 259 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 260 #endif 261 g->backrefs = 0; 262 263 /* do it */ 264 EMIT(OEND, 0); 265 g->firststate = THERE(); 266 if (cflags®_EXTENDED) 267 p_ere(p, OUT, 0); 268 else if (cflags®_NOSPEC) 269 p_str(p); 270 else 271 p_bre(p, OUT, OUT, 0); 272 EMIT(OEND, 0); 273 g->laststate = THERE(); 274 275 /* tidy up loose ends and fill things in */ 276 categorize(p, g); 277 stripsnug(p, g); 278 findmust(p, g); 279 g->nplus = pluscount(p, g); 280 g->magic = MAGIC2; 281 preg->re_nsub = g->nsub; 282 preg->re_g = g; 283 preg->re_magic = MAGIC1; 284 #ifndef REDEBUG 285 /* not debugging, so can't rely on the assert() in regexec() */ 286 if (g->iflags&BAD) 287 SETERROR(REG_ASSERT); 288 #endif 289 290 /* win or lose, we're done */ 291 if (p->error != 0) /* lose */ 292 regfree(preg); 293 return(p->error); 294 } 295 296 /* 297 - p_ere - ERE parser top level, concatenation and alternation 298 == static void p_ere(register struct parse *p, int stop, size_t reclimit); 299 */ 300 static void 301 p_ere(register struct parse *p, int stop, size_t reclimit) 302 303 /* character this ERE should end at */ 304 { 305 register char c; 306 register sopno prevback = 0; 307 register sopno prevfwd = 0; 308 register sopno conc; 309 register int first = 1; /* is this the first alternative? */ 310 311 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 312 p->error = REG_ESPACE; 313 return; 314 } 315 316 for (;;) { 317 /* do a bunch of concatenated expressions */ 318 conc = HERE(); 319 while (MORE() && (c = PEEK()) != '|' && c != stop) 320 p_ere_exp(p, reclimit); 321 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 322 323 if (!EAT('|')) 324 break; /* NOTE BREAK OUT */ 325 326 if (first) { 327 INSERT(OCH_, conc); /* offset is wrong */ 328 prevfwd = conc; 329 prevback = conc; 330 first = 0; 331 } 332 ASTERN(OOR1, prevback); 333 prevback = THERE(); 334 AHEAD(prevfwd); /* fix previous offset */ 335 prevfwd = HERE(); 336 EMIT(OOR2, 0); /* offset is very wrong */ 337 } 338 339 if (!first) { /* tail-end fixups */ 340 AHEAD(prevfwd); 341 ASTERN(O_CH, prevback); 342 } 343 344 assert(!MORE() || SEE(stop)); 345 } 346 347 /* 348 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 349 == static void p_ere_exp(register struct parse *p); 350 */ 351 static void 352 p_ere_exp(register struct parse *p, size_t reclimit) 353 { 354 register char c; 355 register sopno pos; 356 register int count; 357 register int count2; 358 register sopno subno; 359 int wascaret = 0; 360 361 assert(MORE()); /* caller should have ensured this */ 362 c = GETNEXT(); 363 364 pos = HERE(); 365 switch (c) { 366 case '(': 367 (void)REQUIRE(MORE(), REG_EPAREN); 368 p->g->nsub++; 369 subno = p->g->nsub; 370 if (subno < NPAREN) 371 p->pbegin[subno] = HERE(); 372 EMIT(OLPAREN, subno); 373 if (!SEE(')')) 374 p_ere(p, ')', reclimit); 375 if (subno < NPAREN) { 376 p->pend[subno] = HERE(); 377 assert(p->pend[subno] != 0); 378 } 379 EMIT(ORPAREN, subno); 380 (void)MUSTEAT(')', REG_EPAREN); 381 break; 382 #ifndef POSIX_MISTAKE 383 case ')': /* happens only if no current unmatched ( */ 384 /* 385 * You may ask, why the ifndef? Because I didn't notice 386 * this until slightly too late for 1003.2, and none of the 387 * other 1003.2 regular-expression reviewers noticed it at 388 * all. So an unmatched ) is legal POSIX, at least until 389 * we can get it fixed. 390 */ 391 SETERROR(REG_EPAREN); 392 break; 393 #endif 394 case '^': 395 EMIT(OBOL, 0); 396 p->g->iflags |= USEBOL; 397 p->g->nbol++; 398 wascaret = 1; 399 break; 400 case '$': 401 EMIT(OEOL, 0); 402 p->g->iflags |= USEEOL; 403 p->g->neol++; 404 break; 405 case '|': 406 SETERROR(REG_EMPTY); 407 break; 408 case '*': 409 case '+': 410 case '?': 411 SETERROR(REG_BADRPT); 412 break; 413 case '.': 414 if (p->g->cflags®_NEWLINE) 415 nonnewline(p); 416 else 417 EMIT(OANY, 0); 418 break; 419 case '[': 420 p_bracket(p); 421 break; 422 case '\\': 423 (void)REQUIRE(MORE(), REG_EESCAPE); 424 c = GETNEXT(); 425 ordinary(p, c); 426 break; 427 case '{': /* okay as ordinary except if digit follows */ 428 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT); 429 /* FALLTHROUGH */ 430 default: 431 ordinary(p, c); 432 break; 433 } 434 435 if (!MORE()) 436 return; 437 c = PEEK(); 438 /* we call { a repetition if followed by a digit */ 439 if (!( c == '*' || c == '+' || c == '?' || 440 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) )) 441 return; /* no repetition, we're done */ 442 NEXT(); 443 444 (void)REQUIRE(!wascaret, REG_BADRPT); 445 switch (c) { 446 case '*': /* implemented as +? */ 447 /* this case does not require the (y|) trick, noKLUDGE */ 448 INSERT(OPLUS_, pos); 449 ASTERN(O_PLUS, pos); 450 INSERT(OQUEST_, pos); 451 ASTERN(O_QUEST, pos); 452 break; 453 case '+': 454 INSERT(OPLUS_, pos); 455 ASTERN(O_PLUS, pos); 456 break; 457 case '?': 458 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 459 INSERT(OCH_, pos); /* offset slightly wrong */ 460 ASTERN(OOR1, pos); /* this one's right */ 461 AHEAD(pos); /* fix the OCH_ */ 462 EMIT(OOR2, 0); /* offset very wrong... */ 463 AHEAD(THERE()); /* ...so fix it */ 464 ASTERN(O_CH, THERETHERE()); 465 break; 466 case '{': 467 count = p_count(p); 468 if (EAT(',')) { 469 if (ISDIGIT((UCHAR_T)PEEK())) { 470 count2 = p_count(p); 471 (void)REQUIRE(count <= count2, REG_BADBR); 472 } else /* single number with comma */ 473 count2 = INFINITY; 474 } else /* just a single number */ 475 count2 = count; 476 repeat(p, pos, count, count2, 0); 477 if (!EAT('}')) { /* error heuristics */ 478 while (MORE() && PEEK() != '}') 479 NEXT(); 480 (void)REQUIRE(MORE(), REG_EBRACE); 481 SETERROR(REG_BADBR); 482 } 483 break; 484 } 485 486 if (!MORE()) 487 return; 488 c = PEEK(); 489 if (!( c == '*' || c == '+' || c == '?' || 490 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) ) 491 return; 492 SETERROR(REG_BADRPT); 493 } 494 495 /* 496 - p_str - string (no metacharacters) "parser" 497 == static void p_str(register struct parse *p); 498 */ 499 static void 500 p_str(register struct parse *p) 501 { 502 (void)REQUIRE(MORE(), REG_EMPTY); 503 while (MORE()) 504 ordinary(p, GETNEXT()); 505 } 506 507 /* 508 - p_bre - BRE parser top level, anchoring and concatenation 509 == static void p_bre(register struct parse *p, register int end1, \ 510 == register int end2, size_t reclimit); 511 * Giving end1 as OUT essentially eliminates the end1/end2 check. 512 * 513 * This implementation is a bit of a kludge, in that a trailing $ is first 514 * taken as an ordinary character and then revised to be an anchor. The 515 * only undesirable side effect is that '$' gets included as a character 516 * category in such cases. This is fairly harmless; not worth fixing. 517 * The amount of lookahead needed to avoid this kludge is excessive. 518 */ 519 static void 520 p_bre(register struct parse *p, register int end1, register int end2, size_t reclimit) 521 522 /* first terminating character */ 523 /* second terminating character */ 524 { 525 register sopno start; 526 register int first = 1; /* first subexpression? */ 527 register int wasdollar = 0; 528 529 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 530 p->error = REG_ESPACE; 531 return; 532 } 533 534 start = HERE(); 535 536 if (EAT('^')) { 537 EMIT(OBOL, 0); 538 p->g->iflags |= USEBOL; 539 p->g->nbol++; 540 } 541 while (MORE() && !SEETWO(end1, end2)) { 542 wasdollar = p_simp_re(p, first, reclimit); 543 first = 0; 544 } 545 if (wasdollar) { /* oops, that was a trailing anchor */ 546 DROP(1); 547 EMIT(OEOL, 0); 548 p->g->iflags |= USEEOL; 549 p->g->neol++; 550 } 551 552 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 553 } 554 555 /* 556 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 557 == static int p_simp_re(register struct parse *p, int starordinary, size_t reclimit); 558 */ 559 static int /* was the simple RE an unbackslashed $? */ 560 p_simp_re(register struct parse *p, int starordinary, size_t reclimit) 561 562 /* is a leading * an ordinary character? */ 563 { 564 register int c; 565 register int count; 566 register int count2; 567 register sopno pos; 568 register int i; 569 register sopno subno; 570 int backsl; 571 572 pos = HERE(); /* repetion op, if any, covers from here */ 573 574 assert(MORE()); /* caller should have ensured this */ 575 c = GETNEXT(); 576 backsl = c == '\\'; 577 if (backsl) { 578 (void)REQUIRE(MORE(), REG_EESCAPE); 579 c = (unsigned char)GETNEXT(); 580 switch (c) { 581 case '{': 582 SETERROR(REG_BADRPT); 583 break; 584 case '(': 585 p->g->nsub++; 586 subno = p->g->nsub; 587 if (subno < NPAREN) 588 p->pbegin[subno] = HERE(); 589 EMIT(OLPAREN, subno); 590 /* the MORE here is an error heuristic */ 591 if (MORE() && !SEETWO('\\', ')')) 592 p_bre(p, '\\', ')', reclimit); 593 if (subno < NPAREN) { 594 p->pend[subno] = HERE(); 595 assert(p->pend[subno] != 0); 596 } 597 EMIT(ORPAREN, subno); 598 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 599 break; 600 case ')': /* should not get here -- must be user */ 601 case '}': 602 SETERROR(REG_EPAREN); 603 break; 604 case '1': 605 case '2': 606 case '3': 607 case '4': 608 case '5': 609 case '6': 610 case '7': 611 case '8': 612 case '9': 613 i = c - '0'; 614 assert(i < NPAREN); 615 if (p->pend[i] != 0) { 616 assert(i <= p->g->nsub); 617 EMIT(OBACK_, i); 618 assert(p->pbegin[i] != 0); 619 assert(p->strip[p->pbegin[i]] == OLPAREN); 620 assert(p->strip[p->pend[i]] == ORPAREN); 621 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 622 EMIT(O_BACK, i); 623 } else 624 SETERROR(REG_ESUBREG); 625 p->g->backrefs = 1; 626 break; 627 default: 628 ordinary(p, c); 629 break; 630 } 631 } else { 632 switch (c) { 633 case '.': 634 if (p->g->cflags®_NEWLINE) 635 nonnewline(p); 636 else 637 EMIT(OANY, 0); 638 break; 639 case '[': 640 p_bracket(p); 641 break; 642 case '*': 643 (void)REQUIRE(starordinary, REG_BADRPT); 644 /* FALLTHROUGH */ 645 default: 646 ordinary(p, c); 647 break; 648 } 649 } 650 651 if (EAT('*')) { /* implemented as +? */ 652 /* this case does not require the (y|) trick, noKLUDGE */ 653 INSERT(OPLUS_, pos); 654 ASTERN(O_PLUS, pos); 655 INSERT(OQUEST_, pos); 656 ASTERN(O_QUEST, pos); 657 } else if (EATTWO('\\', '{')) { 658 count = p_count(p); 659 if (EAT(',')) { 660 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) { 661 count2 = p_count(p); 662 (void)REQUIRE(count <= count2, REG_BADBR); 663 } else /* single number with comma */ 664 count2 = INFINITY; 665 } else /* just a single number */ 666 count2 = count; 667 repeat(p, pos, count, count2, reclimit); 668 if (!EATTWO('\\', '}')) { /* error heuristics */ 669 while (MORE() && !SEETWO('\\', '}')) 670 NEXT(); 671 (void)REQUIRE(MORE(), REG_EBRACE); 672 SETERROR(REG_BADBR); 673 } 674 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */ 675 return(1); 676 677 return(0); 678 } 679 680 /* 681 - p_count - parse a repetition count 682 == static int p_count(register struct parse *p); 683 */ 684 static int /* the value */ 685 p_count(register struct parse *p) 686 { 687 register int count = 0; 688 register int ndigits = 0; 689 690 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) { 691 count = count*10 + (GETNEXT() - '0'); 692 ndigits++; 693 } 694 695 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 696 return(count); 697 } 698 699 /* 700 - p_bracket - parse a bracketed character list 701 == static void p_bracket(register struct parse *p); 702 * 703 * Note a significant property of this code: if the allocset() did SETERROR, 704 * no set operations are done. 705 */ 706 static void 707 p_bracket(register struct parse *p) 708 { 709 register cset *cs; 710 register int invert = 0; 711 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' }; 712 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' }; 713 714 cs = allocset(p); 715 if (cs == NULL) 716 return; 717 718 /* Dept of Truly Sickening Special-Case Kludges */ 719 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) { 720 EMIT(OBOW, 0); 721 NEXTn(6); 722 return; 723 } 724 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) { 725 EMIT(OEOW, 0); 726 NEXTn(6); 727 return; 728 } 729 730 if (EAT('^')) 731 invert++; /* make note to invert set at end */ 732 if (EAT(']')) 733 CHadd(cs, ']'); 734 else if (EAT('-')) 735 CHadd(cs, '-'); 736 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 737 p_b_term(p, cs); 738 if (EAT('-')) 739 CHadd(cs, '-'); 740 (void)MUSTEAT(']', REG_EBRACK); 741 742 if (p->error != 0) /* don't mess things up further */ 743 return; 744 745 if (p->g->cflags®_ICASE) { 746 register int i; 747 register int ci; 748 749 for (i = p->g->csetsize - 1; i >= 0; i--) 750 if (CHIN(cs, i) && isalpha(i)) { 751 ci = othercase(i); 752 if (ci != i) 753 CHadd(cs, ci); 754 } 755 if (cs->multis != NULL) 756 mccase(p, cs); 757 } 758 if (invert) { 759 register int i; 760 761 for (i = p->g->csetsize - 1; i >= 0; i--) 762 if (CHIN(cs, i)) 763 CHsub(cs, i); 764 else 765 CHadd(cs, i); 766 if (p->g->cflags®_NEWLINE) 767 CHsub(cs, '\n'); 768 if (cs->multis != NULL) 769 mcinvert(p, cs); 770 } 771 772 assert(cs->multis == NULL); /* xxx */ 773 774 if (nch(p, cs) == 1) { /* optimize singleton sets */ 775 ordinary(p, firstch(p, cs)); 776 freeset(p, cs); 777 } else 778 EMIT(OANYOF, freezeset(p, cs)); 779 } 780 781 /* 782 - p_b_term - parse one term of a bracketed character list 783 == static void p_b_term(register struct parse *p, register cset *cs); 784 */ 785 static void 786 p_b_term(register struct parse *p, register cset *cs) 787 { 788 register char c; 789 register char start, finish; 790 register int i; 791 792 /* classify what we've got */ 793 switch ((MORE()) ? PEEK() : '\0') { 794 case '[': 795 c = (MORE2()) ? PEEK2() : '\0'; 796 break; 797 case '-': 798 SETERROR(REG_ERANGE); 799 return; /* NOTE RETURN */ 800 break; 801 default: 802 c = '\0'; 803 break; 804 } 805 806 switch (c) { 807 case ':': /* character class */ 808 NEXT2(); 809 (void)REQUIRE(MORE(), REG_EBRACK); 810 c = PEEK(); 811 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 812 p_b_cclass(p, cs); 813 (void)REQUIRE(MORE(), REG_EBRACK); 814 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 815 break; 816 case '=': /* equivalence class */ 817 NEXT2(); 818 (void)REQUIRE(MORE(), REG_EBRACK); 819 c = PEEK(); 820 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 821 p_b_eclass(p, cs); 822 (void)REQUIRE(MORE(), REG_EBRACK); 823 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 824 break; 825 default: /* symbol, ordinary character, or range */ 826 /* xxx revision needed for multichar stuff */ 827 start = p_b_symbol(p); 828 if (SEE('-') && MORE2() && PEEK2() != ']') { 829 /* range */ 830 NEXT(); 831 if (EAT('-')) 832 finish = '-'; 833 else 834 finish = p_b_symbol(p); 835 } else 836 finish = start; 837 /* xxx what about signed chars here... */ 838 (void)REQUIRE(start <= finish, REG_ERANGE); 839 for (i = start; i <= finish; i++) 840 CHadd(cs, i); 841 break; 842 } 843 } 844 845 /* 846 - p_b_cclass - parse a character-class name and deal with it 847 == static void p_b_cclass(register struct parse *p, register cset *cs); 848 */ 849 static void 850 p_b_cclass(register struct parse *p, register cset *cs) 851 { 852 register RCHAR_T *sp = p->next; 853 register struct cclass *cp; 854 register size_t len; 855 register const char *u; 856 register char c; 857 858 while (MORE() && isalpha(PEEK())) 859 NEXT(); 860 len = p->next - sp; 861 for (cp = cclasses; cp->name != NULL; cp++) 862 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len)) 863 break; 864 if (cp->name == NULL) { 865 /* oops, didn't find it */ 866 SETERROR(REG_ECTYPE); 867 return; 868 } 869 870 u = cp->chars; 871 while ((c = *u++) != '\0') 872 CHadd(cs, c); 873 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 874 MCadd(p, cs, u); 875 } 876 877 /* 878 - p_b_eclass - parse an equivalence-class name and deal with it 879 == static void p_b_eclass(register struct parse *p, register cset *cs); 880 * 881 * This implementation is incomplete. xxx 882 */ 883 static void 884 p_b_eclass(register struct parse *p, register cset *cs) 885 { 886 register char c; 887 888 c = p_b_coll_elem(p, '='); 889 CHadd(cs, c); 890 } 891 892 /* 893 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 894 == static char p_b_symbol(register struct parse *p); 895 */ 896 static char /* value of symbol */ 897 p_b_symbol(register struct parse *p) 898 { 899 register char value; 900 901 (void)REQUIRE(MORE(), REG_EBRACK); 902 if (!EATTWO('[', '.')) 903 return(GETNEXT()); 904 905 /* collating symbol */ 906 value = p_b_coll_elem(p, '.'); 907 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 908 return(value); 909 } 910 911 /* 912 - p_b_coll_elem - parse a collating-element name and look it up 913 == static char p_b_coll_elem(register struct parse *p, int endc); 914 */ 915 static char /* value of collating element */ 916 p_b_coll_elem(register struct parse *p, int endc) 917 918 /* name ended by endc,']' */ 919 { 920 register RCHAR_T *sp = p->next; 921 register struct cname *cp; 922 register size_t len; 923 924 while (MORE() && !SEETWO(endc, ']')) 925 NEXT(); 926 if (!MORE()) { 927 SETERROR(REG_EBRACK); 928 return(0); 929 } 930 len = p->next - sp; 931 for (cp = cnames; cp->name != NULL; cp++) 932 if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len)) 933 return(cp->code); /* known name */ 934 if (len == 1) 935 return(*sp); /* single character */ 936 SETERROR(REG_ECOLLATE); /* neither */ 937 return(0); 938 } 939 940 /* 941 - othercase - return the case counterpart of an alphabetic 942 == static char othercase(int ch); 943 */ 944 static char /* if no counterpart, return ch */ 945 othercase(int ch) 946 { 947 assert(isalpha(ch)); 948 if (isupper(ch)) 949 return(tolower(ch)); 950 else if (islower(ch)) 951 return(toupper(ch)); 952 else /* peculiar, but could happen */ 953 return(ch); 954 } 955 956 /* 957 - bothcases - emit a dualcase version of a two-case character 958 == static void bothcases(register struct parse *p, int ch); 959 * 960 * Boy, is this implementation ever a kludge... 961 */ 962 static void 963 bothcases(register struct parse *p, int ch) 964 { 965 register RCHAR_T *oldnext = p->next; 966 register RCHAR_T *oldend = p->end; 967 RCHAR_T bracket[3]; 968 969 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 970 p->next = bracket; 971 p->end = bracket+2; 972 bracket[0] = ch; 973 bracket[1] = ']'; 974 bracket[2] = '\0'; 975 p_bracket(p); 976 assert(p->next == bracket+2); 977 p->next = oldnext; 978 p->end = oldend; 979 } 980 981 /* 982 - ordinary - emit an ordinary character 983 == static void ordinary(register struct parse *p, register int ch); 984 */ 985 static void 986 ordinary(register struct parse *p, register int ch) 987 { 988 /* 989 register cat_t *cap = p->g->categories; 990 */ 991 992 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 993 bothcases(p, ch); 994 else { 995 EMIT(OCHAR, (UCHAR_T)ch); 996 /* 997 if (cap[ch] == 0) 998 cap[ch] = p->g->ncategories++; 999 */ 1000 } 1001 } 1002 1003 /* 1004 - nonnewline - emit REG_NEWLINE version of OANY 1005 == static void nonnewline(register struct parse *p); 1006 * 1007 * Boy, is this implementation ever a kludge... 1008 */ 1009 static void 1010 nonnewline(register struct parse *p) 1011 { 1012 register RCHAR_T *oldnext = p->next; 1013 register RCHAR_T *oldend = p->end; 1014 RCHAR_T bracket[4]; 1015 1016 p->next = bracket; 1017 p->end = bracket+3; 1018 bracket[0] = '^'; 1019 bracket[1] = '\n'; 1020 bracket[2] = ']'; 1021 bracket[3] = '\0'; 1022 p_bracket(p); 1023 assert(p->next == bracket+3); 1024 p->next = oldnext; 1025 p->end = oldend; 1026 } 1027 1028 /* 1029 - repeat - generate code for a bounded repetition, recursively if needed 1030 == static void repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit); 1031 */ 1032 static void 1033 repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit) 1034 1035 /* operand from here to end of strip */ 1036 /* repeated from this number */ 1037 /* to this number of times (maybe INFINITY) */ 1038 { 1039 register sopno finish; 1040 # define N 2 1041 # define INF 3 1042 # define REP(f, t) ((f)*8 + (t)) 1043 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1044 register sopno copy; 1045 1046 if (reclimit++ > RECLIMIT) 1047 p->error = REG_ESPACE; 1048 if (p->error) 1049 return; 1050 1051 finish = HERE(); 1052 1053 assert(from <= to); 1054 1055 switch (REP(MAP(from), MAP(to))) { 1056 case REP(0, 0): /* must be user doing this */ 1057 DROP(finish-start); /* drop the operand */ 1058 break; 1059 case REP(0, 1): /* as x{1,1}? */ 1060 case REP(0, N): /* as x{1,n}? */ 1061 case REP(0, INF): /* as x{1,}? */ 1062 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1063 INSERT(OCH_, start); /* offset is wrong... */ 1064 repeat(p, start+1, 1, to, reclimit); 1065 ASTERN(OOR1, start); 1066 AHEAD(start); /* ... fix it */ 1067 EMIT(OOR2, 0); 1068 AHEAD(THERE()); 1069 ASTERN(O_CH, THERETHERE()); 1070 break; 1071 case REP(1, 1): /* trivial case */ 1072 /* done */ 1073 break; 1074 case REP(1, N): /* as x?x{1,n-1} */ 1075 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1076 INSERT(OCH_, start); 1077 ASTERN(OOR1, start); 1078 AHEAD(start); 1079 EMIT(OOR2, 0); /* offset very wrong... */ 1080 AHEAD(THERE()); /* ...so fix it */ 1081 ASTERN(O_CH, THERETHERE()); 1082 copy = dupl(p, start+1, finish+1); 1083 assert(copy == finish+4); 1084 repeat(p, copy, 1, to-1, reclimit); 1085 break; 1086 case REP(1, INF): /* as x+ */ 1087 INSERT(OPLUS_, start); 1088 ASTERN(O_PLUS, start); 1089 break; 1090 case REP(N, N): /* as xx{m-1,n-1} */ 1091 copy = dupl(p, start, finish); 1092 repeat(p, copy, from-1, to-1, reclimit); 1093 break; 1094 case REP(N, INF): /* as xx{n-1,INF} */ 1095 copy = dupl(p, start, finish); 1096 repeat(p, copy, from-1, to, reclimit); 1097 break; 1098 default: /* "can't happen" */ 1099 SETERROR(REG_ASSERT); /* just in case */ 1100 break; 1101 } 1102 } 1103 1104 /* 1105 - seterr - set an error condition 1106 == static int seterr(register struct parse *p, int e); 1107 */ 1108 static int /* useless but makes type checking happy */ 1109 seterr(register struct parse *p, int e) 1110 { 1111 if (p->error == 0) /* keep earliest error condition */ 1112 p->error = e; 1113 p->next = nuls; /* try to bring things to a halt */ 1114 p->end = nuls; 1115 return(0); /* make the return value well-defined */ 1116 } 1117 1118 /* 1119 - allocset - allocate a set of characters for [] 1120 == static cset *allocset(register struct parse *p); 1121 */ 1122 static cset * 1123 allocset(register struct parse *p) 1124 { 1125 register int no = p->g->ncsets++; 1126 register size_t nc; 1127 register size_t nbytes; 1128 register cset *cs; 1129 register size_t css = (size_t)p->g->csetsize; 1130 register int i; 1131 1132 if (no >= p->ncsalloc) { /* need another column of space */ 1133 p->ncsalloc += CHAR_BIT; 1134 nc = p->ncsalloc; 1135 assert(nc % CHAR_BIT == 0); 1136 nbytes = nc / CHAR_BIT * css; 1137 if (MEMSIZE(p) > MEMLIMIT) 1138 goto oomem; 1139 if (p->g->sets == NULL) 1140 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1141 else 1142 p->g->sets = (cset *)realloc((char *)p->g->sets, 1143 nc * sizeof(cset)); 1144 if (p->g->setbits == NULL) 1145 p->g->setbits = (uch *)malloc(nbytes); 1146 else { 1147 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1148 nbytes); 1149 /* xxx this isn't right if setbits is now NULL */ 1150 for (i = 0; i < no; i++) 1151 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1152 } 1153 if (p->g->sets != NULL && p->g->setbits != NULL) 1154 (void) memset((char *)p->g->setbits + (nbytes - css), 1155 0, css); 1156 else { 1157 oomem: 1158 no = 0; 1159 SETERROR(REG_ESPACE); 1160 /* caller's responsibility not to do set ops */ 1161 return NULL; 1162 } 1163 } 1164 1165 cs = &p->g->sets[no]; 1166 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1167 cs->mask = 1 << ((no) % CHAR_BIT); 1168 cs->hash = 0; 1169 cs->smultis = 0; 1170 cs->multis = NULL; 1171 1172 return(cs); 1173 } 1174 1175 /* 1176 - freeset - free a now-unused set 1177 == static void freeset(register struct parse *p, register cset *cs); 1178 */ 1179 static void 1180 freeset(register struct parse *p, register cset *cs) 1181 { 1182 register size_t i; 1183 register cset *top = &p->g->sets[p->g->ncsets]; 1184 register size_t css = (size_t)p->g->csetsize; 1185 1186 for (i = 0; i < css; i++) 1187 CHsub(cs, i); 1188 if (cs == top-1) /* recover only the easy case */ 1189 p->g->ncsets--; 1190 } 1191 1192 /* 1193 - freezeset - final processing on a set of characters 1194 == static int freezeset(register struct parse *p, register cset *cs); 1195 * 1196 * The main task here is merging identical sets. This is usually a waste 1197 * of time (although the hash code minimizes the overhead), but can win 1198 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1199 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1200 * the same value! 1201 */ 1202 static int /* set number */ 1203 freezeset(register struct parse *p, register cset *cs) 1204 { 1205 register uch h = cs->hash; 1206 register size_t i; 1207 register cset *top = &p->g->sets[p->g->ncsets]; 1208 register cset *cs2; 1209 register size_t css = (size_t)p->g->csetsize; 1210 1211 /* look for an earlier one which is the same */ 1212 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1213 if (cs2->hash == h && cs2 != cs) { 1214 /* maybe */ 1215 for (i = 0; i < css; i++) 1216 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1217 break; /* no */ 1218 if (i == css) 1219 break; /* yes */ 1220 } 1221 1222 if (cs2 < top) { /* found one */ 1223 freeset(p, cs); 1224 cs = cs2; 1225 } 1226 1227 return((int)(cs - p->g->sets)); 1228 } 1229 1230 /* 1231 - firstch - return first character in a set (which must have at least one) 1232 == static int firstch(register struct parse *p, register cset *cs); 1233 */ 1234 static int /* character; there is no "none" value */ 1235 firstch(register struct parse *p, register cset *cs) 1236 { 1237 register size_t i; 1238 register size_t css = (size_t)p->g->csetsize; 1239 1240 for (i = 0; i < css; i++) 1241 if (CHIN(cs, i)) 1242 return((char)i); 1243 assert(never); 1244 return(0); /* arbitrary */ 1245 } 1246 1247 /* 1248 - nch - number of characters in a set 1249 == static int nch(register struct parse *p, register cset *cs); 1250 */ 1251 static int 1252 nch(register struct parse *p, register cset *cs) 1253 { 1254 register size_t i; 1255 register size_t css = (size_t)p->g->csetsize; 1256 register int n = 0; 1257 1258 for (i = 0; i < css; i++) 1259 if (CHIN(cs, i)) 1260 n++; 1261 return(n); 1262 } 1263 1264 /* 1265 - mcadd - add a collating element to a cset 1266 == static void mcadd(register struct parse *p, register cset *cs, \ 1267 == register char *cp); 1268 */ 1269 static void 1270 mcadd(register struct parse *p, register cset *cs, register const char *cp) 1271 { 1272 register size_t oldend = cs->smultis; 1273 1274 cs->smultis += strlen(cp) + 1; 1275 if (cs->multis == NULL) 1276 cs->multis = malloc(cs->smultis); 1277 else 1278 cs->multis = realloc(cs->multis, cs->smultis); 1279 if (cs->multis == NULL) { 1280 SETERROR(REG_ESPACE); 1281 return; 1282 } 1283 1284 (void) strcpy(cs->multis + oldend - 1, cp); 1285 cs->multis[cs->smultis - 1] = '\0'; 1286 } 1287 1288 #ifdef notdef 1289 /* 1290 - mcsub - subtract a collating element from a cset 1291 == static void mcsub(register cset *cs, register char *cp); 1292 */ 1293 static void 1294 mcsub(register cset *cs, register char *cp) 1295 { 1296 register char *fp = mcfind(cs, cp); 1297 register size_t len = strlen(fp); 1298 1299 assert(fp != NULL); 1300 (void) memmove(fp, fp + len + 1, 1301 cs->smultis - (fp + len + 1 - cs->multis)); 1302 cs->smultis -= len; 1303 1304 if (cs->smultis == 0) { 1305 free(cs->multis); 1306 cs->multis = NULL; 1307 return; 1308 } 1309 1310 cs->multis = realloc(cs->multis, cs->smultis); 1311 assert(cs->multis != NULL); 1312 } 1313 1314 /* 1315 - mcin - is a collating element in a cset? 1316 == static int mcin(register cset *cs, register char *cp); 1317 */ 1318 static int 1319 mcin(register cset *cs, register char *cp) 1320 { 1321 return(mcfind(cs, cp) != NULL); 1322 } 1323 1324 /* 1325 - mcfind - find a collating element in a cset 1326 == static char *mcfind(register cset *cs, register char *cp); 1327 */ 1328 static char * 1329 mcfind(register cset *cs, register char *cp) 1330 { 1331 register char *p; 1332 1333 if (cs->multis == NULL) 1334 return(NULL); 1335 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1336 if (strcmp(cp, p) == 0) 1337 return(p); 1338 return(NULL); 1339 } 1340 #endif 1341 1342 /* 1343 - mcinvert - invert the list of collating elements in a cset 1344 == static void mcinvert(register struct parse *p, register cset *cs); 1345 * 1346 * This would have to know the set of possibilities. Implementation 1347 * is deferred. 1348 */ 1349 static void 1350 mcinvert(register struct parse *p, register cset *cs) 1351 { 1352 assert(cs->multis == NULL); /* xxx */ 1353 } 1354 1355 /* 1356 - mccase - add case counterparts of the list of collating elements in a cset 1357 == static void mccase(register struct parse *p, register cset *cs); 1358 * 1359 * This would have to know the set of possibilities. Implementation 1360 * is deferred. 1361 */ 1362 static void 1363 mccase(register struct parse *p, register cset *cs) 1364 { 1365 assert(cs->multis == NULL); /* xxx */ 1366 } 1367 1368 #ifdef notdef 1369 /* 1370 - isinsets - is this character in any sets? 1371 == static int isinsets(register struct re_guts *g, int c); 1372 */ 1373 static int /* predicate */ 1374 isinsets(register struct re_guts *g, int c) 1375 { 1376 register uch *col; 1377 register int i; 1378 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1379 register unsigned uc = (unsigned char)c; 1380 1381 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1382 if (col[uc] != 0) 1383 return(1); 1384 return(0); 1385 } 1386 1387 /* 1388 - samesets - are these two characters in exactly the same sets? 1389 == static int samesets(register struct re_guts *g, int c1, int c2); 1390 */ 1391 static int /* predicate */ 1392 samesets(register struct re_guts *g, int c1, int c2) 1393 { 1394 register uch *col; 1395 register int i; 1396 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1397 register unsigned uc1 = (unsigned char)c1; 1398 register unsigned uc2 = (unsigned char)c2; 1399 1400 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1401 if (col[uc1] != col[uc2]) 1402 return(0); 1403 return(1); 1404 } 1405 #endif 1406 1407 /* 1408 - categorize - sort out character categories 1409 == static void categorize(struct parse *p, register struct re_guts *g); 1410 */ 1411 static void 1412 categorize(struct parse *p, register struct re_guts *g) 1413 { 1414 #ifdef notdef 1415 register cat_t *cats = g->categories; 1416 register int c; 1417 register int c2; 1418 register cat_t cat; 1419 1420 /* avoid making error situations worse */ 1421 if (p->error != 0) 1422 return; 1423 1424 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1425 if (cats[c] == 0 && isinsets(g, c)) { 1426 cat = g->ncategories++; 1427 cats[c] = cat; 1428 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1429 if (cats[c2] == 0 && samesets(g, c, c2)) 1430 cats[c2] = cat; 1431 } 1432 #endif 1433 } 1434 1435 /* 1436 - dupl - emit a duplicate of a bunch of sops 1437 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1438 */ 1439 static sopno /* start of duplicate */ 1440 dupl(register struct parse *p, sopno start, sopno finish) 1441 1442 /* from here */ 1443 /* to this less one */ 1444 { 1445 register sopno ret = HERE(); 1446 register sopno len = finish - start; 1447 1448 assert(finish >= start); 1449 if (len == 0) 1450 return(ret); 1451 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1452 return ret; 1453 assert(p->ssize >= p->slen + len); 1454 (void) memcpy((char *)(p->strip + p->slen), 1455 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1456 (void) memcpy((char *)(p->stripdata + p->slen), 1457 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T)); 1458 p->slen += len; 1459 return(ret); 1460 } 1461 1462 /* 1463 - doemit - emit a strip operator 1464 == static void doemit(register struct parse *p, sop op, size_t opnd); 1465 * 1466 * It might seem better to implement this as a macro with a function as 1467 * hard-case backup, but it's just too big and messy unless there are 1468 * some changes to the data structures. Maybe later. 1469 */ 1470 static void 1471 doemit(register struct parse *p, sop op, size_t opnd) 1472 { 1473 /* avoid making error situations worse */ 1474 if (p->error != 0) 1475 return; 1476 1477 /* deal with oversize operands ("can't happen", more or less) */ 1478 assert(opnd < 1); 1479 1480 /* deal with undersized strip */ 1481 if (p->slen >= p->ssize) 1482 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1483 return; 1484 1485 /* finally, it's all reduced to the easy case */ 1486 p->strip[p->slen] = op; 1487 p->stripdata[p->slen] = opnd; 1488 p->slen++; 1489 } 1490 1491 /* 1492 - doinsert - insert a sop into the strip 1493 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1494 */ 1495 static void 1496 doinsert(register struct parse *p, sop op, size_t opnd, sopno pos) 1497 { 1498 register sopno sn; 1499 register sop s; 1500 register RCHAR_T d; 1501 register int i; 1502 1503 /* avoid making error situations worse */ 1504 if (p->error != 0) 1505 return; 1506 1507 sn = HERE(); 1508 EMIT(op, opnd); /* do checks, ensure space */ 1509 assert(HERE() == sn+1); 1510 s = p->strip[sn]; 1511 d = p->stripdata[sn]; 1512 1513 /* adjust paren pointers */ 1514 assert(pos > 0); 1515 for (i = 1; i < NPAREN; i++) { 1516 if (p->pbegin[i] >= pos) { 1517 p->pbegin[i]++; 1518 } 1519 if (p->pend[i] >= pos) { 1520 p->pend[i]++; 1521 } 1522 } 1523 1524 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1525 (HERE()-pos-1)*sizeof(sop)); 1526 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos], 1527 (HERE()-pos-1)*sizeof(RCHAR_T)); 1528 p->strip[pos] = s; 1529 p->stripdata[pos] = d; 1530 } 1531 1532 /* 1533 - dofwd - complete a forward reference 1534 == static void dofwd(register struct parse *p, sopno pos, sop value); 1535 */ 1536 static void 1537 dofwd(register struct parse *p, register sopno pos, sop value) 1538 { 1539 /* avoid making error situations worse */ 1540 if (p->error != 0) 1541 return; 1542 1543 assert(value < 1); 1544 p->stripdata[pos] = value; 1545 } 1546 1547 /* 1548 - enlarge - enlarge the strip 1549 == static int enlarge(register struct parse *p, sopno size); 1550 */ 1551 static int 1552 enlarge(register struct parse *p, register sopno size) 1553 { 1554 register sop *sp; 1555 register RCHAR_T *dp; 1556 sopno osize; 1557 1558 if (p->ssize >= size) 1559 return 1; 1560 1561 osize = p->ssize; 1562 p->ssize = size; 1563 if (MEMSIZE(p) > MEMLIMIT) 1564 goto oomem; 1565 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1566 if (sp == NULL) 1567 goto oomem; 1568 p->strip = sp; 1569 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T)); 1570 if (dp == NULL) { 1571 oomem: 1572 p->ssize = osize; 1573 SETERROR(REG_ESPACE); 1574 return 0; 1575 } 1576 p->stripdata = dp; 1577 return 1; 1578 } 1579 1580 /* 1581 - stripsnug - compact the strip 1582 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1583 */ 1584 static void 1585 stripsnug(register struct parse *p, register struct re_guts *g) 1586 { 1587 g->nstates = p->slen; 1588 g->strip = (sop *)realloc((char *)p->strip, 1589 p->slen * sizeof(sop)); 1590 if (g->strip == NULL) { 1591 SETERROR(REG_ESPACE); 1592 g->strip = p->strip; 1593 } 1594 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata, 1595 p->slen * sizeof(RCHAR_T)); 1596 if (g->stripdata == NULL) { 1597 SETERROR(REG_ESPACE); 1598 g->stripdata = p->stripdata; 1599 } 1600 } 1601 1602 /* 1603 - findmust - fill in must and mlen with longest mandatory literal string 1604 == static void findmust(register struct parse *p, register struct re_guts *g); 1605 * 1606 * This algorithm could do fancy things like analyzing the operands of | 1607 * for common subsequences. Someday. This code is simple and finds most 1608 * of the interesting cases. 1609 * 1610 * Note that must and mlen got initialized during setup. 1611 */ 1612 static void 1613 findmust(struct parse *p, register struct re_guts *g) 1614 { 1615 register sop *scans; 1616 register RCHAR_T *scand; 1617 sop *starts = 0; 1618 RCHAR_T *startd = NULL; 1619 register sop *newstarts = 0; 1620 register RCHAR_T *newstartd = NULL; 1621 register sopno newlen; 1622 register sop s; 1623 register RCHAR_T d; 1624 register RCHAR_T *cp; 1625 register sopno i; 1626 1627 /* avoid making error situations worse */ 1628 if (p->error != 0) 1629 return; 1630 1631 /* find the longest OCHAR sequence in strip */ 1632 newlen = 0; 1633 scans = g->strip + 1; 1634 scand = g->stripdata + 1; 1635 do { 1636 s = *scans++; 1637 d = *scand++; 1638 switch (s) { 1639 case OCHAR: /* sequence member */ 1640 if (newlen == 0) { /* new sequence */ 1641 newstarts = scans - 1; 1642 newstartd = scand - 1; 1643 } 1644 newlen++; 1645 break; 1646 case OPLUS_: /* things that don't break one */ 1647 case OLPAREN: 1648 case ORPAREN: 1649 break; 1650 case OQUEST_: /* things that must be skipped */ 1651 case OCH_: 1652 scans--; 1653 scand--; 1654 do { 1655 scans += d; 1656 scand += d; 1657 s = *scans; 1658 d = *scand; 1659 /* assert() interferes w debug printouts */ 1660 if (s != O_QUEST && s != O_CH && s != OOR2) { 1661 g->iflags |= BAD; 1662 return; 1663 } 1664 } while (s != O_QUEST && s != O_CH); 1665 /* fallthrough */ 1666 default: /* things that break a sequence */ 1667 if (newlen > g->mlen) { /* ends one */ 1668 starts = newstarts; 1669 startd = newstartd; 1670 g->mlen = newlen; 1671 } 1672 newlen = 0; 1673 break; 1674 } 1675 } while (s != OEND); 1676 1677 if (g->mlen == 0) /* there isn't one */ 1678 return; 1679 1680 /* turn it into a character string */ 1681 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T)); 1682 if (g->must == NULL) { /* argh; just forget it */ 1683 g->mlen = 0; 1684 return; 1685 } 1686 cp = g->must; 1687 scans = starts; 1688 scand = startd; 1689 for (i = g->mlen; i > 0; i--) { 1690 for (;;) { 1691 s = *scans++; 1692 d = *scand++; 1693 if (s == OCHAR) 1694 break; 1695 } 1696 assert(cp < g->must + g->mlen); 1697 *cp++ = d; 1698 } 1699 assert(cp == g->must + g->mlen); 1700 *cp++ = '\0'; /* just on general principles */ 1701 } 1702 1703 /* 1704 - pluscount - count + nesting 1705 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1706 */ 1707 static sopno /* nesting depth */ 1708 pluscount(struct parse *p, register struct re_guts *g) 1709 { 1710 register sop *scan; 1711 register sop s; 1712 register sopno plusnest = 0; 1713 register sopno maxnest = 0; 1714 1715 if (p->error != 0) 1716 return(0); /* there may not be an OEND */ 1717 1718 scan = g->strip + 1; 1719 do { 1720 s = *scan++; 1721 switch (s) { 1722 case OPLUS_: 1723 plusnest++; 1724 break; 1725 case O_PLUS: 1726 if (plusnest > maxnest) 1727 maxnest = plusnest; 1728 plusnest--; 1729 break; 1730 } 1731 } while (s != OEND); 1732 if (plusnest != 0) 1733 g->iflags |= BAD; 1734 return(maxnest); 1735 } 1736