1 /* $NetBSD: regcomp.c,v 1.8 2021/05/17 04:01:57 rin Exp $ */ 2 /*- 3 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer of the University of Toronto. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)regcomp.c 8.4 (Berkeley) 3/19/94 39 */ 40 41 #include <sys/cdefs.h> 42 #if 0 43 #if defined(LIBC_SCCS) && !defined(lint) 44 static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94"; 45 #endif /* LIBC_SCCS and not lint */ 46 #else 47 __RCSID("$NetBSD: regcomp.c,v 1.8 2021/05/17 04:01:57 rin Exp $"); 48 #endif 49 50 #include <sys/types.h> 51 #include <stdio.h> 52 #include <string.h> 53 #include <ctype.h> 54 #include <limits.h> 55 #include <stdlib.h> 56 #include <regex.h> 57 58 #include "utils.h" 59 #include "regex2.h" 60 61 #include "cclass.h" 62 #include "cname.h" 63 64 /* 65 * parse structure, passed up and down to avoid global variables and 66 * other clumsinesses 67 */ 68 struct parse { 69 RCHAR_T *next; /* next character in RE */ 70 RCHAR_T *end; /* end of string (-> NUL normally) */ 71 int error; /* has an error been seen? */ 72 sop *strip; /* malloced strip */ 73 RCHAR_T *stripdata; /* malloced stripdata */ 74 sopno ssize; /* malloced strip size (allocated) */ 75 sopno slen; /* malloced strip length (used) */ 76 int ncsalloc; /* number of csets allocated */ 77 struct re_guts *g; 78 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 79 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 80 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 81 }; 82 83 /* ========= begin header generated by ./mkh ========= */ 84 #ifdef __cplusplus 85 extern "C" { 86 #endif 87 88 /* === regcomp.c === */ 89 static void p_ere __P((struct parse *p, int stop, size_t reclimit)); 90 static void p_ere_exp __P((struct parse *p, size_t reclimit)); 91 static void p_str __P((struct parse *p)); 92 static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit)); 93 static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit)); 94 static int p_count __P((struct parse *p)); 95 static void p_bracket __P((struct parse *p)); 96 static void p_b_term __P((struct parse *p, cset *cs)); 97 static void p_b_cclass __P((struct parse *p, cset *cs)); 98 static void p_b_eclass __P((struct parse *p, cset *cs)); 99 static char p_b_symbol __P((struct parse *p)); 100 static char p_b_coll_elem __P((struct parse *p, int endc)); 101 static RCHAR_T othercase __P((int ch)); 102 static void bothcases __P((struct parse *p, int ch)); 103 static void ordinary __P((struct parse *p, int ch)); 104 static void nonnewline __P((struct parse *p)); 105 static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit)); 106 static int seterr __P((struct parse *p, int e)); 107 static cset *allocset __P((struct parse *p)); 108 static void freeset __P((struct parse *p, cset *cs)); 109 static int freezeset __P((struct parse *p, cset *cs)); 110 static int firstch __P((struct parse *p, cset *cs)); 111 static int nch __P((struct parse *p, cset *cs)); 112 static void mcadd __P((struct parse *p, cset *cs, const char *cp)); 113 #ifdef notdef 114 static void mcsub __P((cset *cs, char *cp)); 115 static int mcin __P((cset *cs, char *cp)); 116 static char *mcfind __P((cset *cs, char *cp)); 117 #endif 118 static void mcinvert __P((struct parse *p, cset *cs)); 119 static void mccase __P((struct parse *p, cset *cs)); 120 #ifdef notdef 121 static int isinsets __P((struct re_guts *g, int c)); 122 static int samesets __P((struct re_guts *g, int c1, int c2)); 123 #endif 124 static void categorize __P((struct parse *p, struct re_guts *g)); 125 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 126 static void doemit __P((struct parse *p, sop op, size_t opnd)); 127 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 128 static void dofwd __P((struct parse *p, sopno pos, sop value)); 129 static int enlarge __P((struct parse *p, sopno size)); 130 static void stripsnug __P((struct parse *p, struct re_guts *g)); 131 static void findmust __P((struct parse *p, struct re_guts *g)); 132 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 133 134 #ifdef __cplusplus 135 } 136 #endif 137 /* ========= end header generated by ./mkh ========= */ 138 139 static RCHAR_T nuls[10]; /* place to point scanner in event of error */ 140 141 /* 142 * macros for use with parse structure 143 * BEWARE: these know that the parse structure is named `p' !!! 144 */ 145 #define PEEK() (*p->next) 146 #define PEEK2() (*(p->next+1)) 147 #define MORE() (p->next < p->end) 148 #define MORE2() (p->next+1 < p->end) 149 #define SEE(c) (MORE() && PEEK() == (c)) 150 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 151 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 152 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 153 #define NEXT() (p->next++) 154 #define NEXT2() (p->next += 2) 155 #define NEXTn(n) (p->next += (n)) 156 #define GETNEXT() (*p->next++) 157 #define SETERROR(e) seterr(p, (e)) 158 #define REQUIRE(co, e) ((co) || SETERROR(e)) 159 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 160 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 161 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 162 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 163 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 164 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 165 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 166 #define HERE() (p->slen) 167 #define THERE() (p->slen - 1) 168 #define THERETHERE() (p->slen - 2) 169 #define DROP(n) (p->slen -= (n)) 170 171 #ifndef NDEBUG 172 static int never = 0; /* for use in asserts; shuts lint up */ 173 #else 174 #define never 0 /* some <assert.h>s have bugs too */ 175 #endif 176 177 #define MEMLIMIT 0x8000000 178 #define MEMSIZE(p) \ 179 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 180 (p)->ncsalloc * sizeof(cset) + \ 181 (p)->ssize * sizeof(sop)) 182 #define RECLIMIT 256 183 184 /* 185 - regcomp - interface for parser and compilation 186 = extern int regcomp(regex_t *, const RCHAR_T *, int); 187 = #define REG_BASIC 0000 188 = #define REG_EXTENDED 0001 189 = #define REG_ICASE 0002 190 = #define REG_NOSUB 0004 191 = #define REG_NEWLINE 0010 192 = #define REG_NOSPEC 0020 193 = #define REG_PEND 0040 194 = #define REG_DUMP 0200 195 */ 196 int /* 0 success, otherwise REG_something */ 197 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags) 198 { 199 struct parse pa; 200 struct re_guts *g; 201 struct parse *p = &pa; 202 int i; 203 size_t len; 204 #ifdef REDEBUG 205 # define GOODFLAGS(f) (f) 206 #else 207 # define GOODFLAGS(f) ((f)&~REG_DUMP) 208 #endif 209 210 cflags = GOODFLAGS(cflags); 211 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 212 return(REG_INVARG); 213 214 if (cflags®_PEND) { 215 if (preg->re_endp < pattern) 216 return(REG_INVARG); 217 len = preg->re_endp - pattern; 218 } else 219 len = STRLEN(pattern); 220 221 /* do the mallocs early so failure handling is easy */ 222 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 223 (NC-1)*sizeof(cat_t)); 224 if (g == NULL) 225 return(REG_ESPACE); 226 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 227 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 228 if (p->strip == NULL) { 229 free((char *)g); 230 return(REG_ESPACE); 231 } 232 p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T)); 233 if (p->stripdata == NULL) { 234 free((char *)p->strip); 235 free((char *)g); 236 return(REG_ESPACE); 237 } 238 p->slen = 0; 239 240 /* set things up */ 241 p->g = g; 242 p->next = (RCHAR_T *)__UNCONST(pattern); /* convenience; we do not modify it */ 243 p->end = p->next + len; 244 p->error = 0; 245 p->ncsalloc = 0; 246 for (i = 0; i < NPAREN; i++) { 247 p->pbegin[i] = 0; 248 p->pend[i] = 0; 249 } 250 g->csetsize = NC; 251 g->sets = NULL; 252 g->setbits = NULL; 253 g->ncsets = 0; 254 g->cflags = cflags; 255 g->iflags = 0; 256 g->nbol = 0; 257 g->neol = 0; 258 g->must = NULL; 259 g->mlen = 0; 260 g->nsub = 0; 261 #if 0 262 g->ncategories = 1; /* category 0 is "everything else" */ 263 g->categories = &g->catspace[-(CHAR_MIN)]; 264 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 265 #endif 266 g->backrefs = 0; 267 268 /* do it */ 269 EMIT(OEND, 0); 270 g->firststate = THERE(); 271 if (cflags®_EXTENDED) 272 p_ere(p, OUT, 0); 273 else if (cflags®_NOSPEC) 274 p_str(p); 275 else 276 p_bre(p, OUT, OUT, 0); 277 EMIT(OEND, 0); 278 g->laststate = THERE(); 279 280 /* tidy up loose ends and fill things in */ 281 categorize(p, g); 282 stripsnug(p, g); 283 findmust(p, g); 284 g->nplus = pluscount(p, g); 285 g->magic = MAGIC2; 286 preg->re_nsub = g->nsub; 287 preg->re_g = g; 288 preg->re_magic = MAGIC1; 289 #ifndef REDEBUG 290 /* not debugging, so can't rely on the assert() in regexec() */ 291 if (g->iflags&BAD) 292 SETERROR(REG_ASSERT); 293 #endif 294 295 /* win or lose, we're done */ 296 if (p->error != 0) /* lose */ 297 regfree(preg); 298 return(p->error); 299 } 300 301 /* 302 - p_ere - ERE parser top level, concatenation and alternation 303 == static void p_ere(struct parse *p, int stop, size_t reclimit); 304 */ 305 static void 306 p_ere(struct parse *p, int stop, size_t reclimit) 307 308 /* character this ERE should end at */ 309 { 310 char c; 311 sopno prevback = 0; 312 sopno prevfwd = 0; 313 sopno conc; 314 int first = 1; /* is this the first alternative? */ 315 316 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 317 p->error = REG_ESPACE; 318 return; 319 } 320 321 for (;;) { 322 /* do a bunch of concatenated expressions */ 323 conc = HERE(); 324 while (MORE() && (c = PEEK()) != '|' && c != stop) 325 p_ere_exp(p, reclimit); 326 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 327 328 if (!EAT('|')) 329 break; /* NOTE BREAK OUT */ 330 331 if (first) { 332 INSERT(OCH_, conc); /* offset is wrong */ 333 prevfwd = conc; 334 prevback = conc; 335 first = 0; 336 } 337 ASTERN(OOR1, prevback); 338 prevback = THERE(); 339 AHEAD(prevfwd); /* fix previous offset */ 340 prevfwd = HERE(); 341 EMIT(OOR2, 0); /* offset is very wrong */ 342 } 343 344 if (!first) { /* tail-end fixups */ 345 AHEAD(prevfwd); 346 ASTERN(O_CH, prevback); 347 } 348 349 assert(!MORE() || SEE(stop)); 350 } 351 352 /* 353 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 354 == static void p_ere_exp(struct parse *p); 355 */ 356 static void 357 p_ere_exp(struct parse *p, size_t reclimit) 358 { 359 char c; 360 sopno pos; 361 int count; 362 int count2; 363 sopno subno; 364 int wascaret = 0; 365 366 assert(MORE()); /* caller should have ensured this */ 367 c = GETNEXT(); 368 369 pos = HERE(); 370 switch (c) { 371 case '(': 372 (void)REQUIRE(MORE(), REG_EPAREN); 373 p->g->nsub++; 374 subno = p->g->nsub; 375 if (subno < NPAREN) 376 p->pbegin[subno] = HERE(); 377 EMIT(OLPAREN, subno); 378 if (!SEE(')')) 379 p_ere(p, ')', reclimit); 380 if (subno < NPAREN) { 381 p->pend[subno] = HERE(); 382 assert(p->pend[subno] != 0); 383 } 384 EMIT(ORPAREN, subno); 385 (void)MUSTEAT(')', REG_EPAREN); 386 break; 387 #ifndef POSIX_MISTAKE 388 case ')': /* happens only if no current unmatched ( */ 389 /* 390 * You may ask, why the ifndef? Because I didn't notice 391 * this until slightly too late for 1003.2, and none of the 392 * other 1003.2 regular-expression reviewers noticed it at 393 * all. So an unmatched ) is legal POSIX, at least until 394 * we can get it fixed. 395 */ 396 SETERROR(REG_EPAREN); 397 break; 398 #endif 399 case '^': 400 EMIT(OBOL, 0); 401 p->g->iflags |= USEBOL; 402 p->g->nbol++; 403 wascaret = 1; 404 break; 405 case '$': 406 EMIT(OEOL, 0); 407 p->g->iflags |= USEEOL; 408 p->g->neol++; 409 break; 410 case '|': 411 SETERROR(REG_EMPTY); 412 break; 413 case '*': 414 case '+': 415 case '?': 416 SETERROR(REG_BADRPT); 417 break; 418 case '.': 419 if (p->g->cflags®_NEWLINE) 420 nonnewline(p); 421 else 422 EMIT(OANY, 0); 423 break; 424 case '[': 425 p_bracket(p); 426 break; 427 case '\\': 428 (void)REQUIRE(MORE(), REG_EESCAPE); 429 c = GETNEXT(); 430 ordinary(p, c); 431 break; 432 case '{': /* okay as ordinary except if digit follows */ 433 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT); 434 /* FALLTHROUGH */ 435 default: 436 ordinary(p, c); 437 break; 438 } 439 440 if (!MORE()) 441 return; 442 c = PEEK(); 443 /* we call { a repetition if followed by a digit */ 444 if (!( c == '*' || c == '+' || c == '?' || 445 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) )) 446 return; /* no repetition, we're done */ 447 NEXT(); 448 449 (void)REQUIRE(!wascaret, REG_BADRPT); 450 switch (c) { 451 case '*': /* implemented as +? */ 452 /* this case does not require the (y|) trick, noKLUDGE */ 453 INSERT(OPLUS_, pos); 454 ASTERN(O_PLUS, pos); 455 INSERT(OQUEST_, pos); 456 ASTERN(O_QUEST, pos); 457 break; 458 case '+': 459 INSERT(OPLUS_, pos); 460 ASTERN(O_PLUS, pos); 461 break; 462 case '?': 463 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 464 INSERT(OCH_, pos); /* offset slightly wrong */ 465 ASTERN(OOR1, pos); /* this one's right */ 466 AHEAD(pos); /* fix the OCH_ */ 467 EMIT(OOR2, 0); /* offset very wrong... */ 468 AHEAD(THERE()); /* ...so fix it */ 469 ASTERN(O_CH, THERETHERE()); 470 break; 471 case '{': 472 count = p_count(p); 473 if (EAT(',')) { 474 if (ISDIGIT((UCHAR_T)PEEK())) { 475 count2 = p_count(p); 476 (void)REQUIRE(count <= count2, REG_BADBR); 477 } else /* single number with comma */ 478 count2 = INFINITY; 479 } else /* just a single number */ 480 count2 = count; 481 repeat(p, pos, count, count2, 0); 482 if (!EAT('}')) { /* error heuristics */ 483 while (MORE() && PEEK() != '}') 484 NEXT(); 485 (void)REQUIRE(MORE(), REG_EBRACE); 486 SETERROR(REG_BADBR); 487 } 488 break; 489 } 490 491 if (!MORE()) 492 return; 493 c = PEEK(); 494 if (!( c == '*' || c == '+' || c == '?' || 495 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) ) 496 return; 497 SETERROR(REG_BADRPT); 498 } 499 500 /* 501 - p_str - string (no metacharacters) "parser" 502 == static void p_str(struct parse *p); 503 */ 504 static void 505 p_str(struct parse *p) 506 { 507 (void)REQUIRE(MORE(), REG_EMPTY); 508 while (MORE()) 509 ordinary(p, GETNEXT()); 510 } 511 512 /* 513 - p_bre - BRE parser top level, anchoring and concatenation 514 == static void p_bre(struct parse *p, int end1, \ 515 == int end2, size_t reclimit); 516 * Giving end1 as OUT essentially eliminates the end1/end2 check. 517 * 518 * This implementation is a bit of a kludge, in that a trailing $ is first 519 * taken as an ordinary character and then revised to be an anchor. The 520 * only undesirable side effect is that '$' gets included as a character 521 * category in such cases. This is fairly harmless; not worth fixing. 522 * The amount of lookahead needed to avoid this kludge is excessive. 523 */ 524 static void 525 p_bre(struct parse *p, int end1, int end2, size_t reclimit) 526 527 /* first terminating character */ 528 /* second terminating character */ 529 { 530 sopno start; 531 int first = 1; /* first subexpression? */ 532 int wasdollar = 0; 533 534 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 535 p->error = REG_ESPACE; 536 return; 537 } 538 539 start = HERE(); 540 541 if (EAT('^')) { 542 EMIT(OBOL, 0); 543 p->g->iflags |= USEBOL; 544 p->g->nbol++; 545 } 546 while (MORE() && !SEETWO(end1, end2)) { 547 wasdollar = p_simp_re(p, first, reclimit); 548 first = 0; 549 } 550 if (wasdollar) { /* oops, that was a trailing anchor */ 551 DROP(1); 552 EMIT(OEOL, 0); 553 p->g->iflags |= USEEOL; 554 p->g->neol++; 555 } 556 557 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 558 } 559 560 /* 561 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 562 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 563 */ 564 static int /* was the simple RE an unbackslashed $? */ 565 p_simp_re(struct parse *p, int starordinary, size_t reclimit) 566 567 /* is a leading * an ordinary character? */ 568 { 569 int c; 570 int count; 571 int count2; 572 sopno pos; 573 int i; 574 sopno subno; 575 int backsl; 576 577 pos = HERE(); /* repetion op, if any, covers from here */ 578 579 assert(MORE()); /* caller should have ensured this */ 580 c = GETNEXT(); 581 backsl = c == '\\'; 582 if (backsl) { 583 (void)REQUIRE(MORE(), REG_EESCAPE); 584 c = (unsigned char)GETNEXT(); 585 switch (c) { 586 case '{': 587 SETERROR(REG_BADRPT); 588 break; 589 case '(': 590 p->g->nsub++; 591 subno = p->g->nsub; 592 if (subno < NPAREN) 593 p->pbegin[subno] = HERE(); 594 EMIT(OLPAREN, subno); 595 /* the MORE here is an error heuristic */ 596 if (MORE() && !SEETWO('\\', ')')) 597 p_bre(p, '\\', ')', reclimit); 598 if (subno < NPAREN) { 599 p->pend[subno] = HERE(); 600 assert(p->pend[subno] != 0); 601 } 602 EMIT(ORPAREN, subno); 603 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 604 break; 605 case ')': /* should not get here -- must be user */ 606 case '}': 607 SETERROR(REG_EPAREN); 608 break; 609 case '1': 610 case '2': 611 case '3': 612 case '4': 613 case '5': 614 case '6': 615 case '7': 616 case '8': 617 case '9': 618 i = c - '0'; 619 assert(i < NPAREN); 620 if (p->pend[i] != 0) { 621 assert(i <= p->g->nsub); 622 EMIT(OBACK_, i); 623 assert(p->pbegin[i] != 0); 624 assert(p->strip[p->pbegin[i]] == OLPAREN); 625 assert(p->strip[p->pend[i]] == ORPAREN); 626 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 627 EMIT(O_BACK, i); 628 } else 629 SETERROR(REG_ESUBREG); 630 p->g->backrefs = 1; 631 break; 632 default: 633 ordinary(p, c); 634 break; 635 } 636 } else { 637 switch (c) { 638 case '.': 639 if (p->g->cflags®_NEWLINE) 640 nonnewline(p); 641 else 642 EMIT(OANY, 0); 643 break; 644 case '[': 645 p_bracket(p); 646 break; 647 case '*': 648 (void)REQUIRE(starordinary, REG_BADRPT); 649 /* FALLTHROUGH */ 650 default: 651 ordinary(p, c); 652 break; 653 } 654 } 655 656 if (EAT('*')) { /* implemented as +? */ 657 /* this case does not require the (y|) trick, noKLUDGE */ 658 INSERT(OPLUS_, pos); 659 ASTERN(O_PLUS, pos); 660 INSERT(OQUEST_, pos); 661 ASTERN(O_QUEST, pos); 662 } else if (EATTWO('\\', '{')) { 663 count = p_count(p); 664 if (EAT(',')) { 665 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) { 666 count2 = p_count(p); 667 (void)REQUIRE(count <= count2, REG_BADBR); 668 } else /* single number with comma */ 669 count2 = INFINITY; 670 } else /* just a single number */ 671 count2 = count; 672 repeat(p, pos, count, count2, reclimit); 673 if (!EATTWO('\\', '}')) { /* error heuristics */ 674 while (MORE() && !SEETWO('\\', '}')) 675 NEXT(); 676 (void)REQUIRE(MORE(), REG_EBRACE); 677 SETERROR(REG_BADBR); 678 } 679 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */ 680 return(1); 681 682 return(0); 683 } 684 685 /* 686 - p_count - parse a repetition count 687 == static int p_count(struct parse *p); 688 */ 689 static int /* the value */ 690 p_count(struct parse *p) 691 { 692 int count = 0; 693 int ndigits = 0; 694 695 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) { 696 count = count*10 + (GETNEXT() - '0'); 697 ndigits++; 698 } 699 700 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 701 return(count); 702 } 703 704 /* 705 - p_bracket - parse a bracketed character list 706 == static void p_bracket(struct parse *p); 707 * 708 * Note a significant property of this code: if the allocset() did SETERROR, 709 * no set operations are done. 710 */ 711 static void 712 p_bracket(struct parse *p) 713 { 714 cset *cs; 715 int invert = 0; 716 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' }; 717 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' }; 718 719 cs = allocset(p); 720 if (cs == NULL) 721 return; 722 723 /* Dept of Truly Sickening Special-Case Kludges */ 724 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) { 725 EMIT(OBOW, 0); 726 NEXTn(6); 727 return; 728 } 729 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) { 730 EMIT(OEOW, 0); 731 NEXTn(6); 732 return; 733 } 734 735 if (EAT('^')) 736 invert++; /* make note to invert set at end */ 737 if (EAT(']')) 738 CHadd(cs, ']'); 739 else if (EAT('-')) 740 CHadd(cs, '-'); 741 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 742 p_b_term(p, cs); 743 if (EAT('-')) 744 CHadd(cs, '-'); 745 (void)MUSTEAT(']', REG_EBRACK); 746 747 if (p->error != 0) /* don't mess things up further */ 748 return; 749 750 if (p->g->cflags®_ICASE) { 751 int i; 752 int ci; 753 754 for (i = p->g->csetsize - 1; i >= 0; i--) 755 if (CHIN(cs, i) && ISALPHA2(i)) { 756 ci = othercase(i); 757 if (ci != i) 758 CHadd(cs, ci); 759 } 760 if (cs->multis != NULL) 761 mccase(p, cs); 762 } 763 if (invert) { 764 int i; 765 766 for (i = p->g->csetsize - 1; i >= 0; i--) 767 if (CHIN(cs, i)) 768 CHsub(cs, i); 769 else 770 CHadd(cs, i); 771 if (p->g->cflags®_NEWLINE) 772 CHsub(cs, '\n'); 773 if (cs->multis != NULL) 774 mcinvert(p, cs); 775 } 776 777 assert(cs->multis == NULL); /* xxx */ 778 779 if (nch(p, cs) == 1) { /* optimize singleton sets */ 780 ordinary(p, firstch(p, cs)); 781 freeset(p, cs); 782 } else 783 EMIT(OANYOF, freezeset(p, cs)); 784 } 785 786 /* 787 - p_b_term - parse one term of a bracketed character list 788 == static void p_b_term(struct parse *p, cset *cs); 789 */ 790 static void 791 p_b_term(struct parse *p, cset *cs) 792 { 793 char c; 794 char start, finish; 795 int i; 796 797 /* classify what we've got */ 798 switch ((MORE()) ? PEEK() : '\0') { 799 case '[': 800 c = (MORE2()) ? PEEK2() : '\0'; 801 break; 802 case '-': 803 SETERROR(REG_ERANGE); 804 return; /* NOTE RETURN */ 805 break; 806 default: 807 c = '\0'; 808 break; 809 } 810 811 switch (c) { 812 case ':': /* character class */ 813 NEXT2(); 814 (void)REQUIRE(MORE(), REG_EBRACK); 815 c = PEEK(); 816 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 817 p_b_cclass(p, cs); 818 (void)REQUIRE(MORE(), REG_EBRACK); 819 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 820 break; 821 case '=': /* equivalence class */ 822 NEXT2(); 823 (void)REQUIRE(MORE(), REG_EBRACK); 824 c = PEEK(); 825 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 826 p_b_eclass(p, cs); 827 (void)REQUIRE(MORE(), REG_EBRACK); 828 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 829 break; 830 default: /* symbol, ordinary character, or range */ 831 /* xxx revision needed for multichar stuff */ 832 start = p_b_symbol(p); 833 if (SEE('-') && MORE2() && PEEK2() != ']') { 834 /* range */ 835 NEXT(); 836 if (EAT('-')) 837 finish = '-'; 838 else 839 finish = p_b_symbol(p); 840 } else 841 finish = start; 842 /* xxx what about signed chars here... */ 843 (void)REQUIRE(start <= finish, REG_ERANGE); 844 for (i = start; i <= finish; i++) 845 CHadd(cs, i); 846 break; 847 } 848 } 849 850 /* 851 - p_b_cclass - parse a character-class name and deal with it 852 == static void p_b_cclass(struct parse *p, cset *cs); 853 */ 854 static void 855 p_b_cclass(struct parse *p, cset *cs) 856 { 857 RCHAR_T *sp = p->next; 858 struct cclass *cp; 859 size_t len; 860 const char *u; 861 char c; 862 863 while (MORE() && ISALPHA2(PEEK())) 864 NEXT(); 865 len = p->next - sp; 866 for (cp = cclasses; cp->name != NULL; cp++) 867 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len)) 868 break; 869 if (cp->name == NULL) { 870 /* oops, didn't find it */ 871 SETERROR(REG_ECTYPE); 872 return; 873 } 874 875 u = cp->chars; 876 while ((c = *u++) != '\0') 877 CHadd(cs, c); 878 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 879 MCadd(p, cs, u); 880 } 881 882 /* 883 - p_b_eclass - parse an equivalence-class name and deal with it 884 == static void p_b_eclass(struct parse *p, cset *cs); 885 * 886 * This implementation is incomplete. xxx 887 */ 888 static void 889 p_b_eclass(struct parse *p, cset *cs) 890 { 891 char c; 892 893 c = p_b_coll_elem(p, '='); 894 CHadd(cs, c); 895 } 896 897 /* 898 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 899 == static char p_b_symbol(struct parse *p); 900 */ 901 static char /* value of symbol */ 902 p_b_symbol(struct parse *p) 903 { 904 char value; 905 906 (void)REQUIRE(MORE(), REG_EBRACK); 907 if (!EATTWO('[', '.')) 908 return(GETNEXT()); 909 910 /* collating symbol */ 911 value = p_b_coll_elem(p, '.'); 912 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 913 return(value); 914 } 915 916 /* 917 - p_b_coll_elem - parse a collating-element name and look it up 918 == static char p_b_coll_elem(struct parse *p, int endc); 919 */ 920 static char /* value of collating element */ 921 p_b_coll_elem(struct parse *p, int endc) 922 923 /* name ended by endc,']' */ 924 { 925 RCHAR_T *sp = p->next; 926 struct cname *cp; 927 size_t len; 928 929 while (MORE() && !SEETWO(endc, ']')) 930 NEXT(); 931 if (!MORE()) { 932 SETERROR(REG_EBRACK); 933 return(0); 934 } 935 len = p->next - sp; 936 for (cp = cnames; cp->name != NULL; cp++) 937 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len)) 938 return(cp->code); /* known name */ 939 if (len == 1) 940 return(*sp); /* single character */ 941 SETERROR(REG_ECOLLATE); /* neither */ 942 return(0); 943 } 944 945 /* 946 - othercase - return the case counterpart of an alphabetic 947 == static char othercase(int ch); 948 */ 949 static RCHAR_T /* if no counterpart, return ch */ 950 othercase(int ch) 951 { 952 assert(ISALPHA2(ch)); 953 if (ISUPPER(ch)) 954 return(TOLOWER(ch)); 955 else if (ISLOWER(ch)) 956 return(TOUPPER(ch)); 957 else /* peculiar, but could happen */ 958 return(ch); 959 } 960 961 /* 962 - bothcases - emit a dualcase version of a two-case character 963 == static void bothcases(struct parse *p, int ch); 964 * 965 * Boy, is this implementation ever a kludge... 966 */ 967 static void 968 bothcases(struct parse *p, int ch) 969 { 970 RCHAR_T *oldnext = p->next; 971 RCHAR_T *oldend = p->end; 972 RCHAR_T bracket[3]; 973 974 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 975 p->next = bracket; 976 p->end = bracket+2; 977 bracket[0] = ch; 978 bracket[1] = ']'; 979 bracket[2] = '\0'; 980 p_bracket(p); 981 assert(p->next == bracket+2); 982 p->next = oldnext; 983 p->end = oldend; 984 } 985 986 /* 987 - ordinary - emit an ordinary character 988 == static void ordinary(struct parse *p, int ch); 989 */ 990 static void 991 ordinary(struct parse *p, int ch) 992 { 993 /* 994 cat_t *cap = p->g->categories; 995 */ 996 997 if ((p->g->cflags®_ICASE) && ISALPHA2(ch) && othercase(ch) != ch) 998 bothcases(p, ch); 999 else { 1000 EMIT(OCHAR, (UCHAR_T)ch); 1001 /* 1002 if (cap[ch] == 0) 1003 cap[ch] = p->g->ncategories++; 1004 */ 1005 } 1006 } 1007 1008 /* 1009 - nonnewline - emit REG_NEWLINE version of OANY 1010 == static void nonnewline(struct parse *p); 1011 * 1012 * Boy, is this implementation ever a kludge... 1013 */ 1014 static void 1015 nonnewline(struct parse *p) 1016 { 1017 RCHAR_T *oldnext = p->next; 1018 RCHAR_T *oldend = p->end; 1019 RCHAR_T bracket[4]; 1020 1021 p->next = bracket; 1022 p->end = bracket+3; 1023 bracket[0] = '^'; 1024 bracket[1] = '\n'; 1025 bracket[2] = ']'; 1026 bracket[3] = '\0'; 1027 p_bracket(p); 1028 assert(p->next == bracket+3); 1029 p->next = oldnext; 1030 p->end = oldend; 1031 } 1032 1033 /* 1034 - repeat - generate code for a bounded repetition, recursively if needed 1035 == static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 1036 */ 1037 static void 1038 repeat(struct parse *p, sopno start, int from, int to, size_t reclimit) 1039 1040 /* operand from here to end of strip */ 1041 /* repeated from this number */ 1042 /* to this number of times (maybe INFINITY) */ 1043 { 1044 sopno finish; 1045 # define N 2 1046 # define INF 3 1047 # define REP(f, t) ((f)*8 + (t)) 1048 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1049 sopno copy; 1050 1051 if (reclimit++ > RECLIMIT) 1052 p->error = REG_ESPACE; 1053 if (p->error) 1054 return; 1055 1056 finish = HERE(); 1057 1058 assert(from <= to); 1059 1060 switch (REP(MAP(from), MAP(to))) { 1061 case REP(0, 0): /* must be user doing this */ 1062 DROP(finish-start); /* drop the operand */ 1063 break; 1064 case REP(0, 1): /* as x{1,1}? */ 1065 case REP(0, N): /* as x{1,n}? */ 1066 case REP(0, INF): /* as x{1,}? */ 1067 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1068 INSERT(OCH_, start); /* offset is wrong... */ 1069 repeat(p, start+1, 1, to, reclimit); 1070 ASTERN(OOR1, start); 1071 AHEAD(start); /* ... fix it */ 1072 EMIT(OOR2, 0); 1073 AHEAD(THERE()); 1074 ASTERN(O_CH, THERETHERE()); 1075 break; 1076 case REP(1, 1): /* trivial case */ 1077 /* done */ 1078 break; 1079 case REP(1, N): /* as x?x{1,n-1} */ 1080 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1081 INSERT(OCH_, start); 1082 ASTERN(OOR1, start); 1083 AHEAD(start); 1084 EMIT(OOR2, 0); /* offset very wrong... */ 1085 AHEAD(THERE()); /* ...so fix it */ 1086 ASTERN(O_CH, THERETHERE()); 1087 copy = dupl(p, start+1, finish+1); 1088 assert(copy == finish+4); 1089 repeat(p, copy, 1, to-1, reclimit); 1090 break; 1091 case REP(1, INF): /* as x+ */ 1092 INSERT(OPLUS_, start); 1093 ASTERN(O_PLUS, start); 1094 break; 1095 case REP(N, N): /* as xx{m-1,n-1} */ 1096 copy = dupl(p, start, finish); 1097 repeat(p, copy, from-1, to-1, reclimit); 1098 break; 1099 case REP(N, INF): /* as xx{n-1,INF} */ 1100 copy = dupl(p, start, finish); 1101 repeat(p, copy, from-1, to, reclimit); 1102 break; 1103 default: /* "can't happen" */ 1104 SETERROR(REG_ASSERT); /* just in case */ 1105 break; 1106 } 1107 } 1108 1109 /* 1110 - seterr - set an error condition 1111 == static int seterr(struct parse *p, int e); 1112 */ 1113 static int /* useless but makes type checking happy */ 1114 seterr(struct parse *p, int e) 1115 { 1116 if (p->error == 0) /* keep earliest error condition */ 1117 p->error = e; 1118 p->next = nuls; /* try to bring things to a halt */ 1119 p->end = nuls; 1120 return(0); /* make the return value well-defined */ 1121 } 1122 1123 /* 1124 - allocset - allocate a set of characters for [] 1125 == static cset *allocset(struct parse *p); 1126 */ 1127 static cset * 1128 allocset(struct parse *p) 1129 { 1130 int no = p->g->ncsets++; 1131 size_t nc; 1132 size_t nbytes; 1133 cset *cs; 1134 size_t css = (size_t)p->g->csetsize; 1135 int i; 1136 1137 if (no >= p->ncsalloc) { /* need another column of space */ 1138 p->ncsalloc += CHAR_BIT; 1139 nc = p->ncsalloc; 1140 assert(nc % CHAR_BIT == 0); 1141 nbytes = nc / CHAR_BIT * css; 1142 if (MEMSIZE(p) > MEMLIMIT) 1143 goto oomem; 1144 if (p->g->sets == NULL) 1145 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1146 else 1147 p->g->sets = (cset *)realloc((char *)p->g->sets, 1148 nc * sizeof(cset)); 1149 if (p->g->setbits == NULL) 1150 p->g->setbits = (uch *)malloc(nbytes); 1151 else { 1152 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1153 nbytes); 1154 /* xxx this isn't right if setbits is now NULL */ 1155 for (i = 0; i < no; i++) 1156 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1157 } 1158 if (p->g->sets != NULL && p->g->setbits != NULL) 1159 (void) memset((char *)p->g->setbits + (nbytes - css), 1160 0, css); 1161 else { 1162 oomem: 1163 no = 0; 1164 SETERROR(REG_ESPACE); 1165 /* caller's responsibility not to do set ops */ 1166 return NULL; 1167 } 1168 } 1169 1170 cs = &p->g->sets[no]; 1171 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1172 cs->mask = 1 << ((no) % CHAR_BIT); 1173 cs->hash = 0; 1174 cs->smultis = 0; 1175 cs->multis = NULL; 1176 1177 return(cs); 1178 } 1179 1180 /* 1181 - freeset - free a now-unused set 1182 == static void freeset(struct parse *p, cset *cs); 1183 */ 1184 static void 1185 freeset(struct parse *p, cset *cs) 1186 { 1187 size_t i; 1188 cset *top = &p->g->sets[p->g->ncsets]; 1189 size_t css = (size_t)p->g->csetsize; 1190 1191 for (i = 0; i < css; i++) 1192 CHsub(cs, i); 1193 if (cs == top-1) /* recover only the easy case */ 1194 p->g->ncsets--; 1195 } 1196 1197 /* 1198 - freezeset - final processing on a set of characters 1199 == static int freezeset(struct parse *p, cset *cs); 1200 * 1201 * The main task here is merging identical sets. This is usually a waste 1202 * of time (although the hash code minimizes the overhead), but can win 1203 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1204 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1205 * the same value! 1206 */ 1207 static int /* set number */ 1208 freezeset(struct parse *p, cset *cs) 1209 { 1210 uch h = cs->hash; 1211 size_t i; 1212 cset *top = &p->g->sets[p->g->ncsets]; 1213 cset *cs2; 1214 size_t css = (size_t)p->g->csetsize; 1215 1216 /* look for an earlier one which is the same */ 1217 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1218 if (cs2->hash == h && cs2 != cs) { 1219 /* maybe */ 1220 for (i = 0; i < css; i++) 1221 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1222 break; /* no */ 1223 if (i == css) 1224 break; /* yes */ 1225 } 1226 1227 if (cs2 < top) { /* found one */ 1228 freeset(p, cs); 1229 cs = cs2; 1230 } 1231 1232 return((int)(cs - p->g->sets)); 1233 } 1234 1235 /* 1236 - firstch - return first character in a set (which must have at least one) 1237 == static int firstch(struct parse *p, cset *cs); 1238 */ 1239 static int /* character; there is no "none" value */ 1240 firstch(struct parse *p, cset *cs) 1241 { 1242 size_t i; 1243 size_t css = (size_t)p->g->csetsize; 1244 1245 for (i = 0; i < css; i++) 1246 if (CHIN(cs, i)) 1247 return((char)i); 1248 assert(never); 1249 return(0); /* arbitrary */ 1250 } 1251 1252 /* 1253 - nch - number of characters in a set 1254 == static int nch(struct parse *p, cset *cs); 1255 */ 1256 static int 1257 nch(struct parse *p, cset *cs) 1258 { 1259 size_t i; 1260 size_t css = (size_t)p->g->csetsize; 1261 int n = 0; 1262 1263 for (i = 0; i < css; i++) 1264 if (CHIN(cs, i)) 1265 n++; 1266 return(n); 1267 } 1268 1269 /* 1270 - mcadd - add a collating element to a cset 1271 == static void mcadd(struct parse *p, cset *cs, \ 1272 == char *cp); 1273 */ 1274 static void 1275 mcadd(struct parse *p, cset *cs, const char *cp) 1276 { 1277 size_t oldend = cs->smultis; 1278 1279 cs->smultis += strlen(cp) + 1; 1280 if (cs->multis == NULL) 1281 cs->multis = malloc(cs->smultis); 1282 else 1283 cs->multis = realloc(cs->multis, cs->smultis); 1284 if (cs->multis == NULL) { 1285 SETERROR(REG_ESPACE); 1286 return; 1287 } 1288 1289 (void) strcpy(cs->multis + oldend - 1, cp); 1290 cs->multis[cs->smultis - 1] = '\0'; 1291 } 1292 1293 #ifdef notdef 1294 /* 1295 - mcsub - subtract a collating element from a cset 1296 == static void mcsub(cset *cs, char *cp); 1297 */ 1298 static void 1299 mcsub(cset *cs, char *cp) 1300 { 1301 char *fp = mcfind(cs, cp); 1302 size_t len = strlen(fp); 1303 1304 assert(fp != NULL); 1305 (void) memmove(fp, fp + len + 1, 1306 cs->smultis - (fp + len + 1 - cs->multis)); 1307 cs->smultis -= len; 1308 1309 if (cs->smultis == 0) { 1310 free(cs->multis); 1311 cs->multis = NULL; 1312 return; 1313 } 1314 1315 cs->multis = realloc(cs->multis, cs->smultis); 1316 assert(cs->multis != NULL); 1317 } 1318 1319 /* 1320 - mcin - is a collating element in a cset? 1321 == static int mcin(cset *cs, char *cp); 1322 */ 1323 static int 1324 mcin(cset *cs, char *cp) 1325 { 1326 return(mcfind(cs, cp) != NULL); 1327 } 1328 1329 /* 1330 - mcfind - find a collating element in a cset 1331 == static char *mcfind(cset *cs, char *cp); 1332 */ 1333 static char * 1334 mcfind(cset *cs, char *cp) 1335 { 1336 char *p; 1337 1338 if (cs->multis == NULL) 1339 return(NULL); 1340 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1341 if (strcmp(cp, p) == 0) 1342 return(p); 1343 return(NULL); 1344 } 1345 #endif 1346 1347 /* 1348 - mcinvert - invert the list of collating elements in a cset 1349 == static void mcinvert(struct parse *p, cset *cs); 1350 * 1351 * This would have to know the set of possibilities. Implementation 1352 * is deferred. 1353 */ 1354 static void 1355 mcinvert(struct parse *p, cset *cs) 1356 { 1357 assert(cs->multis == NULL); /* xxx */ 1358 } 1359 1360 /* 1361 - mccase - add case counterparts of the list of collating elements in a cset 1362 == static void mccase(struct parse *p, cset *cs); 1363 * 1364 * This would have to know the set of possibilities. Implementation 1365 * is deferred. 1366 */ 1367 static void 1368 mccase(struct parse *p, cset *cs) 1369 { 1370 assert(cs->multis == NULL); /* xxx */ 1371 } 1372 1373 #ifdef notdef 1374 /* 1375 - isinsets - is this character in any sets? 1376 == static int isinsets(struct re_guts *g, int c); 1377 */ 1378 static int /* predicate */ 1379 isinsets(struct re_guts *g, int c) 1380 { 1381 uch *col; 1382 int i; 1383 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1384 unsigned uc = (unsigned char)c; 1385 1386 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1387 if (col[uc] != 0) 1388 return(1); 1389 return(0); 1390 } 1391 1392 /* 1393 - samesets - are these two characters in exactly the same sets? 1394 == static int samesets(struct re_guts *g, int c1, int c2); 1395 */ 1396 static int /* predicate */ 1397 samesets(struct re_guts *g, int c1, int c2) 1398 { 1399 uch *col; 1400 int i; 1401 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1402 unsigned uc1 = (unsigned char)c1; 1403 unsigned uc2 = (unsigned char)c2; 1404 1405 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1406 if (col[uc1] != col[uc2]) 1407 return(0); 1408 return(1); 1409 } 1410 #endif 1411 1412 /* 1413 - categorize - sort out character categories 1414 == static void categorize(struct parse *p, struct re_guts *g); 1415 */ 1416 static void 1417 categorize(struct parse *p, struct re_guts *g) 1418 { 1419 #ifdef notdef 1420 cat_t *cats = g->categories; 1421 int c; 1422 int c2; 1423 cat_t cat; 1424 1425 /* avoid making error situations worse */ 1426 if (p->error != 0) 1427 return; 1428 1429 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1430 if (cats[c] == 0 && isinsets(g, c)) { 1431 cat = g->ncategories++; 1432 cats[c] = cat; 1433 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1434 if (cats[c2] == 0 && samesets(g, c, c2)) 1435 cats[c2] = cat; 1436 } 1437 #endif 1438 } 1439 1440 /* 1441 - dupl - emit a duplicate of a bunch of sops 1442 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1443 */ 1444 static sopno /* start of duplicate */ 1445 dupl(struct parse *p, sopno start, sopno finish) 1446 1447 /* from here */ 1448 /* to this less one */ 1449 { 1450 sopno ret = HERE(); 1451 sopno len = finish - start; 1452 1453 assert(finish >= start); 1454 if (len == 0) 1455 return(ret); 1456 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1457 return ret; 1458 assert(p->ssize >= p->slen + len); 1459 (void) memcpy((char *)(p->strip + p->slen), 1460 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1461 (void) memcpy((char *)(p->stripdata + p->slen), 1462 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T)); 1463 p->slen += len; 1464 return(ret); 1465 } 1466 1467 /* 1468 - doemit - emit a strip operator 1469 == static void doemit(struct parse *p, sop op, size_t opnd); 1470 * 1471 * It might seem better to implement this as a macro with a function as 1472 * hard-case backup, but it's just too big and messy unless there are 1473 * some changes to the data structures. Maybe later. 1474 */ 1475 static void 1476 doemit(struct parse *p, sop op, size_t opnd) 1477 { 1478 /* avoid making error situations worse */ 1479 if (p->error != 0) 1480 return; 1481 1482 /* deal with oversize operands ("can't happen", more or less) */ 1483 assert(opnd < 1); 1484 1485 /* deal with undersized strip */ 1486 if (p->slen >= p->ssize) 1487 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1488 return; 1489 1490 /* finally, it's all reduced to the easy case */ 1491 p->strip[p->slen] = op; 1492 p->stripdata[p->slen] = opnd; 1493 p->slen++; 1494 } 1495 1496 /* 1497 - doinsert - insert a sop into the strip 1498 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1499 */ 1500 static void 1501 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1502 { 1503 sopno sn; 1504 sop s; 1505 RCHAR_T d; 1506 int i; 1507 1508 /* avoid making error situations worse */ 1509 if (p->error != 0) 1510 return; 1511 1512 sn = HERE(); 1513 EMIT(op, opnd); /* do checks, ensure space */ 1514 assert(HERE() == sn+1); 1515 s = p->strip[sn]; 1516 d = p->stripdata[sn]; 1517 1518 /* adjust paren pointers */ 1519 assert(pos > 0); 1520 for (i = 1; i < NPAREN; i++) { 1521 if (p->pbegin[i] >= pos) { 1522 p->pbegin[i]++; 1523 } 1524 if (p->pend[i] >= pos) { 1525 p->pend[i]++; 1526 } 1527 } 1528 1529 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1530 (HERE()-pos-1)*sizeof(sop)); 1531 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos], 1532 (HERE()-pos-1)*sizeof(RCHAR_T)); 1533 p->strip[pos] = s; 1534 p->stripdata[pos] = d; 1535 } 1536 1537 /* 1538 - dofwd - complete a forward reference 1539 == static void dofwd(struct parse *p, sopno pos, sop value); 1540 */ 1541 static void 1542 dofwd(struct parse *p, sopno pos, sop value) 1543 { 1544 /* avoid making error situations worse */ 1545 if (p->error != 0) 1546 return; 1547 1548 assert(value < 1); 1549 p->stripdata[pos] = value; 1550 } 1551 1552 /* 1553 - enlarge - enlarge the strip 1554 == static int enlarge(struct parse *p, sopno size); 1555 */ 1556 static int 1557 enlarge(struct parse *p, sopno size) 1558 { 1559 sop *sp; 1560 RCHAR_T *dp; 1561 sopno osize; 1562 1563 if (p->ssize >= size) 1564 return 1; 1565 1566 osize = p->ssize; 1567 p->ssize = size; 1568 if (MEMSIZE(p) > MEMLIMIT) 1569 goto oomem; 1570 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1571 if (sp == NULL) 1572 goto oomem; 1573 p->strip = sp; 1574 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T)); 1575 if (dp == NULL) { 1576 oomem: 1577 p->ssize = osize; 1578 SETERROR(REG_ESPACE); 1579 return 0; 1580 } 1581 p->stripdata = dp; 1582 return 1; 1583 } 1584 1585 /* 1586 - stripsnug - compact the strip 1587 == static void stripsnug(struct parse *p, struct re_guts *g); 1588 */ 1589 static void 1590 stripsnug(struct parse *p, struct re_guts *g) 1591 { 1592 g->nstates = p->slen; 1593 g->strip = (sop *)realloc((char *)p->strip, 1594 p->slen * sizeof(sop)); 1595 if (g->strip == NULL) { 1596 SETERROR(REG_ESPACE); 1597 g->strip = p->strip; 1598 } 1599 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata, 1600 p->slen * sizeof(RCHAR_T)); 1601 if (g->stripdata == NULL) { 1602 SETERROR(REG_ESPACE); 1603 g->stripdata = p->stripdata; 1604 } 1605 } 1606 1607 /* 1608 - findmust - fill in must and mlen with longest mandatory literal string 1609 == static void findmust(struct parse *p, struct re_guts *g); 1610 * 1611 * This algorithm could do fancy things like analyzing the operands of | 1612 * for common subsequences. Someday. This code is simple and finds most 1613 * of the interesting cases. 1614 * 1615 * Note that must and mlen got initialized during setup. 1616 */ 1617 static void 1618 findmust(struct parse *p, struct re_guts *g) 1619 { 1620 sop *scans; 1621 RCHAR_T *scand; 1622 sop *starts = 0; 1623 RCHAR_T *startd = NULL; 1624 sop *newstarts = 0; 1625 RCHAR_T *newstartd = NULL; 1626 sopno newlen; 1627 sop s; 1628 RCHAR_T d; 1629 RCHAR_T *cp; 1630 sopno i; 1631 1632 /* avoid making error situations worse */ 1633 if (p->error != 0) 1634 return; 1635 1636 /* find the longest OCHAR sequence in strip */ 1637 newlen = 0; 1638 scans = g->strip + 1; 1639 scand = g->stripdata + 1; 1640 do { 1641 s = *scans++; 1642 d = *scand++; 1643 switch (s) { 1644 case OCHAR: /* sequence member */ 1645 if (newlen == 0) { /* new sequence */ 1646 newstarts = scans - 1; 1647 newstartd = scand - 1; 1648 } 1649 newlen++; 1650 break; 1651 case OPLUS_: /* things that don't break one */ 1652 case OLPAREN: 1653 case ORPAREN: 1654 break; 1655 case OQUEST_: /* things that must be skipped */ 1656 case OCH_: 1657 scans--; 1658 scand--; 1659 do { 1660 scans += d; 1661 scand += d; 1662 s = *scans; 1663 d = *scand; 1664 /* assert() interferes w debug printouts */ 1665 if (s != O_QUEST && s != O_CH && s != OOR2) { 1666 g->iflags |= BAD; 1667 return; 1668 } 1669 } while (s != O_QUEST && s != O_CH); 1670 /* fallthrough */ 1671 default: /* things that break a sequence */ 1672 if (newlen > g->mlen) { /* ends one */ 1673 starts = newstarts; 1674 startd = newstartd; 1675 g->mlen = newlen; 1676 } 1677 newlen = 0; 1678 break; 1679 } 1680 } while (s != OEND); 1681 1682 if (g->mlen == 0) /* there isn't one */ 1683 return; 1684 1685 /* turn it into a character string */ 1686 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T)); 1687 if (g->must == NULL) { /* argh; just forget it */ 1688 g->mlen = 0; 1689 return; 1690 } 1691 cp = g->must; 1692 scans = starts; 1693 scand = startd; 1694 for (i = g->mlen; i > 0; i--) { 1695 for (;;) { 1696 s = *scans++; 1697 d = *scand++; 1698 if (s == OCHAR) 1699 break; 1700 } 1701 assert(cp < g->must + g->mlen); 1702 *cp++ = d; 1703 } 1704 assert(cp == g->must + g->mlen); 1705 *cp++ = '\0'; /* just on general principles */ 1706 } 1707 1708 /* 1709 - pluscount - count + nesting 1710 == static sopno pluscount(struct parse *p, struct re_guts *g); 1711 */ 1712 static sopno /* nesting depth */ 1713 pluscount(struct parse *p, struct re_guts *g) 1714 { 1715 sop *scan; 1716 sop s; 1717 sopno plusnest = 0; 1718 sopno maxnest = 0; 1719 1720 if (p->error != 0) 1721 return(0); /* there may not be an OEND */ 1722 1723 scan = g->strip + 1; 1724 do { 1725 s = *scan++; 1726 switch (s) { 1727 case OPLUS_: 1728 plusnest++; 1729 break; 1730 case O_PLUS: 1731 if (plusnest > maxnest) 1732 maxnest = plusnest; 1733 plusnest--; 1734 break; 1735 } 1736 } while (s != OEND); 1737 if (plusnest != 0) 1738 g->iflags |= BAD; 1739 return(maxnest); 1740 } 1741