1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * Henry Spencer. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 41 */ 42 43 #if defined(LIBC_SCCS) && !defined(lint) 44 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 45 #endif /* LIBC_SCCS and not lint */ 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include <sys/types.h> 50 #include <stdio.h> 51 #include <string.h> 52 #include <ctype.h> 53 #include <limits.h> 54 #include <stdlib.h> 55 #include <regex.h> 56 #include <stdbool.h> 57 #include <wchar.h> 58 #include <wctype.h> 59 60 #ifndef LIBREGEX 61 #include "collate.h" 62 #endif 63 64 #include "utils.h" 65 #include "regex2.h" 66 67 #include "cname.h" 68 69 /* 70 * Branching context, used to keep track of branch state for all of the branch- 71 * aware functions. In addition to keeping track of branch positions for the 72 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 73 * detection of whether ^ is acting as an anchor or being used erroneously and 74 * also for whether we're in a sub-expression or not. 75 */ 76 struct branchc { 77 sopno start; 78 sopno back; 79 sopno fwd; 80 81 int nbranch; 82 int nchain; 83 bool outer; 84 bool terminate; 85 }; 86 87 /* 88 * parse structure, passed up and down to avoid global variables and 89 * other clumsinesses 90 */ 91 struct parse { 92 const char *next; /* next character in RE */ 93 const char *end; /* end of string (-> NUL normally) */ 94 int error; /* has an error been seen? */ 95 sop *strip; /* malloced strip */ 96 sopno ssize; /* malloced strip size (allocated) */ 97 sopno slen; /* malloced strip length (used) */ 98 int ncsalloc; /* number of csets allocated */ 99 struct re_guts *g; 100 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 101 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 102 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 103 bool allowbranch; /* can this expression branch? */ 104 bool bre; /* convenience; is this a BRE? */ 105 int pflags; /* other parsing flags -- legacy escapes? */ 106 bool (*parse_expr)(struct parse *, struct branchc *); 107 void (*pre_parse)(struct parse *, struct branchc *); 108 void (*post_parse)(struct parse *, struct branchc *); 109 }; 110 111 #define PFLAG_LEGACY_ESC 0x00000001 112 113 /* ========= begin header generated by ./mkh ========= */ 114 #ifdef __cplusplus 115 extern "C" { 116 #endif 117 118 /* === regcomp.c === */ 119 static bool p_ere_exp(struct parse *p, struct branchc *bc); 120 static void p_str(struct parse *p); 121 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 122 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 123 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 124 static bool p_branch_empty(struct parse *p, struct branchc *bc); 125 static bool p_branch_do(struct parse *p, struct branchc *bc); 126 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 127 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 128 static void p_re(struct parse *p, int end1, int end2); 129 static bool p_simp_re(struct parse *p, struct branchc *bc); 130 static int p_count(struct parse *p); 131 static void p_bracket(struct parse *p); 132 static int p_range_cmp(wchar_t c1, wchar_t c2); 133 static void p_b_term(struct parse *p, cset *cs); 134 static void p_b_cclass(struct parse *p, cset *cs); 135 static void p_b_eclass(struct parse *p, cset *cs); 136 static wint_t p_b_symbol(struct parse *p); 137 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 138 static bool may_escape(struct parse *p, const wint_t ch); 139 static wint_t othercase(wint_t ch); 140 static void bothcases(struct parse *p, wint_t ch); 141 static void ordinary(struct parse *p, wint_t ch); 142 static void nonnewline(struct parse *p); 143 static void repeat(struct parse *p, sopno start, int from, int to); 144 static int seterr(struct parse *p, int e); 145 static cset *allocset(struct parse *p); 146 static void freeset(struct parse *p, cset *cs); 147 static void CHadd(struct parse *p, cset *cs, wint_t ch); 148 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 149 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 150 static wint_t singleton(cset *cs); 151 static sopno dupl(struct parse *p, sopno start, sopno finish); 152 static void doemit(struct parse *p, sop op, size_t opnd); 153 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 154 static void dofwd(struct parse *p, sopno pos, sop value); 155 static int enlarge(struct parse *p, sopno size); 156 static void stripsnug(struct parse *p, struct re_guts *g); 157 static void findmust(struct parse *p, struct re_guts *g); 158 static int altoffset(sop *scan, int offset); 159 static void computejumps(struct parse *p, struct re_guts *g); 160 static void computematchjumps(struct parse *p, struct re_guts *g); 161 static sopno pluscount(struct parse *p, struct re_guts *g); 162 static wint_t wgetnext(struct parse *p); 163 164 #ifdef __cplusplus 165 } 166 #endif 167 /* ========= end header generated by ./mkh ========= */ 168 169 static char nuls[10]; /* place to point scanner in event of error */ 170 171 /* 172 * macros for use with parse structure 173 * BEWARE: these know that the parse structure is named `p' !!! 174 */ 175 #define PEEK() (*p->next) 176 #define PEEK2() (*(p->next+1)) 177 #define MORE() (p->next < p->end) 178 #define MORE2() (p->next+1 < p->end) 179 #define SEE(c) (MORE() && PEEK() == (c)) 180 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 181 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 182 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 183 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 184 #define NEXT() (p->next++) 185 #define NEXT2() (p->next += 2) 186 #define NEXTn(n) (p->next += (n)) 187 #define GETNEXT() (*p->next++) 188 #define WGETNEXT() wgetnext(p) 189 #define SETERROR(e) seterr(p, (e)) 190 #define REQUIRE(co, e) ((co) || SETERROR(e)) 191 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 192 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 193 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 194 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 195 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 196 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 197 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 198 #define HERE() (p->slen) 199 #define THERE() (p->slen - 1) 200 #define THERETHERE() (p->slen - 2) 201 #define DROP(n) (p->slen -= (n)) 202 203 /* Macro used by computejump()/computematchjump() */ 204 #define MIN(a,b) ((a)<(b)?(a):(b)) 205 206 static int /* 0 success, otherwise REG_something */ 207 regcomp_internal(regex_t * __restrict preg, 208 const char * __restrict pattern, 209 int cflags, int pflags) 210 { 211 struct parse pa; 212 struct re_guts *g; 213 struct parse *p = &pa; 214 int i; 215 size_t len; 216 size_t maxlen; 217 #ifdef REDEBUG 218 # define GOODFLAGS(f) (f) 219 #else 220 # define GOODFLAGS(f) ((f)&~REG_DUMP) 221 #endif 222 223 cflags = GOODFLAGS(cflags); 224 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 225 return(REG_INVARG); 226 227 if (cflags®_PEND) { 228 if (preg->re_endp < pattern) 229 return(REG_INVARG); 230 len = preg->re_endp - pattern; 231 } else 232 len = strlen(pattern); 233 234 /* do the mallocs early so failure handling is easy */ 235 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 236 if (g == NULL) 237 return(REG_ESPACE); 238 /* 239 * Limit the pattern space to avoid a 32-bit overflow on buffer 240 * extension. Also avoid any signed overflow in case of conversion 241 * so make the real limit based on a 31-bit overflow. 242 * 243 * Likely not applicable on 64-bit systems but handle the case 244 * generically (who are we to stop people from using ~715MB+ 245 * patterns?). 246 */ 247 maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 248 if (len >= maxlen) { 249 free((char *)g); 250 return(REG_ESPACE); 251 } 252 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 253 assert(p->ssize >= len); 254 255 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 256 p->slen = 0; 257 if (p->strip == NULL) { 258 free((char *)g); 259 return(REG_ESPACE); 260 } 261 262 /* set things up */ 263 p->g = g; 264 p->next = pattern; /* convenience; we do not modify it */ 265 p->end = p->next + len; 266 p->error = 0; 267 p->ncsalloc = 0; 268 p->pflags = pflags; 269 for (i = 0; i < NPAREN; i++) { 270 p->pbegin[i] = 0; 271 p->pend[i] = 0; 272 } 273 if (cflags & REG_EXTENDED) { 274 p->allowbranch = true; 275 p->bre = false; 276 p->parse_expr = p_ere_exp; 277 p->pre_parse = NULL; 278 p->post_parse = NULL; 279 } else { 280 p->allowbranch = false; 281 p->bre = true; 282 p->parse_expr = p_simp_re; 283 p->pre_parse = p_bre_pre_parse; 284 p->post_parse = p_bre_post_parse; 285 } 286 g->sets = NULL; 287 g->ncsets = 0; 288 g->cflags = cflags; 289 g->iflags = 0; 290 g->nbol = 0; 291 g->neol = 0; 292 g->must = NULL; 293 g->moffset = -1; 294 g->charjump = NULL; 295 g->matchjump = NULL; 296 g->mlen = 0; 297 g->nsub = 0; 298 g->backrefs = 0; 299 300 /* do it */ 301 EMIT(OEND, 0); 302 g->firststate = THERE(); 303 if (cflags & REG_NOSPEC) 304 p_str(p); 305 else 306 p_re(p, OUT, OUT); 307 EMIT(OEND, 0); 308 g->laststate = THERE(); 309 310 /* tidy up loose ends and fill things in */ 311 stripsnug(p, g); 312 findmust(p, g); 313 /* only use Boyer-Moore algorithm if the pattern is bigger 314 * than three characters 315 */ 316 if(g->mlen > 3) { 317 computejumps(p, g); 318 computematchjumps(p, g); 319 if(g->matchjump == NULL && g->charjump != NULL) { 320 free(g->charjump); 321 g->charjump = NULL; 322 } 323 } 324 g->nplus = pluscount(p, g); 325 g->magic = MAGIC2; 326 preg->re_nsub = g->nsub; 327 preg->re_g = g; 328 preg->re_magic = MAGIC1; 329 #ifndef REDEBUG 330 /* not debugging, so can't rely on the assert() in regexec() */ 331 if (g->iflags&BAD) 332 SETERROR(REG_ASSERT); 333 #endif 334 335 /* win or lose, we're done */ 336 if (p->error != 0) /* lose */ 337 regfree(preg); 338 return(p->error); 339 } 340 341 /* 342 - regcomp - interface for parser and compilation 343 = extern int regcomp(regex_t *, const char *, int); 344 = #define REG_BASIC 0000 345 = #define REG_EXTENDED 0001 346 = #define REG_ICASE 0002 347 = #define REG_NOSUB 0004 348 = #define REG_NEWLINE 0010 349 = #define REG_NOSPEC 0020 350 = #define REG_PEND 0040 351 = #define REG_DUMP 0200 352 */ 353 int /* 0 success, otherwise REG_something */ 354 regcomp(regex_t * __restrict preg, 355 const char * __restrict pattern, 356 int cflags) 357 { 358 359 return (regcomp_internal(preg, pattern, cflags, 0)); 360 } 361 362 #ifndef LIBREGEX 363 /* 364 * Legacy interface that requires more lax escaping behavior. 365 */ 366 int 367 freebsd12_regcomp(regex_t * __restrict preg, 368 const char * __restrict pattern, 369 int cflags, int pflags) 370 { 371 372 return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC)); 373 } 374 375 __sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0); 376 #endif /* !LIBREGEX */ 377 378 /* 379 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 380 - return whether we should terminate or not 381 == static bool p_ere_exp(struct parse *p); 382 */ 383 static bool 384 p_ere_exp(struct parse *p, struct branchc *bc) 385 { 386 char c; 387 wint_t wc; 388 sopno pos; 389 int count; 390 int count2; 391 sopno subno; 392 int wascaret = 0; 393 394 (void)bc; 395 assert(MORE()); /* caller should have ensured this */ 396 c = GETNEXT(); 397 398 pos = HERE(); 399 switch (c) { 400 case '(': 401 (void)REQUIRE(MORE(), REG_EPAREN); 402 p->g->nsub++; 403 subno = p->g->nsub; 404 if (subno < NPAREN) 405 p->pbegin[subno] = HERE(); 406 EMIT(OLPAREN, subno); 407 if (!SEE(')')) 408 p_re(p, ')', IGN); 409 if (subno < NPAREN) { 410 p->pend[subno] = HERE(); 411 assert(p->pend[subno] != 0); 412 } 413 EMIT(ORPAREN, subno); 414 (void)MUSTEAT(')', REG_EPAREN); 415 break; 416 #ifndef POSIX_MISTAKE 417 case ')': /* happens only if no current unmatched ( */ 418 /* 419 * You may ask, why the ifndef? Because I didn't notice 420 * this until slightly too late for 1003.2, and none of the 421 * other 1003.2 regular-expression reviewers noticed it at 422 * all. So an unmatched ) is legal POSIX, at least until 423 * we can get it fixed. 424 */ 425 SETERROR(REG_EPAREN); 426 break; 427 #endif 428 case '^': 429 EMIT(OBOL, 0); 430 p->g->iflags |= USEBOL; 431 p->g->nbol++; 432 wascaret = 1; 433 break; 434 case '$': 435 EMIT(OEOL, 0); 436 p->g->iflags |= USEEOL; 437 p->g->neol++; 438 break; 439 case '|': 440 SETERROR(REG_EMPTY); 441 break; 442 case '*': 443 case '+': 444 case '?': 445 case '{': 446 SETERROR(REG_BADRPT); 447 break; 448 case '.': 449 if (p->g->cflags®_NEWLINE) 450 nonnewline(p); 451 else 452 EMIT(OANY, 0); 453 break; 454 case '[': 455 p_bracket(p); 456 break; 457 case '\\': 458 (void)REQUIRE(MORE(), REG_EESCAPE); 459 wc = WGETNEXT(); 460 switch (wc) { 461 case '<': 462 EMIT(OBOW, 0); 463 break; 464 case '>': 465 EMIT(OEOW, 0); 466 break; 467 default: 468 if (may_escape(p, wc)) 469 ordinary(p, wc); 470 else 471 SETERROR(REG_EESCAPE); 472 break; 473 } 474 break; 475 default: 476 if (p->error != 0) 477 return (false); 478 p->next--; 479 wc = WGETNEXT(); 480 ordinary(p, wc); 481 break; 482 } 483 484 if (!MORE()) 485 return (false); 486 c = PEEK(); 487 /* we call { a repetition if followed by a digit */ 488 if (!( c == '*' || c == '+' || c == '?' || c == '{')) 489 return (false); /* no repetition, we're done */ 490 else if (c == '{') 491 (void)REQUIRE(MORE2() && \ 492 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 493 NEXT(); 494 495 (void)REQUIRE(!wascaret, REG_BADRPT); 496 switch (c) { 497 case '*': /* implemented as +? */ 498 /* this case does not require the (y|) trick, noKLUDGE */ 499 INSERT(OPLUS_, pos); 500 ASTERN(O_PLUS, pos); 501 INSERT(OQUEST_, pos); 502 ASTERN(O_QUEST, pos); 503 break; 504 case '+': 505 INSERT(OPLUS_, pos); 506 ASTERN(O_PLUS, pos); 507 break; 508 case '?': 509 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 510 INSERT(OCH_, pos); /* offset slightly wrong */ 511 ASTERN(OOR1, pos); /* this one's right */ 512 AHEAD(pos); /* fix the OCH_ */ 513 EMIT(OOR2, 0); /* offset very wrong... */ 514 AHEAD(THERE()); /* ...so fix it */ 515 ASTERN(O_CH, THERETHERE()); 516 break; 517 case '{': 518 count = p_count(p); 519 if (EAT(',')) { 520 if (isdigit((uch)PEEK())) { 521 count2 = p_count(p); 522 (void)REQUIRE(count <= count2, REG_BADBR); 523 } else /* single number with comma */ 524 count2 = INFINITY; 525 } else /* just a single number */ 526 count2 = count; 527 repeat(p, pos, count, count2); 528 if (!EAT('}')) { /* error heuristics */ 529 while (MORE() && PEEK() != '}') 530 NEXT(); 531 (void)REQUIRE(MORE(), REG_EBRACE); 532 SETERROR(REG_BADBR); 533 } 534 break; 535 } 536 537 if (!MORE()) 538 return (false); 539 c = PEEK(); 540 if (!( c == '*' || c == '+' || c == '?' || 541 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 542 return (false); 543 SETERROR(REG_BADRPT); 544 return (false); 545 } 546 547 /* 548 - p_str - string (no metacharacters) "parser" 549 == static void p_str(struct parse *p); 550 */ 551 static void 552 p_str(struct parse *p) 553 { 554 (void)REQUIRE(MORE(), REG_EMPTY); 555 while (MORE()) 556 ordinary(p, WGETNEXT()); 557 } 558 559 /* 560 * Eat consecutive branch delimiters for the kind of expression that we are 561 * parsing, return the number of delimiters that we ate. 562 */ 563 static int 564 p_branch_eat_delim(struct parse *p, struct branchc *bc) 565 { 566 int nskip; 567 568 (void)bc; 569 nskip = 0; 570 while (EAT('|')) 571 ++nskip; 572 return (nskip); 573 } 574 575 /* 576 * Insert necessary branch book-keeping operations. This emits a 577 * bogus 'next' offset, since we still have more to parse 578 */ 579 static void 580 p_branch_ins_offset(struct parse *p, struct branchc *bc) 581 { 582 583 if (bc->nbranch == 0) { 584 INSERT(OCH_, bc->start); /* offset is wrong */ 585 bc->fwd = bc->start; 586 bc->back = bc->start; 587 } 588 589 ASTERN(OOR1, bc->back); 590 bc->back = THERE(); 591 AHEAD(bc->fwd); /* fix previous offset */ 592 bc->fwd = HERE(); 593 EMIT(OOR2, 0); /* offset is very wrong */ 594 ++bc->nbranch; 595 } 596 597 /* 598 * Fix the offset of the tail branch, if we actually had any branches. 599 * This is to correct the bogus placeholder offset that we use. 600 */ 601 static void 602 p_branch_fix_tail(struct parse *p, struct branchc *bc) 603 { 604 605 /* Fix bogus offset at the tail if we actually have branches */ 606 if (bc->nbranch > 0) { 607 AHEAD(bc->fwd); 608 ASTERN(O_CH, bc->back); 609 } 610 } 611 612 /* 613 * Signal to the parser that an empty branch has been encountered; this will, 614 * in the future, be used to allow for more permissive behavior with empty 615 * branches. The return value should indicate whether parsing may continue 616 * or not. 617 */ 618 static bool 619 p_branch_empty(struct parse *p, struct branchc *bc) 620 { 621 622 (void)bc; 623 SETERROR(REG_EMPTY); 624 return (false); 625 } 626 627 /* 628 * Take care of any branching requirements. This includes inserting the 629 * appropriate branching instructions as well as eating all of the branch 630 * delimiters until we either run out of pattern or need to parse more pattern. 631 */ 632 static bool 633 p_branch_do(struct parse *p, struct branchc *bc) 634 { 635 int ate = 0; 636 637 ate = p_branch_eat_delim(p, bc); 638 if (ate == 0) 639 return (false); 640 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 641 /* 642 * Halt parsing only if we have an empty branch and p_branch_empty 643 * indicates that we must not continue. In the future, this will not 644 * necessarily be an error. 645 */ 646 return (false); 647 p_branch_ins_offset(p, bc); 648 649 return (true); 650 } 651 652 static void 653 p_bre_pre_parse(struct parse *p, struct branchc *bc) 654 { 655 656 (void) bc; 657 /* 658 * Does not move cleanly into expression parser because of 659 * ordinary interpration of * at the beginning position of 660 * an expression. 661 */ 662 if (EAT('^')) { 663 EMIT(OBOL, 0); 664 p->g->iflags |= USEBOL; 665 p->g->nbol++; 666 } 667 } 668 669 static void 670 p_bre_post_parse(struct parse *p, struct branchc *bc) 671 { 672 673 /* Expression is terminating due to EOL token */ 674 if (bc->terminate) { 675 DROP(1); 676 EMIT(OEOL, 0); 677 p->g->iflags |= USEEOL; 678 p->g->neol++; 679 } 680 } 681 682 /* 683 - p_re - Top level parser, concatenation and BRE anchoring 684 == static void p_re(struct parse *p, int end1, int end2); 685 * Giving end1 as OUT essentially eliminates the end1/end2 check. 686 * 687 * This implementation is a bit of a kludge, in that a trailing $ is first 688 * taken as an ordinary character and then revised to be an anchor. 689 * The amount of lookahead needed to avoid this kludge is excessive. 690 */ 691 static void 692 p_re(struct parse *p, 693 int end1, /* first terminating character */ 694 int end2) /* second terminating character; ignored for EREs */ 695 { 696 struct branchc bc; 697 698 bc.nbranch = 0; 699 if (end1 == OUT && end2 == OUT) 700 bc.outer = true; 701 else 702 bc.outer = false; 703 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 704 for (;;) { 705 bc.start = HERE(); 706 bc.nchain = 0; 707 bc.terminate = false; 708 if (p->pre_parse != NULL) 709 p->pre_parse(p, &bc); 710 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 711 bc.terminate = p->parse_expr(p, &bc); 712 ++bc.nchain; 713 } 714 if (p->post_parse != NULL) 715 p->post_parse(p, &bc); 716 (void) REQUIRE(HERE() != bc.start, REG_EMPTY); 717 if (!p->allowbranch) 718 break; 719 /* 720 * p_branch_do's return value indicates whether we should 721 * continue parsing or not. This is both for correctness and 722 * a slight optimization, because it will check if we've 723 * encountered an empty branch or the end of the string 724 * immediately following a branch delimiter. 725 */ 726 if (!p_branch_do(p, &bc)) 727 break; 728 } 729 #undef SEE_END 730 if (p->allowbranch) 731 p_branch_fix_tail(p, &bc); 732 assert(!MORE() || SEE(end1)); 733 } 734 735 /* 736 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 737 == static bool p_simp_re(struct parse *p, struct branchc *bc); 738 */ 739 static bool /* was the simple RE an unbackslashed $? */ 740 p_simp_re(struct parse *p, struct branchc *bc) 741 { 742 int c; 743 int count; 744 int count2; 745 sopno pos; 746 int i; 747 wint_t wc; 748 sopno subno; 749 # define BACKSL (1<<CHAR_BIT) 750 751 pos = HERE(); /* repetition op, if any, covers from here */ 752 753 assert(MORE()); /* caller should have ensured this */ 754 c = GETNEXT(); 755 if (c == '\\') { 756 (void)REQUIRE(MORE(), REG_EESCAPE); 757 c = BACKSL | GETNEXT(); 758 } 759 switch (c) { 760 case '.': 761 if (p->g->cflags®_NEWLINE) 762 nonnewline(p); 763 else 764 EMIT(OANY, 0); 765 break; 766 case '[': 767 p_bracket(p); 768 break; 769 case BACKSL|'<': 770 EMIT(OBOW, 0); 771 break; 772 case BACKSL|'>': 773 EMIT(OEOW, 0); 774 break; 775 case BACKSL|'{': 776 SETERROR(REG_BADRPT); 777 break; 778 case BACKSL|'(': 779 p->g->nsub++; 780 subno = p->g->nsub; 781 if (subno < NPAREN) 782 p->pbegin[subno] = HERE(); 783 EMIT(OLPAREN, subno); 784 /* the MORE here is an error heuristic */ 785 if (MORE() && !SEETWO('\\', ')')) 786 p_re(p, '\\', ')'); 787 if (subno < NPAREN) { 788 p->pend[subno] = HERE(); 789 assert(p->pend[subno] != 0); 790 } 791 EMIT(ORPAREN, subno); 792 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 793 break; 794 case BACKSL|')': /* should not get here -- must be user */ 795 SETERROR(REG_EPAREN); 796 break; 797 case BACKSL|'1': 798 case BACKSL|'2': 799 case BACKSL|'3': 800 case BACKSL|'4': 801 case BACKSL|'5': 802 case BACKSL|'6': 803 case BACKSL|'7': 804 case BACKSL|'8': 805 case BACKSL|'9': 806 i = (c&~BACKSL) - '0'; 807 assert(i < NPAREN); 808 if (p->pend[i] != 0) { 809 assert(i <= p->g->nsub); 810 EMIT(OBACK_, i); 811 assert(p->pbegin[i] != 0); 812 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 813 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 814 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 815 EMIT(O_BACK, i); 816 } else 817 SETERROR(REG_ESUBREG); 818 p->g->backrefs = 1; 819 break; 820 case '*': 821 /* 822 * Ordinary if used as the first character beyond BOL anchor of 823 * a (sub-)expression, counts as a bad repetition operator if it 824 * appears otherwise. 825 */ 826 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 827 /* FALLTHROUGH */ 828 default: 829 if (p->error != 0) 830 return (false); /* Definitely not $... */ 831 p->next--; 832 wc = WGETNEXT(); 833 if ((c & BACKSL) == 0 || may_escape(p, wc)) 834 ordinary(p, wc); 835 else 836 SETERROR(REG_EESCAPE); 837 break; 838 } 839 840 if (EAT('*')) { /* implemented as +? */ 841 /* this case does not require the (y|) trick, noKLUDGE */ 842 INSERT(OPLUS_, pos); 843 ASTERN(O_PLUS, pos); 844 INSERT(OQUEST_, pos); 845 ASTERN(O_QUEST, pos); 846 } else if (EATTWO('\\', '{')) { 847 count = p_count(p); 848 if (EAT(',')) { 849 if (MORE() && isdigit((uch)PEEK())) { 850 count2 = p_count(p); 851 (void)REQUIRE(count <= count2, REG_BADBR); 852 } else /* single number with comma */ 853 count2 = INFINITY; 854 } else /* just a single number */ 855 count2 = count; 856 repeat(p, pos, count, count2); 857 if (!EATTWO('\\', '}')) { /* error heuristics */ 858 while (MORE() && !SEETWO('\\', '}')) 859 NEXT(); 860 (void)REQUIRE(MORE(), REG_EBRACE); 861 SETERROR(REG_BADBR); 862 } 863 } else if (c == '$') /* $ (but not \$) ends it */ 864 return (true); 865 866 return (false); 867 } 868 869 /* 870 - p_count - parse a repetition count 871 == static int p_count(struct parse *p); 872 */ 873 static int /* the value */ 874 p_count(struct parse *p) 875 { 876 int count = 0; 877 int ndigits = 0; 878 879 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 880 count = count*10 + (GETNEXT() - '0'); 881 ndigits++; 882 } 883 884 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 885 return(count); 886 } 887 888 /* 889 - p_bracket - parse a bracketed character list 890 == static void p_bracket(struct parse *p); 891 */ 892 static void 893 p_bracket(struct parse *p) 894 { 895 cset *cs; 896 wint_t ch; 897 898 /* Dept of Truly Sickening Special-Case Kludges */ 899 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 900 EMIT(OBOW, 0); 901 NEXTn(6); 902 return; 903 } 904 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 905 EMIT(OEOW, 0); 906 NEXTn(6); 907 return; 908 } 909 910 if ((cs = allocset(p)) == NULL) 911 return; 912 913 if (p->g->cflags®_ICASE) 914 cs->icase = 1; 915 if (EAT('^')) 916 cs->invert = 1; 917 if (EAT(']')) 918 CHadd(p, cs, ']'); 919 else if (EAT('-')) 920 CHadd(p, cs, '-'); 921 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 922 p_b_term(p, cs); 923 if (EAT('-')) 924 CHadd(p, cs, '-'); 925 (void)MUSTEAT(']', REG_EBRACK); 926 927 if (p->error != 0) /* don't mess things up further */ 928 return; 929 930 if (cs->invert && p->g->cflags®_NEWLINE) 931 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 932 933 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 934 ordinary(p, ch); 935 freeset(p, cs); 936 } else 937 EMIT(OANYOF, (int)(cs - p->g->sets)); 938 } 939 940 static int 941 p_range_cmp(wchar_t c1, wchar_t c2) 942 { 943 #ifndef LIBREGEX 944 return __wcollate_range_cmp(c1, c2); 945 #else 946 /* Copied from libc/collate __wcollate_range_cmp */ 947 wchar_t s1[2], s2[2]; 948 949 s1[0] = c1; 950 s1[1] = L'\0'; 951 s2[0] = c2; 952 s2[1] = L'\0'; 953 return (wcscoll(s1, s2)); 954 #endif 955 } 956 957 /* 958 - p_b_term - parse one term of a bracketed character list 959 == static void p_b_term(struct parse *p, cset *cs); 960 */ 961 static void 962 p_b_term(struct parse *p, cset *cs) 963 { 964 char c; 965 wint_t start, finish; 966 wint_t i; 967 #ifndef LIBREGEX 968 struct xlocale_collate *table = 969 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 970 #endif 971 /* classify what we've got */ 972 switch ((MORE()) ? PEEK() : '\0') { 973 case '[': 974 c = (MORE2()) ? PEEK2() : '\0'; 975 break; 976 case '-': 977 SETERROR(REG_ERANGE); 978 return; /* NOTE RETURN */ 979 default: 980 c = '\0'; 981 break; 982 } 983 984 switch (c) { 985 case ':': /* character class */ 986 NEXT2(); 987 (void)REQUIRE(MORE(), REG_EBRACK); 988 c = PEEK(); 989 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 990 p_b_cclass(p, cs); 991 (void)REQUIRE(MORE(), REG_EBRACK); 992 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 993 break; 994 case '=': /* equivalence class */ 995 NEXT2(); 996 (void)REQUIRE(MORE(), REG_EBRACK); 997 c = PEEK(); 998 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 999 p_b_eclass(p, cs); 1000 (void)REQUIRE(MORE(), REG_EBRACK); 1001 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1002 break; 1003 default: /* symbol, ordinary character, or range */ 1004 start = p_b_symbol(p); 1005 if (SEE('-') && MORE2() && PEEK2() != ']') { 1006 /* range */ 1007 NEXT(); 1008 if (EAT('-')) 1009 finish = '-'; 1010 else 1011 finish = p_b_symbol(p); 1012 } else 1013 finish = start; 1014 if (start == finish) 1015 CHadd(p, cs, start); 1016 else { 1017 #ifndef LIBREGEX 1018 if (table->__collate_load_error || MB_CUR_MAX > 1) { 1019 #else 1020 if (MB_CUR_MAX > 1) { 1021 #endif 1022 (void)REQUIRE(start <= finish, REG_ERANGE); 1023 CHaddrange(p, cs, start, finish); 1024 } else { 1025 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1026 for (i = 0; i <= UCHAR_MAX; i++) { 1027 if (p_range_cmp(start, i) <= 0 && 1028 p_range_cmp(i, finish) <= 0 ) 1029 CHadd(p, cs, i); 1030 } 1031 } 1032 } 1033 break; 1034 } 1035 } 1036 1037 /* 1038 - p_b_cclass - parse a character-class name and deal with it 1039 == static void p_b_cclass(struct parse *p, cset *cs); 1040 */ 1041 static void 1042 p_b_cclass(struct parse *p, cset *cs) 1043 { 1044 const char *sp = p->next; 1045 size_t len; 1046 wctype_t wct; 1047 char clname[16]; 1048 1049 while (MORE() && isalpha((uch)PEEK())) 1050 NEXT(); 1051 len = p->next - sp; 1052 if (len >= sizeof(clname) - 1) { 1053 SETERROR(REG_ECTYPE); 1054 return; 1055 } 1056 memcpy(clname, sp, len); 1057 clname[len] = '\0'; 1058 if ((wct = wctype(clname)) == 0) { 1059 SETERROR(REG_ECTYPE); 1060 return; 1061 } 1062 CHaddtype(p, cs, wct); 1063 } 1064 1065 /* 1066 - p_b_eclass - parse an equivalence-class name and deal with it 1067 == static void p_b_eclass(struct parse *p, cset *cs); 1068 * 1069 * This implementation is incomplete. xxx 1070 */ 1071 static void 1072 p_b_eclass(struct parse *p, cset *cs) 1073 { 1074 wint_t c; 1075 1076 c = p_b_coll_elem(p, '='); 1077 CHadd(p, cs, c); 1078 } 1079 1080 /* 1081 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1082 == static wint_t p_b_symbol(struct parse *p); 1083 */ 1084 static wint_t /* value of symbol */ 1085 p_b_symbol(struct parse *p) 1086 { 1087 wint_t value; 1088 1089 (void)REQUIRE(MORE(), REG_EBRACK); 1090 if (!EATTWO('[', '.')) 1091 return(WGETNEXT()); 1092 1093 /* collating symbol */ 1094 value = p_b_coll_elem(p, '.'); 1095 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1096 return(value); 1097 } 1098 1099 /* 1100 - p_b_coll_elem - parse a collating-element name and look it up 1101 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1102 */ 1103 static wint_t /* value of collating element */ 1104 p_b_coll_elem(struct parse *p, 1105 wint_t endc) /* name ended by endc,']' */ 1106 { 1107 const char *sp = p->next; 1108 struct cname *cp; 1109 mbstate_t mbs; 1110 wchar_t wc; 1111 size_t clen, len; 1112 1113 while (MORE() && !SEETWO(endc, ']')) 1114 NEXT(); 1115 if (!MORE()) { 1116 SETERROR(REG_EBRACK); 1117 return(0); 1118 } 1119 len = p->next - sp; 1120 for (cp = cnames; cp->name != NULL; cp++) 1121 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1122 return(cp->code); /* known name */ 1123 memset(&mbs, 0, sizeof(mbs)); 1124 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1125 return (wc); /* single character */ 1126 else if (clen == (size_t)-1 || clen == (size_t)-2) 1127 SETERROR(REG_ILLSEQ); 1128 else 1129 SETERROR(REG_ECOLLATE); /* neither */ 1130 return(0); 1131 } 1132 1133 /* 1134 - may_escape - determine whether 'ch' is escape-able in the current context 1135 == static int may_escape(struct parse *p, const wint_t ch) 1136 */ 1137 static bool 1138 may_escape(struct parse *p, const wint_t ch) 1139 { 1140 1141 if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1142 return (true); 1143 if (isalpha(ch) || ch == '\'' || ch == '`') 1144 return (false); 1145 return (true); 1146 #ifdef NOTYET 1147 /* 1148 * Build a whitelist of characters that may be escaped to produce an 1149 * ordinary in the current context. This assumes that these have not 1150 * been otherwise interpreted as a special character. Escaping an 1151 * ordinary character yields undefined results according to 1152 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1153 * advantage of this and use escaped ordinary characters to provide 1154 * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1155 */ 1156 switch(ch) { 1157 case '|': 1158 case '+': 1159 case '?': 1160 /* The above characters may not be escaped in BREs */ 1161 if (!(p->g->cflags®_EXTENDED)) 1162 return (false); 1163 /* Fallthrough */ 1164 case '(': 1165 case ')': 1166 case '{': 1167 case '}': 1168 case '.': 1169 case '[': 1170 case ']': 1171 case '\\': 1172 case '*': 1173 case '^': 1174 case '$': 1175 return (true); 1176 default: 1177 return (false); 1178 } 1179 #endif 1180 } 1181 1182 /* 1183 - othercase - return the case counterpart of an alphabetic 1184 == static wint_t othercase(wint_t ch); 1185 */ 1186 static wint_t /* if no counterpart, return ch */ 1187 othercase(wint_t ch) 1188 { 1189 assert(iswalpha(ch)); 1190 if (iswupper(ch)) 1191 return(towlower(ch)); 1192 else if (iswlower(ch)) 1193 return(towupper(ch)); 1194 else /* peculiar, but could happen */ 1195 return(ch); 1196 } 1197 1198 /* 1199 - bothcases - emit a dualcase version of a two-case character 1200 == static void bothcases(struct parse *p, wint_t ch); 1201 * 1202 * Boy, is this implementation ever a kludge... 1203 */ 1204 static void 1205 bothcases(struct parse *p, wint_t ch) 1206 { 1207 const char *oldnext = p->next; 1208 const char *oldend = p->end; 1209 char bracket[3 + MB_LEN_MAX]; 1210 size_t n; 1211 mbstate_t mbs; 1212 1213 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1214 p->next = bracket; 1215 memset(&mbs, 0, sizeof(mbs)); 1216 n = wcrtomb(bracket, ch, &mbs); 1217 assert(n != (size_t)-1); 1218 bracket[n] = ']'; 1219 bracket[n + 1] = '\0'; 1220 p->end = bracket+n+1; 1221 p_bracket(p); 1222 assert(p->next == p->end); 1223 p->next = oldnext; 1224 p->end = oldend; 1225 } 1226 1227 /* 1228 - ordinary - emit an ordinary character 1229 == static void ordinary(struct parse *p, wint_t ch); 1230 */ 1231 static void 1232 ordinary(struct parse *p, wint_t ch) 1233 { 1234 cset *cs; 1235 1236 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1237 bothcases(p, ch); 1238 else if ((ch & OPDMASK) == ch) 1239 EMIT(OCHAR, ch); 1240 else { 1241 /* 1242 * Kludge: character is too big to fit into an OCHAR operand. 1243 * Emit a singleton set. 1244 */ 1245 if ((cs = allocset(p)) == NULL) 1246 return; 1247 CHadd(p, cs, ch); 1248 EMIT(OANYOF, (int)(cs - p->g->sets)); 1249 } 1250 } 1251 1252 /* 1253 - nonnewline - emit REG_NEWLINE version of OANY 1254 == static void nonnewline(struct parse *p); 1255 * 1256 * Boy, is this implementation ever a kludge... 1257 */ 1258 static void 1259 nonnewline(struct parse *p) 1260 { 1261 const char *oldnext = p->next; 1262 const char *oldend = p->end; 1263 char bracket[4]; 1264 1265 p->next = bracket; 1266 p->end = bracket+3; 1267 bracket[0] = '^'; 1268 bracket[1] = '\n'; 1269 bracket[2] = ']'; 1270 bracket[3] = '\0'; 1271 p_bracket(p); 1272 assert(p->next == bracket+3); 1273 p->next = oldnext; 1274 p->end = oldend; 1275 } 1276 1277 /* 1278 - repeat - generate code for a bounded repetition, recursively if needed 1279 == static void repeat(struct parse *p, sopno start, int from, int to); 1280 */ 1281 static void 1282 repeat(struct parse *p, 1283 sopno start, /* operand from here to end of strip */ 1284 int from, /* repeated from this number */ 1285 int to) /* to this number of times (maybe INFINITY) */ 1286 { 1287 sopno finish = HERE(); 1288 # define N 2 1289 # define INF 3 1290 # define REP(f, t) ((f)*8 + (t)) 1291 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1292 sopno copy; 1293 1294 if (p->error != 0) /* head off possible runaway recursion */ 1295 return; 1296 1297 assert(from <= to); 1298 1299 switch (REP(MAP(from), MAP(to))) { 1300 case REP(0, 0): /* must be user doing this */ 1301 DROP(finish-start); /* drop the operand */ 1302 break; 1303 case REP(0, 1): /* as x{1,1}? */ 1304 case REP(0, N): /* as x{1,n}? */ 1305 case REP(0, INF): /* as x{1,}? */ 1306 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1307 INSERT(OCH_, start); /* offset is wrong... */ 1308 repeat(p, start+1, 1, to); 1309 ASTERN(OOR1, start); 1310 AHEAD(start); /* ... fix it */ 1311 EMIT(OOR2, 0); 1312 AHEAD(THERE()); 1313 ASTERN(O_CH, THERETHERE()); 1314 break; 1315 case REP(1, 1): /* trivial case */ 1316 /* done */ 1317 break; 1318 case REP(1, N): /* as x?x{1,n-1} */ 1319 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1320 INSERT(OCH_, start); 1321 ASTERN(OOR1, start); 1322 AHEAD(start); 1323 EMIT(OOR2, 0); /* offset very wrong... */ 1324 AHEAD(THERE()); /* ...so fix it */ 1325 ASTERN(O_CH, THERETHERE()); 1326 copy = dupl(p, start+1, finish+1); 1327 assert(copy == finish+4); 1328 repeat(p, copy, 1, to-1); 1329 break; 1330 case REP(1, INF): /* as x+ */ 1331 INSERT(OPLUS_, start); 1332 ASTERN(O_PLUS, start); 1333 break; 1334 case REP(N, N): /* as xx{m-1,n-1} */ 1335 copy = dupl(p, start, finish); 1336 repeat(p, copy, from-1, to-1); 1337 break; 1338 case REP(N, INF): /* as xx{n-1,INF} */ 1339 copy = dupl(p, start, finish); 1340 repeat(p, copy, from-1, to); 1341 break; 1342 default: /* "can't happen" */ 1343 SETERROR(REG_ASSERT); /* just in case */ 1344 break; 1345 } 1346 } 1347 1348 /* 1349 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1350 - character from the parse struct, signals a REG_ILLSEQ error if the 1351 - character can't be converted. Returns the number of bytes consumed. 1352 */ 1353 static wint_t 1354 wgetnext(struct parse *p) 1355 { 1356 mbstate_t mbs; 1357 wchar_t wc; 1358 size_t n; 1359 1360 memset(&mbs, 0, sizeof(mbs)); 1361 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1362 if (n == (size_t)-1 || n == (size_t)-2) { 1363 SETERROR(REG_ILLSEQ); 1364 return (0); 1365 } 1366 if (n == 0) 1367 n = 1; 1368 p->next += n; 1369 return (wc); 1370 } 1371 1372 /* 1373 - seterr - set an error condition 1374 == static int seterr(struct parse *p, int e); 1375 */ 1376 static int /* useless but makes type checking happy */ 1377 seterr(struct parse *p, int e) 1378 { 1379 if (p->error == 0) /* keep earliest error condition */ 1380 p->error = e; 1381 p->next = nuls; /* try to bring things to a halt */ 1382 p->end = nuls; 1383 return(0); /* make the return value well-defined */ 1384 } 1385 1386 /* 1387 - allocset - allocate a set of characters for [] 1388 == static cset *allocset(struct parse *p); 1389 */ 1390 static cset * 1391 allocset(struct parse *p) 1392 { 1393 cset *cs, *ncs; 1394 1395 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1396 if (ncs == NULL) { 1397 SETERROR(REG_ESPACE); 1398 return (NULL); 1399 } 1400 p->g->sets = ncs; 1401 cs = &p->g->sets[p->g->ncsets++]; 1402 memset(cs, 0, sizeof(*cs)); 1403 1404 return(cs); 1405 } 1406 1407 /* 1408 - freeset - free a now-unused set 1409 == static void freeset(struct parse *p, cset *cs); 1410 */ 1411 static void 1412 freeset(struct parse *p, cset *cs) 1413 { 1414 cset *top = &p->g->sets[p->g->ncsets]; 1415 1416 free(cs->wides); 1417 free(cs->ranges); 1418 free(cs->types); 1419 memset(cs, 0, sizeof(*cs)); 1420 if (cs == top-1) /* recover only the easy case */ 1421 p->g->ncsets--; 1422 } 1423 1424 /* 1425 - singleton - Determine whether a set contains only one character, 1426 - returning it if so, otherwise returning OUT. 1427 */ 1428 static wint_t 1429 singleton(cset *cs) 1430 { 1431 wint_t i, s, n; 1432 1433 for (i = n = 0; i < NC; i++) 1434 if (CHIN(cs, i)) { 1435 n++; 1436 s = i; 1437 } 1438 if (n == 1) 1439 return (s); 1440 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1441 cs->icase == 0) 1442 return (cs->wides[0]); 1443 /* Don't bother handling the other cases. */ 1444 return (OUT); 1445 } 1446 1447 /* 1448 - CHadd - add character to character set. 1449 */ 1450 static void 1451 CHadd(struct parse *p, cset *cs, wint_t ch) 1452 { 1453 wint_t nch, *newwides; 1454 assert(ch >= 0); 1455 if (ch < NC) 1456 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1457 else { 1458 newwides = reallocarray(cs->wides, cs->nwides + 1, 1459 sizeof(*cs->wides)); 1460 if (newwides == NULL) { 1461 SETERROR(REG_ESPACE); 1462 return; 1463 } 1464 cs->wides = newwides; 1465 cs->wides[cs->nwides++] = ch; 1466 } 1467 if (cs->icase) { 1468 if ((nch = towlower(ch)) < NC) 1469 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1470 if ((nch = towupper(ch)) < NC) 1471 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1472 } 1473 } 1474 1475 /* 1476 - CHaddrange - add all characters in the range [min,max] to a character set. 1477 */ 1478 static void 1479 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1480 { 1481 crange *newranges; 1482 1483 for (; min < NC && min <= max; min++) 1484 CHadd(p, cs, min); 1485 if (min >= max) 1486 return; 1487 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1488 sizeof(*cs->ranges)); 1489 if (newranges == NULL) { 1490 SETERROR(REG_ESPACE); 1491 return; 1492 } 1493 cs->ranges = newranges; 1494 cs->ranges[cs->nranges].min = min; 1495 cs->ranges[cs->nranges].max = max; 1496 cs->nranges++; 1497 } 1498 1499 /* 1500 - CHaddtype - add all characters of a certain type to a character set. 1501 */ 1502 static void 1503 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1504 { 1505 wint_t i; 1506 wctype_t *newtypes; 1507 1508 for (i = 0; i < NC; i++) 1509 if (iswctype(i, wct)) 1510 CHadd(p, cs, i); 1511 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1512 sizeof(*cs->types)); 1513 if (newtypes == NULL) { 1514 SETERROR(REG_ESPACE); 1515 return; 1516 } 1517 cs->types = newtypes; 1518 cs->types[cs->ntypes++] = wct; 1519 } 1520 1521 /* 1522 - dupl - emit a duplicate of a bunch of sops 1523 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1524 */ 1525 static sopno /* start of duplicate */ 1526 dupl(struct parse *p, 1527 sopno start, /* from here */ 1528 sopno finish) /* to this less one */ 1529 { 1530 sopno ret = HERE(); 1531 sopno len = finish - start; 1532 1533 assert(finish >= start); 1534 if (len == 0) 1535 return(ret); 1536 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1537 return(ret); 1538 (void) memcpy((char *)(p->strip + p->slen), 1539 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1540 p->slen += len; 1541 return(ret); 1542 } 1543 1544 /* 1545 - doemit - emit a strip operator 1546 == static void doemit(struct parse *p, sop op, size_t opnd); 1547 * 1548 * It might seem better to implement this as a macro with a function as 1549 * hard-case backup, but it's just too big and messy unless there are 1550 * some changes to the data structures. Maybe later. 1551 */ 1552 static void 1553 doemit(struct parse *p, sop op, size_t opnd) 1554 { 1555 /* avoid making error situations worse */ 1556 if (p->error != 0) 1557 return; 1558 1559 /* deal with oversize operands ("can't happen", more or less) */ 1560 assert(opnd < 1<<OPSHIFT); 1561 1562 /* deal with undersized strip */ 1563 if (p->slen >= p->ssize) 1564 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1565 return; 1566 1567 /* finally, it's all reduced to the easy case */ 1568 p->strip[p->slen++] = SOP(op, opnd); 1569 } 1570 1571 /* 1572 - doinsert - insert a sop into the strip 1573 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1574 */ 1575 static void 1576 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1577 { 1578 sopno sn; 1579 sop s; 1580 int i; 1581 1582 /* avoid making error situations worse */ 1583 if (p->error != 0) 1584 return; 1585 1586 sn = HERE(); 1587 EMIT(op, opnd); /* do checks, ensure space */ 1588 assert(HERE() == sn+1); 1589 s = p->strip[sn]; 1590 1591 /* adjust paren pointers */ 1592 assert(pos > 0); 1593 for (i = 1; i < NPAREN; i++) { 1594 if (p->pbegin[i] >= pos) { 1595 p->pbegin[i]++; 1596 } 1597 if (p->pend[i] >= pos) { 1598 p->pend[i]++; 1599 } 1600 } 1601 1602 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1603 (HERE()-pos-1)*sizeof(sop)); 1604 p->strip[pos] = s; 1605 } 1606 1607 /* 1608 - dofwd - complete a forward reference 1609 == static void dofwd(struct parse *p, sopno pos, sop value); 1610 */ 1611 static void 1612 dofwd(struct parse *p, sopno pos, sop value) 1613 { 1614 /* avoid making error situations worse */ 1615 if (p->error != 0) 1616 return; 1617 1618 assert(value < 1<<OPSHIFT); 1619 p->strip[pos] = OP(p->strip[pos]) | value; 1620 } 1621 1622 /* 1623 - enlarge - enlarge the strip 1624 == static int enlarge(struct parse *p, sopno size); 1625 */ 1626 static int 1627 enlarge(struct parse *p, sopno size) 1628 { 1629 sop *sp; 1630 1631 if (p->ssize >= size) 1632 return 1; 1633 1634 sp = reallocarray(p->strip, size, sizeof(sop)); 1635 if (sp == NULL) { 1636 SETERROR(REG_ESPACE); 1637 return 0; 1638 } 1639 p->strip = sp; 1640 p->ssize = size; 1641 return 1; 1642 } 1643 1644 /* 1645 - stripsnug - compact the strip 1646 == static void stripsnug(struct parse *p, struct re_guts *g); 1647 */ 1648 static void 1649 stripsnug(struct parse *p, struct re_guts *g) 1650 { 1651 g->nstates = p->slen; 1652 g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop)); 1653 if (g->strip == NULL) { 1654 SETERROR(REG_ESPACE); 1655 g->strip = p->strip; 1656 } 1657 } 1658 1659 /* 1660 - findmust - fill in must and mlen with longest mandatory literal string 1661 == static void findmust(struct parse *p, struct re_guts *g); 1662 * 1663 * This algorithm could do fancy things like analyzing the operands of | 1664 * for common subsequences. Someday. This code is simple and finds most 1665 * of the interesting cases. 1666 * 1667 * Note that must and mlen got initialized during setup. 1668 */ 1669 static void 1670 findmust(struct parse *p, struct re_guts *g) 1671 { 1672 sop *scan; 1673 sop *start = NULL; 1674 sop *newstart = NULL; 1675 sopno newlen; 1676 sop s; 1677 char *cp; 1678 int offset; 1679 char buf[MB_LEN_MAX]; 1680 size_t clen; 1681 mbstate_t mbs; 1682 1683 /* avoid making error situations worse */ 1684 if (p->error != 0) 1685 return; 1686 1687 /* 1688 * It's not generally safe to do a ``char'' substring search on 1689 * multibyte character strings, but it's safe for at least 1690 * UTF-8 (see RFC 3629). 1691 */ 1692 if (MB_CUR_MAX > 1 && 1693 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1694 return; 1695 1696 /* find the longest OCHAR sequence in strip */ 1697 newlen = 0; 1698 offset = 0; 1699 g->moffset = 0; 1700 scan = g->strip + 1; 1701 do { 1702 s = *scan++; 1703 switch (OP(s)) { 1704 case OCHAR: /* sequence member */ 1705 if (newlen == 0) { /* new sequence */ 1706 memset(&mbs, 0, sizeof(mbs)); 1707 newstart = scan - 1; 1708 } 1709 clen = wcrtomb(buf, OPND(s), &mbs); 1710 if (clen == (size_t)-1) 1711 goto toohard; 1712 newlen += clen; 1713 break; 1714 case OPLUS_: /* things that don't break one */ 1715 case OLPAREN: 1716 case ORPAREN: 1717 break; 1718 case OQUEST_: /* things that must be skipped */ 1719 case OCH_: 1720 offset = altoffset(scan, offset); 1721 scan--; 1722 do { 1723 scan += OPND(s); 1724 s = *scan; 1725 /* assert() interferes w debug printouts */ 1726 if (OP(s) != (sop)O_QUEST && 1727 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { 1728 g->iflags |= BAD; 1729 return; 1730 } 1731 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1732 /* FALLTHROUGH */ 1733 case OBOW: /* things that break a sequence */ 1734 case OEOW: 1735 case OBOL: 1736 case OEOL: 1737 case O_QUEST: 1738 case O_CH: 1739 case OEND: 1740 if (newlen > (sopno)g->mlen) { /* ends one */ 1741 start = newstart; 1742 g->mlen = newlen; 1743 if (offset > -1) { 1744 g->moffset += offset; 1745 offset = newlen; 1746 } else 1747 g->moffset = offset; 1748 } else { 1749 if (offset > -1) 1750 offset += newlen; 1751 } 1752 newlen = 0; 1753 break; 1754 case OANY: 1755 if (newlen > (sopno)g->mlen) { /* ends one */ 1756 start = newstart; 1757 g->mlen = newlen; 1758 if (offset > -1) { 1759 g->moffset += offset; 1760 offset = newlen; 1761 } else 1762 g->moffset = offset; 1763 } else { 1764 if (offset > -1) 1765 offset += newlen; 1766 } 1767 if (offset > -1) 1768 offset++; 1769 newlen = 0; 1770 break; 1771 case OANYOF: /* may or may not invalidate offset */ 1772 /* First, everything as OANY */ 1773 if (newlen > (sopno)g->mlen) { /* ends one */ 1774 start = newstart; 1775 g->mlen = newlen; 1776 if (offset > -1) { 1777 g->moffset += offset; 1778 offset = newlen; 1779 } else 1780 g->moffset = offset; 1781 } else { 1782 if (offset > -1) 1783 offset += newlen; 1784 } 1785 if (offset > -1) 1786 offset++; 1787 newlen = 0; 1788 break; 1789 toohard: 1790 default: 1791 /* Anything here makes it impossible or too hard 1792 * to calculate the offset -- so we give up; 1793 * save the last known good offset, in case the 1794 * must sequence doesn't occur later. 1795 */ 1796 if (newlen > (sopno)g->mlen) { /* ends one */ 1797 start = newstart; 1798 g->mlen = newlen; 1799 if (offset > -1) 1800 g->moffset += offset; 1801 else 1802 g->moffset = offset; 1803 } 1804 offset = -1; 1805 newlen = 0; 1806 break; 1807 } 1808 } while (OP(s) != OEND); 1809 1810 if (g->mlen == 0) { /* there isn't one */ 1811 g->moffset = -1; 1812 return; 1813 } 1814 1815 /* turn it into a character string */ 1816 g->must = malloc((size_t)g->mlen + 1); 1817 if (g->must == NULL) { /* argh; just forget it */ 1818 g->mlen = 0; 1819 g->moffset = -1; 1820 return; 1821 } 1822 cp = g->must; 1823 scan = start; 1824 memset(&mbs, 0, sizeof(mbs)); 1825 while (cp < g->must + g->mlen) { 1826 while (OP(s = *scan++) != OCHAR) 1827 continue; 1828 clen = wcrtomb(cp, OPND(s), &mbs); 1829 assert(clen != (size_t)-1); 1830 cp += clen; 1831 } 1832 assert(cp == g->must + g->mlen); 1833 *cp++ = '\0'; /* just on general principles */ 1834 } 1835 1836 /* 1837 - altoffset - choose biggest offset among multiple choices 1838 == static int altoffset(sop *scan, int offset); 1839 * 1840 * Compute, recursively if necessary, the largest offset among multiple 1841 * re paths. 1842 */ 1843 static int 1844 altoffset(sop *scan, int offset) 1845 { 1846 int largest; 1847 int try; 1848 sop s; 1849 1850 /* If we gave up already on offsets, return */ 1851 if (offset == -1) 1852 return -1; 1853 1854 largest = 0; 1855 try = 0; 1856 s = *scan++; 1857 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { 1858 switch (OP(s)) { 1859 case OOR1: 1860 if (try > largest) 1861 largest = try; 1862 try = 0; 1863 break; 1864 case OQUEST_: 1865 case OCH_: 1866 try = altoffset(scan, try); 1867 if (try == -1) 1868 return -1; 1869 scan--; 1870 do { 1871 scan += OPND(s); 1872 s = *scan; 1873 if (OP(s) != (sop)O_QUEST && 1874 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) 1875 return -1; 1876 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1877 /* We must skip to the next position, or we'll 1878 * leave altoffset() too early. 1879 */ 1880 scan++; 1881 break; 1882 case OANYOF: 1883 case OCHAR: 1884 case OANY: 1885 try++; 1886 case OBOW: 1887 case OEOW: 1888 case OLPAREN: 1889 case ORPAREN: 1890 case OOR2: 1891 break; 1892 default: 1893 try = -1; 1894 break; 1895 } 1896 if (try == -1) 1897 return -1; 1898 s = *scan++; 1899 } 1900 1901 if (try > largest) 1902 largest = try; 1903 1904 return largest+offset; 1905 } 1906 1907 /* 1908 - computejumps - compute char jumps for BM scan 1909 == static void computejumps(struct parse *p, struct re_guts *g); 1910 * 1911 * This algorithm assumes g->must exists and is has size greater than 1912 * zero. It's based on the algorithm found on Computer Algorithms by 1913 * Sara Baase. 1914 * 1915 * A char jump is the number of characters one needs to jump based on 1916 * the value of the character from the text that was mismatched. 1917 */ 1918 static void 1919 computejumps(struct parse *p, struct re_guts *g) 1920 { 1921 int ch; 1922 int mindex; 1923 1924 /* Avoid making errors worse */ 1925 if (p->error != 0) 1926 return; 1927 1928 g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int)); 1929 if (g->charjump == NULL) /* Not a fatal error */ 1930 return; 1931 /* Adjust for signed chars, if necessary */ 1932 g->charjump = &g->charjump[-(CHAR_MIN)]; 1933 1934 /* If the character does not exist in the pattern, the jump 1935 * is equal to the number of characters in the pattern. 1936 */ 1937 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1938 g->charjump[ch] = g->mlen; 1939 1940 /* If the character does exist, compute the jump that would 1941 * take us to the last character in the pattern equal to it 1942 * (notice that we match right to left, so that last character 1943 * is the first one that would be matched). 1944 */ 1945 for (mindex = 0; mindex < g->mlen; mindex++) 1946 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1947 } 1948 1949 /* 1950 - computematchjumps - compute match jumps for BM scan 1951 == static void computematchjumps(struct parse *p, struct re_guts *g); 1952 * 1953 * This algorithm assumes g->must exists and is has size greater than 1954 * zero. It's based on the algorithm found on Computer Algorithms by 1955 * Sara Baase. 1956 * 1957 * A match jump is the number of characters one needs to advance based 1958 * on the already-matched suffix. 1959 * Notice that all values here are minus (g->mlen-1), because of the way 1960 * the search algorithm works. 1961 */ 1962 static void 1963 computematchjumps(struct parse *p, struct re_guts *g) 1964 { 1965 int mindex; /* General "must" iterator */ 1966 int suffix; /* Keeps track of matching suffix */ 1967 int ssuffix; /* Keeps track of suffixes' suffix */ 1968 int* pmatches; /* pmatches[k] points to the next i 1969 * such that i+1...mlen is a substring 1970 * of k+1...k+mlen-i-1 1971 */ 1972 1973 /* Avoid making errors worse */ 1974 if (p->error != 0) 1975 return; 1976 1977 pmatches = (int*) malloc(g->mlen * sizeof(int)); 1978 if (pmatches == NULL) { 1979 g->matchjump = NULL; 1980 return; 1981 } 1982 1983 g->matchjump = (int*) malloc(g->mlen * sizeof(int)); 1984 if (g->matchjump == NULL) { /* Not a fatal error */ 1985 free(pmatches); 1986 return; 1987 } 1988 1989 /* Set maximum possible jump for each character in the pattern */ 1990 for (mindex = 0; mindex < g->mlen; mindex++) 1991 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1992 1993 /* Compute pmatches[] */ 1994 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1995 mindex--, suffix--) { 1996 pmatches[mindex] = suffix; 1997 1998 /* If a mismatch is found, interrupting the substring, 1999 * compute the matchjump for that position. If no 2000 * mismatch is found, then a text substring mismatched 2001 * against the suffix will also mismatch against the 2002 * substring. 2003 */ 2004 while (suffix < g->mlen 2005 && g->must[mindex] != g->must[suffix]) { 2006 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2007 g->mlen - mindex - 1); 2008 suffix = pmatches[suffix]; 2009 } 2010 } 2011 2012 /* Compute the matchjump up to the last substring found to jump 2013 * to the beginning of the largest must pattern prefix matching 2014 * it's own suffix. 2015 */ 2016 for (mindex = 0; mindex <= suffix; mindex++) 2017 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2018 g->mlen + suffix - mindex); 2019 2020 ssuffix = pmatches[suffix]; 2021 while (suffix < g->mlen) { 2022 while (suffix <= ssuffix && suffix < g->mlen) { 2023 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2024 g->mlen + ssuffix - suffix); 2025 suffix++; 2026 } 2027 if (suffix < g->mlen) 2028 ssuffix = pmatches[ssuffix]; 2029 } 2030 2031 free(pmatches); 2032 } 2033 2034 /* 2035 - pluscount - count + nesting 2036 == static sopno pluscount(struct parse *p, struct re_guts *g); 2037 */ 2038 static sopno /* nesting depth */ 2039 pluscount(struct parse *p, struct re_guts *g) 2040 { 2041 sop *scan; 2042 sop s; 2043 sopno plusnest = 0; 2044 sopno maxnest = 0; 2045 2046 if (p->error != 0) 2047 return(0); /* there may not be an OEND */ 2048 2049 scan = g->strip + 1; 2050 do { 2051 s = *scan++; 2052 switch (OP(s)) { 2053 case OPLUS_: 2054 plusnest++; 2055 break; 2056 case O_PLUS: 2057 if (plusnest > maxnest) 2058 maxnest = plusnest; 2059 plusnest--; 2060 break; 2061 } 2062 } while (OP(s) != OEND); 2063 if (plusnest != 0) 2064 g->iflags |= BAD; 2065 return(maxnest); 2066 } 2067