1 /* $NetBSD: regcomp.c,v 1.49 2025/01/01 18:19:50 christos Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-3-Clause 5 * 6 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 7 * Copyright (c) 1992, 1993, 1994 8 * The Regents of the University of California. All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * All rights reserved. 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * Henry Spencer. 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 1. Redistributions of source code must retain the above copyright 22 * notice, this list of conditions and the following disclaimer. 23 * 2. Redistributions in binary form must reproduce the above copyright 24 * notice, this list of conditions and the following disclaimer in the 25 * documentation and/or other materials provided with the distribution. 26 * 3. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 41 * 42 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 43 */ 44 45 #if HAVE_NBTOOL_CONFIG_H 46 #include "nbtool_config.h" 47 #endif 48 49 #include <sys/cdefs.h> 50 #if 0 51 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 52 __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $"); 53 #endif 54 __RCSID("$NetBSD: regcomp.c,v 1.49 2025/01/01 18:19:50 christos Exp $"); 55 56 #ifndef LIBHACK 57 #define REGEX_GNU_EXTENSIONS 58 59 #include "namespace.h" 60 #endif 61 #include <sys/types.h> 62 #include <stdio.h> 63 #include <string.h> 64 #include <ctype.h> 65 #include <limits.h> 66 #include <stdlib.h> 67 #include <regex.h> 68 #include <stdbool.h> 69 70 #if defined(__weak_alias) && !defined(LIBHACK) 71 __weak_alias(regcomp,_regcomp) 72 #endif 73 74 #ifdef REGEX_LIBC_COLLATE 75 #include "collate.h" 76 #endif 77 78 #include "utils.h" 79 #include "regex2.h" 80 81 #include "cname.h" 82 83 /* 84 * Branching context, used to keep track of branch state for all of the branch- 85 * aware functions. In addition to keeping track of branch positions for the 86 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 87 * detection of whether ^ is acting as an anchor or being used erroneously and 88 * also for whether we're in a sub-expression or not. 89 */ 90 struct branchc { 91 sopno start; 92 sopno back; 93 sopno fwd; 94 95 int nbranch; 96 int nchain; 97 bool outer; 98 bool terminate; 99 }; 100 101 /* 102 * parse structure, passed up and down to avoid global variables and 103 * other clumsinesses 104 */ 105 struct parse { 106 const char *next; /* next character in RE */ 107 const char *end; /* end of string (-> NUL normally) */ 108 int error; /* has an error been seen? */ 109 int gnuext; 110 sop *strip; /* malloced strip */ 111 sopno ssize; /* malloced strip size (allocated) */ 112 sopno slen; /* malloced strip length (used) */ 113 size_t ncsalloc; /* number of csets allocated */ 114 struct re_guts *g; 115 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 116 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 117 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 118 bool allowbranch; /* can this expression branch? */ 119 bool bre; /* convenience; is this a BRE? */ 120 int pflags; /* other parsing flags -- legacy escapes? */ 121 bool (*parse_expr)(struct parse *, struct branchc *); 122 void (*pre_parse)(struct parse *, struct branchc *); 123 void (*post_parse)(struct parse *, struct branchc *); 124 }; 125 126 #define PFLAG_LEGACY_ESC 0x00000001 127 128 /* ========= begin header generated by ./mkh ========= */ 129 #ifdef __cplusplus 130 extern "C" { 131 #endif 132 133 /* === regcomp.c === */ 134 static bool p_ere_exp(struct parse *p, struct branchc *bc); 135 static void p_str(struct parse *p); 136 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 137 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 138 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 139 static bool p_branch_empty(struct parse *p, struct branchc *bc); 140 static bool p_branch_do(struct parse *p, struct branchc *bc); 141 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 142 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 143 static void p_re(struct parse *p, int end1, int end2); 144 static bool p_simp_re(struct parse *p, struct branchc *bc); 145 static int p_count(struct parse *p); 146 static void p_bracket(struct parse *p); 147 static int p_range_cmp(wchar_t c1, wchar_t c2); 148 static void p_b_term(struct parse *p, cset *cs); 149 #ifdef REGEX_GNU_EXTENSIONS 150 static int p_b_pseudoclass(struct parse *p, char c); 151 #endif 152 static void p_b_cclass(struct parse *p, cset *cs); 153 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); 154 static void p_b_eclass(struct parse *p, cset *cs); 155 static wint_t p_b_symbol(struct parse *p); 156 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 157 static bool may_escape(struct parse *p, const wint_t ch); 158 static wint_t othercase(wint_t ch); 159 static void bothcases(struct parse *p, wint_t ch); 160 static void ordinary(struct parse *p, wint_t ch); 161 static void nonnewline(struct parse *p); 162 static void repeat(struct parse *p, sopno start, int from, int to); 163 static int seterr(struct parse *p, int e); 164 static cset *allocset(struct parse *p); 165 static void freeset(struct parse *p, cset *cs); 166 static void CHadd(struct parse *p, cset *cs, wint_t ch); 167 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 168 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 169 static wint_t singleton(cset *cs); 170 static sopno dupl(struct parse *p, sopno start, sopno finish); 171 static void doemit(struct parse *p, sop op, size_t opnd); 172 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 173 static void dofwd(struct parse *p, sopno pos, sop value); 174 static int enlarge(struct parse *p, sopno size); 175 static void stripsnug(struct parse *p, struct re_guts *g); 176 static void findmust(struct parse *p, struct re_guts *g); 177 static int altoffset(sop *scan, int offset); 178 static void computejumps(struct parse *p, struct re_guts *g); 179 static void computematchjumps(struct parse *p, struct re_guts *g); 180 static sopno pluscount(struct parse *p, struct re_guts *g); 181 static wint_t wgetnext(struct parse *p); 182 183 #ifdef __cplusplus 184 } 185 #endif 186 /* ========= end header generated by ./mkh ========= */ 187 188 static char nuls[10]; /* place to point scanner in event of error */ 189 190 /* 191 * macros for use with parse structure 192 * BEWARE: these know that the parse structure is named `p' !!! 193 */ 194 #define PEEK() (*p->next) 195 #define PEEK2() (*(p->next+1)) 196 #define MORE() (p->next < p->end) 197 #define MORE2() (p->next+1 < p->end) 198 #define SEE(c) (MORE() && PEEK() == (c)) 199 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 200 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 201 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 202 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 203 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) 204 #define NEXT() (p->next++) 205 #define NEXT2() (p->next += 2) 206 #define NEXTn(n) (p->next += (n)) 207 #define GETNEXT() (*p->next++) 208 #define WGETNEXT() wgetnext(p) 209 #define SETERROR(e) seterr(p, (e)) 210 #define REQUIRE(co, e) ((co) || SETERROR(e)) 211 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 212 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 213 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 214 #define EMIT(op, sopnd) doemit(p, (op), (sopnd)) 215 #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos) 216 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 217 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 218 #define HERE() (p->slen) 219 #define THERE() (p->slen - 1) 220 #define THERETHERE() (p->slen - 2) 221 #define DROP(n) (p->slen -= (n)) 222 223 /* Macro used by computejump()/computematchjump() */ 224 #ifndef MIN 225 #define MIN(a,b) ((a)<(b)?(a):(b)) 226 #endif 227 228 #ifndef NLS 229 static const struct { 230 const char *name; 231 int (*func)(int); 232 } wctypes[] = { 233 #define ADD(x) { .name = # x, .func = is ## x } 234 ADD(alnum), 235 ADD(alpha), 236 ADD(blank), 237 ADD(cntrl), 238 ADD(digit), 239 ADD(graph), 240 ADD(lower), 241 ADD(print), 242 ADD(punct), 243 ADD(space), 244 ADD(upper), 245 ADD(xdigit), 246 #undef ADD 247 }; 248 249 wctype_t 250 __regex_wctype(const char *str) 251 { 252 for (size_t i = 0; i < __arraycount(wctypes); i++) { 253 if (strcmp(wctypes[i].name, str) == 0) 254 return (wctype_t)(i + 1); 255 } 256 return (wctype_t)0; 257 } 258 259 int 260 __regex_iswctype(wint_t c, wctype_t ct) 261 { 262 if (ct == 0) 263 return 0; 264 return (*wctypes[ct - 1].func)(c); 265 } 266 #endif 267 268 static int /* 0 success, otherwise REG_something */ 269 regcomp_internal(regex_t * __restrict preg, 270 const char * __restrict pattern, 271 int cflags, int pflags) 272 { 273 struct parse pa; 274 struct re_guts *g; 275 struct parse *p = &pa; 276 int i; 277 size_t len; 278 size_t maxlen; 279 #ifdef REDEBUG 280 # define GOODFLAGS(f) (f) 281 #else 282 # define GOODFLAGS(f) ((f)&~REG_DUMP) 283 #endif 284 285 _DIAGASSERT(preg != NULL); 286 _DIAGASSERT(pattern != NULL); 287 288 cflags = GOODFLAGS(cflags); 289 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 290 return(REG_INVARG); 291 292 if (cflags®_PEND) { 293 if (preg->re_endp < pattern) 294 return(REG_INVARG); 295 len = preg->re_endp - pattern; 296 } else 297 len = strlen(pattern); 298 299 /* do the mallocs early so failure handling is easy */ 300 g = malloc(sizeof(*g)); 301 if (g == NULL) 302 return(REG_ESPACE); 303 /* 304 * Limit the pattern space to avoid a 32-bit overflow on buffer 305 * extension. Also avoid any signed overflow in case of conversion 306 * so make the real limit based on a 31-bit overflow. 307 * 308 * Likely not applicable on 64-bit systems but handle the case 309 * generically (who are we to stop people from using ~715MB+ 310 * patterns?). 311 */ 312 maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3; 313 if (len >= maxlen) { 314 free(g); 315 return(REG_ESPACE); 316 } 317 p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */ 318 assert(p->ssize >= len); 319 320 p->strip = calloc(p->ssize, sizeof(*p->strip)); 321 p->slen = 0; 322 if (p->strip == NULL) { 323 free(g); 324 return(REG_ESPACE); 325 } 326 327 /* set things up */ 328 p->g = g; 329 p->next = pattern; /* convenience; we do not modify it */ 330 p->end = p->next + len; 331 p->error = 0; 332 p->ncsalloc = 0; 333 p->pflags = pflags; 334 for (i = 0; i < NPAREN; i++) { 335 p->pbegin[i] = 0; 336 p->pend[i] = 0; 337 } 338 #ifdef REGEX_GNU_EXTENSIONS 339 if ((cflags & REG_GNU) == 0) { 340 p->gnuext = false; 341 p->allowbranch = (cflags & REG_EXTENDED) != 0; 342 } else 343 p->gnuext = p->allowbranch = true; 344 #else 345 p->gnuext = false; 346 p->allowbranch = (cflags & REG_EXTENDED) != 0; 347 #endif 348 if (cflags & REG_EXTENDED) { 349 p->bre = false; 350 p->parse_expr = p_ere_exp; 351 p->pre_parse = NULL; 352 p->post_parse = NULL; 353 } else { 354 p->bre = true; 355 p->parse_expr = p_simp_re; 356 p->pre_parse = p_bre_pre_parse; 357 p->post_parse = p_bre_post_parse; 358 } 359 g->sets = NULL; 360 g->ncsets = 0; 361 g->cflags = cflags; 362 g->iflags = 0; 363 g->nbol = 0; 364 g->neol = 0; 365 g->must = NULL; 366 g->moffset = -1; 367 g->charjump = NULL; 368 g->matchjump = NULL; 369 g->mlen = 0; 370 g->nsub = 0; 371 g->backrefs = 0; 372 373 /* do it */ 374 EMIT(OEND, 0); 375 g->firststate = THERE(); 376 if (cflags & REG_NOSPEC) 377 p_str(p); 378 else 379 p_re(p, OUT, OUT); 380 EMIT(OEND, 0); 381 g->laststate = THERE(); 382 383 /* tidy up loose ends and fill things in */ 384 stripsnug(p, g); 385 findmust(p, g); 386 /* only use Boyer-Moore algorithm if the pattern is bigger 387 * than three characters 388 */ 389 if(g->mlen > 3) { 390 computejumps(p, g); 391 computematchjumps(p, g); 392 if(g->matchjump == NULL && g->charjump != NULL) { 393 free(g->charjump); 394 g->charjump = NULL; 395 } 396 } 397 g->nplus = pluscount(p, g); 398 g->magic = MAGIC2; 399 preg->re_nsub = g->nsub; 400 preg->re_g = g; 401 preg->re_magic = MAGIC1; 402 #ifndef REDEBUG 403 /* not debugging, so can't rely on the assert() in regexec() */ 404 if (g->iflags&BAD) 405 SETERROR(REG_ASSERT); 406 #endif 407 408 /* win or lose, we're done */ 409 if (p->error != 0) /* lose */ 410 regfree(preg); 411 return(p->error); 412 } 413 414 /* 415 - regcomp - interface for parser and compilation 416 = extern int regcomp(regex_t *, const char *, int); 417 = #define REG_BASIC 0000 418 = #define REG_EXTENDED 0001 419 = #define REG_ICASE 0002 420 = #define REG_NOSUB 0004 421 = #define REG_NEWLINE 0010 422 = #define REG_NOSPEC 0020 423 = #define REG_PEND 0040 424 = #define REG_DUMP 0200 425 */ 426 int /* 0 success, otherwise REG_something */ 427 regcomp(regex_t * __restrict preg, 428 const char * __restrict pattern, 429 int cflags) 430 { 431 432 return (regcomp_internal(preg, pattern, cflags, 0)); 433 } 434 435 /* 436 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 437 - return whether we should terminate or not 438 == static bool p_ere_exp(struct parse *p); 439 */ 440 static bool 441 p_ere_exp(struct parse *p, struct branchc *bc) 442 { 443 char c; 444 wint_t wc; 445 sopno pos; 446 int count; 447 int count2; 448 #ifdef REGEX_GNU_EXTENSIONS 449 size_t i; 450 int handled; 451 #endif 452 sopno subno; 453 int wascaret = 0; 454 455 _DIAGASSERT(p != NULL); 456 457 (void)bc; 458 assert(MORE()); /* caller should have ensured this */ 459 c = GETNEXT(); 460 461 #ifdef REGEX_GNU_EXTENSIONS 462 handled = 0; 463 #endif 464 pos = HERE(); 465 switch (c) { 466 case '(': 467 (void)REQUIRE(MORE(), REG_EPAREN); 468 p->g->nsub++; 469 subno = (sopno)p->g->nsub; 470 if (subno < NPAREN) 471 p->pbegin[subno] = HERE(); 472 EMIT(OLPAREN, subno); 473 if (!SEE(')')) 474 p_re(p, ')', IGN); 475 if (subno < NPAREN) { 476 p->pend[subno] = HERE(); 477 assert(p->pend[subno] != 0); 478 } 479 EMIT(ORPAREN, subno); 480 (void)MUSTEAT(')', REG_EPAREN); 481 break; 482 #ifndef POSIX_MISTAKE 483 case ')': /* happens only if no current unmatched ( */ 484 /* 485 * You may ask, why the ifndef? Because I didn't notice 486 * this until slightly too late for 1003.2, and none of the 487 * other 1003.2 regular-expression reviewers noticed it at 488 * all. So an unmatched ) is legal POSIX, at least until 489 * we can get it fixed. 490 */ 491 SETERROR(REG_EPAREN); 492 break; 493 #endif 494 case '^': 495 EMIT(OBOL, 0); 496 p->g->iflags |= USEBOL; 497 p->g->nbol++; 498 wascaret = 1; 499 break; 500 case '$': 501 EMIT(OEOL, 0); 502 p->g->iflags |= USEEOL; 503 p->g->neol++; 504 break; 505 case '|': 506 SETERROR(REG_EMPTY); 507 break; 508 case '*': 509 case '+': 510 case '?': 511 case '{': 512 SETERROR(REG_BADRPT); 513 break; 514 case '.': 515 if (p->g->cflags®_NEWLINE) 516 nonnewline(p); 517 else 518 EMIT(OANY, 0); 519 break; 520 case '[': 521 p_bracket(p); 522 break; 523 case '\\': 524 (void)REQUIRE(MORE(), REG_EESCAPE); 525 wc = WGETNEXT(); 526 #ifdef REGEX_GNU_EXTENSIONS 527 if (p->gnuext) { 528 handled = 1; 529 switch (wc) { 530 case '`': 531 EMIT(OBOS, 0); 532 break; 533 case '\'': 534 EMIT(OEOS, 0); 535 break; 536 case 'B': 537 EMIT(ONWBND, 0); 538 break; 539 case 'b': 540 EMIT(OWBND, 0); 541 break; 542 case 'W': 543 case 'w': 544 case 'S': 545 case 's': 546 p_b_pseudoclass(p, wc); 547 break; 548 case 'a': 549 ordinary(p, '\a'); 550 break; 551 case 'e': 552 ordinary(p, '\e'); 553 break; 554 case 'f': 555 ordinary(p, '\f'); 556 break; 557 case 'n': 558 ordinary(p, '\n'); 559 break; 560 case 'r': 561 ordinary(p, '\r'); 562 break; 563 case 't': 564 ordinary(p, '\t'); 565 break; 566 case 'v': 567 ordinary(p, '\v'); 568 break; 569 case '1': 570 case '2': 571 case '3': 572 case '4': 573 case '5': 574 case '6': 575 case '7': 576 case '8': 577 case '9': 578 i = wc - '0'; 579 assert(i < NPAREN); 580 if (p->pend[i] != 0) { 581 assert(i <= p->g->nsub); 582 EMIT(OBACK_, i); 583 assert(p->pbegin[i] != 0); 584 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 585 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 586 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 587 EMIT(O_BACK, i); 588 } else 589 SETERROR(REG_ESUBREG); 590 p->g->backrefs = 1; 591 break; 592 default: 593 handled = 0; 594 } 595 /* Don't proceed to the POSIX bits if we've already handled it */ 596 if (handled) 597 break; 598 } 599 #endif 600 switch (wc) { 601 case '<': 602 EMIT(OBOW, 0); 603 break; 604 case '>': 605 EMIT(OEOW, 0); 606 break; 607 default: 608 if (may_escape(p, wc)) 609 ordinary(p, wc); 610 else 611 SETERROR(REG_EESCAPE); 612 break; 613 } 614 break; 615 default: 616 if (p->error != 0) 617 return (false); 618 p->next--; 619 wc = WGETNEXT(); 620 ordinary(p, wc); 621 break; 622 } 623 624 if (!MORE()) 625 return (false); 626 c = PEEK(); 627 /* we call { a repetition if followed by a digit */ 628 if (!( c == '*' || c == '+' || c == '?' || c == '{')) 629 return (false); /* no repetition, we're done */ 630 else if (c == '{') 631 (void)REQUIRE(MORE2() && \ 632 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 633 NEXT(); 634 635 (void)REQUIRE(!wascaret, REG_BADRPT); 636 switch (c) { 637 case '*': /* implemented as +? */ 638 /* this case does not require the (y|) trick, noKLUDGE */ 639 INSERT(OPLUS_, pos); 640 ASTERN(O_PLUS, pos); 641 INSERT(OQUEST_, pos); 642 ASTERN(O_QUEST, pos); 643 break; 644 case '+': 645 INSERT(OPLUS_, pos); 646 ASTERN(O_PLUS, pos); 647 break; 648 case '?': 649 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 650 INSERT(OCH_, pos); /* offset slightly wrong */ 651 ASTERN(OOR1, pos); /* this one's right */ 652 AHEAD(pos); /* fix the OCH_ */ 653 EMIT(OOR2, 0); /* offset very wrong... */ 654 AHEAD(THERE()); /* ...so fix it */ 655 ASTERN(O_CH, THERETHERE()); 656 break; 657 case '{': 658 count = p_count(p); 659 if (EAT(',')) { 660 if (isdigit((uch)PEEK())) { 661 count2 = p_count(p); 662 (void)REQUIRE(count <= count2, REG_BADBR); 663 } else /* single number with comma */ 664 count2 = INFINITY; 665 } else /* just a single number */ 666 count2 = count; 667 repeat(p, pos, count, count2); 668 if (!EAT('}')) { /* error heuristics */ 669 while (MORE() && PEEK() != '}') 670 NEXT(); 671 (void)REQUIRE(MORE(), REG_EBRACE); 672 SETERROR(REG_BADBR); 673 } 674 break; 675 } 676 677 if (!MORE()) 678 return (false); 679 c = PEEK(); 680 if (!( c == '*' || c == '+' || c == '?' || 681 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 682 return (false); 683 SETERROR(REG_BADRPT); 684 return (false); 685 } 686 687 /* 688 - p_str - string (no metacharacters) "parser" 689 == static void p_str(struct parse *p); 690 */ 691 static void 692 p_str(struct parse *p) 693 { 694 (void)REQUIRE(MORE(), REG_EMPTY); 695 while (MORE()) 696 ordinary(p, WGETNEXT()); 697 } 698 699 /* 700 * Eat consecutive branch delimiters for the kind of expression that we are 701 * parsing, return the number of delimiters that we ate. 702 */ 703 static int 704 p_branch_eat_delim(struct parse *p, struct branchc *bc) 705 { 706 int nskip; 707 708 (void)bc; 709 nskip = 0; 710 while (EATSPEC('|')) 711 ++nskip; 712 return (nskip); 713 } 714 715 /* 716 * Insert necessary branch book-keeping operations. This emits a 717 * bogus 'next' offset, since we still have more to parse 718 */ 719 static void 720 p_branch_ins_offset(struct parse *p, struct branchc *bc) 721 { 722 723 if (bc->nbranch == 0) { 724 INSERT(OCH_, bc->start); /* offset is wrong */ 725 bc->fwd = bc->start; 726 bc->back = bc->start; 727 } 728 729 ASTERN(OOR1, bc->back); 730 bc->back = THERE(); 731 AHEAD(bc->fwd); /* fix previous offset */ 732 bc->fwd = HERE(); 733 EMIT(OOR2, 0); /* offset is very wrong */ 734 ++bc->nbranch; 735 } 736 737 /* 738 * Fix the offset of the tail branch, if we actually had any branches. 739 * This is to correct the bogus placeholder offset that we use. 740 */ 741 static void 742 p_branch_fix_tail(struct parse *p, struct branchc *bc) 743 { 744 745 /* Fix bogus offset at the tail if we actually have branches */ 746 if (bc->nbranch > 0) { 747 AHEAD(bc->fwd); 748 ASTERN(O_CH, bc->back); 749 } 750 } 751 752 /* 753 * Signal to the parser that an empty branch has been encountered; this will, 754 * in the future, be used to allow for more permissive behavior with empty 755 * branches. The return value should indicate whether parsing may continue 756 * or not. 757 */ 758 static bool 759 p_branch_empty(struct parse *p, struct branchc *bc) 760 { 761 762 (void)bc; 763 SETERROR(REG_EMPTY); 764 return (false); 765 } 766 767 /* 768 * Take care of any branching requirements. This includes inserting the 769 * appropriate branching instructions as well as eating all of the branch 770 * delimiters until we either run out of pattern or need to parse more pattern. 771 */ 772 static bool 773 p_branch_do(struct parse *p, struct branchc *bc) 774 { 775 int ate = 0; 776 777 ate = p_branch_eat_delim(p, bc); 778 if (ate == 0) 779 return (false); 780 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 781 /* 782 * Halt parsing only if we have an empty branch and p_branch_empty 783 * indicates that we must not continue. In the future, this will not 784 * necessarily be an error. 785 */ 786 return (false); 787 p_branch_ins_offset(p, bc); 788 789 return (true); 790 } 791 792 static void 793 p_bre_pre_parse(struct parse *p, struct branchc *bc) 794 { 795 796 (void)bc; 797 /* 798 * Does not move cleanly into expression parser because of 799 * ordinary interpration of * at the beginning position of 800 * an expression. 801 */ 802 if (EAT('^')) { 803 EMIT(OBOL, 0); 804 p->g->iflags |= USEBOL; 805 p->g->nbol++; 806 } 807 } 808 809 static void 810 p_bre_post_parse(struct parse *p, struct branchc *bc) 811 { 812 813 /* Expression is terminating due to EOL token */ 814 if (bc->terminate) { 815 DROP(1); 816 EMIT(OEOL, 0); 817 p->g->iflags |= USEEOL; 818 p->g->neol++; 819 } 820 } 821 822 /* 823 - p_re - Top level parser, concatenation and BRE anchoring 824 == static void p_re(struct parse *p, int end1, int end2); 825 * Giving end1 as OUT essentially eliminates the end1/end2 check. 826 * 827 * This implementation is a bit of a kludge, in that a trailing $ is first 828 * taken as an ordinary character and then revised to be an anchor. 829 * The amount of lookahead needed to avoid this kludge is excessive. 830 */ 831 static void 832 p_re(struct parse *p, 833 int end1, /* first terminating character */ 834 int end2) /* second terminating character; ignored for EREs */ 835 { 836 struct branchc bc; 837 838 bc.nbranch = 0; 839 if (end1 == OUT && end2 == OUT) 840 bc.outer = true; 841 else 842 bc.outer = false; 843 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 844 for (;;) { 845 bc.start = HERE(); 846 bc.nchain = 0; 847 bc.terminate = false; 848 if (p->pre_parse != NULL) 849 p->pre_parse(p, &bc); 850 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 851 bc.terminate = p->parse_expr(p, &bc); 852 ++bc.nchain; 853 } 854 if (p->post_parse != NULL) 855 p->post_parse(p, &bc); 856 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); 857 #ifdef REGEX_GNU_EXTENSIONS 858 if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc)) 859 break; 860 #endif 861 if (!p->allowbranch) 862 break; 863 /* 864 * p_branch_do's return value indicates whether we should 865 * continue parsing or not. This is both for correctness and 866 * a slight optimization, because it will check if we've 867 * encountered an empty branch or the end of the string 868 * immediately following a branch delimiter. 869 */ 870 if (!p_branch_do(p, &bc)) 871 break; 872 } 873 #undef SEE_END 874 if (p->allowbranch) 875 p_branch_fix_tail(p, &bc); 876 assert(!MORE() || SEE(end1)); 877 } 878 879 /* 880 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 881 == static bool p_simp_re(struct parse *p, struct branchc *bc); 882 */ 883 static bool /* was the simple RE an unbackslashed $? */ 884 p_simp_re(struct parse *p, struct branchc *bc) 885 { 886 int c; 887 int cc; /* convenient/control character */ 888 int count; 889 int count2; 890 sopno pos; 891 bool handled; 892 size_t i; 893 wint_t wc; 894 sopno subno; 895 # define BACKSL (1<<CHAR_BIT) 896 897 pos = HERE(); /* repetition op, if any, covers from here */ 898 handled = false; 899 900 assert(MORE()); /* caller should have ensured this */ 901 c = (uch)GETNEXT(); 902 if (c == '\\') { 903 (void)REQUIRE(MORE(), REG_EESCAPE); 904 cc = (uch)GETNEXT(); 905 c = BACKSL | cc; 906 #ifdef REGEX_GNU_EXTENSIONS 907 if (p->gnuext) { 908 handled = true; 909 switch (c) { 910 case BACKSL|'`': 911 EMIT(OBOS, 0); 912 break; 913 case BACKSL|'\'': 914 EMIT(OEOS, 0); 915 break; 916 case BACKSL|'B': 917 EMIT(ONWBND, 0); 918 break; 919 case BACKSL|'b': 920 EMIT(OWBND, 0); 921 break; 922 case BACKSL|'W': 923 case BACKSL|'w': 924 case BACKSL|'S': 925 case BACKSL|'s': 926 p_b_pseudoclass(p, cc); 927 break; 928 case BACKSL|'a': 929 ordinary(p, '\a'); 930 break; 931 case BACKSL|'e': 932 ordinary(p, '\e'); 933 break; 934 case BACKSL|'f': 935 ordinary(p, '\f'); 936 break; 937 case BACKSL|'n': 938 ordinary(p, '\n'); 939 break; 940 case BACKSL|'r': 941 ordinary(p, '\r'); 942 break; 943 case BACKSL|'t': 944 ordinary(p, '\t'); 945 break; 946 case BACKSL|'v': 947 ordinary(p, '\v'); 948 break; 949 default: 950 handled = false; 951 } 952 } 953 #endif 954 } 955 if (!handled) { 956 switch (c) { 957 case '.': 958 if (p->g->cflags®_NEWLINE) 959 nonnewline(p); 960 else 961 EMIT(OANY, 0); 962 break; 963 case '[': 964 p_bracket(p); 965 break; 966 case BACKSL|'<': 967 EMIT(OBOW, 0); 968 break; 969 case BACKSL|'>': 970 EMIT(OEOW, 0); 971 break; 972 case BACKSL|'{': 973 SETERROR(REG_BADRPT); 974 break; 975 case BACKSL|'(': 976 p->g->nsub++; 977 subno = (sopno)p->g->nsub; 978 if (subno < NPAREN) 979 p->pbegin[subno] = HERE(); 980 EMIT(OLPAREN, subno); 981 /* the MORE here is an error heuristic */ 982 if (MORE() && !SEETWO('\\', ')')) 983 p_re(p, '\\', ')'); 984 if (subno < NPAREN) { 985 p->pend[subno] = HERE(); 986 assert(p->pend[subno] != 0); 987 } 988 EMIT(ORPAREN, subno); 989 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 990 break; 991 case BACKSL|')': /* should not get here -- must be user */ 992 SETERROR(REG_EPAREN); 993 break; 994 case BACKSL|'1': 995 case BACKSL|'2': 996 case BACKSL|'3': 997 case BACKSL|'4': 998 case BACKSL|'5': 999 case BACKSL|'6': 1000 case BACKSL|'7': 1001 case BACKSL|'8': 1002 case BACKSL|'9': 1003 i = (c&~BACKSL) - '0'; 1004 assert(i < NPAREN); 1005 if (p->pend[i] != 0) { 1006 assert(i <= p->g->nsub); 1007 EMIT(OBACK_, i); 1008 assert(p->pbegin[i] != 0); 1009 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 1010 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 1011 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 1012 EMIT(O_BACK, i); 1013 } else 1014 SETERROR(REG_ESUBREG); 1015 p->g->backrefs = 1; 1016 break; 1017 case '*': 1018 /* 1019 * Ordinary if used as the first character beyond BOL anchor of 1020 * a (sub-)expression, counts as a bad repetition operator if it 1021 * appears otherwise. 1022 */ 1023 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 1024 /* FALLTHROUGH */ 1025 default: 1026 if (p->error != 0) 1027 return (false); /* Definitely not $... */ 1028 p->next--; 1029 wc = WGETNEXT(); 1030 if ((c & BACKSL) == 0 || may_escape(p, wc)) 1031 ordinary(p, wc); 1032 else 1033 SETERROR(REG_EESCAPE); 1034 break; 1035 } 1036 } 1037 1038 if (EAT('*')) { /* implemented as +? */ 1039 /* this case does not require the (y|) trick, noKLUDGE */ 1040 INSERT(OPLUS_, pos); 1041 ASTERN(O_PLUS, pos); 1042 INSERT(OQUEST_, pos); 1043 ASTERN(O_QUEST, pos); 1044 #ifdef REGEX_GNU_EXTENSIONS 1045 } else if (p->gnuext && EATTWO('\\', '?')) { 1046 INSERT(OQUEST_, pos); 1047 ASTERN(O_QUEST, pos); 1048 } else if (p->gnuext && EATTWO('\\', '+')) { 1049 INSERT(OPLUS_, pos); 1050 ASTERN(O_PLUS, pos); 1051 #endif 1052 } else if (EATTWO('\\', '{')) { 1053 count = p_count(p); 1054 if (EAT(',')) { 1055 if (MORE() && isdigit((uch)PEEK())) { 1056 count2 = p_count(p); 1057 (void)REQUIRE(count <= count2, REG_BADBR); 1058 } else /* single number with comma */ 1059 count2 = INFINITY; 1060 } else /* just a single number */ 1061 count2 = count; 1062 repeat(p, pos, count, count2); 1063 if (!EATTWO('\\', '}')) { /* error heuristics */ 1064 while (MORE() && !SEETWO('\\', '}')) 1065 NEXT(); 1066 (void)REQUIRE(MORE(), REG_EBRACE); 1067 SETERROR(REG_BADBR); 1068 } 1069 } else if (c == '$') /* $ (but not \$) ends it */ 1070 return (true); 1071 1072 return (false); 1073 } 1074 1075 /* 1076 - p_count - parse a repetition count 1077 == static int p_count(struct parse *p); 1078 */ 1079 static int /* the value */ 1080 p_count(struct parse *p) 1081 { 1082 int count = 0; 1083 int ndigits = 0; 1084 1085 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 1086 count = count*10 + ((uch)GETNEXT() - '0'); 1087 ndigits++; 1088 } 1089 1090 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 1091 return(count); 1092 } 1093 1094 /* 1095 - p_bracket - parse a bracketed character list 1096 == static void p_bracket(struct parse *p); 1097 */ 1098 static void 1099 p_bracket(struct parse *p) 1100 { 1101 cset *cs; 1102 wint_t ch; 1103 1104 /* Dept of Truly Sickening Special-Case Kludges */ 1105 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 1106 EMIT(OBOW, 0); 1107 NEXTn(6); 1108 return; 1109 } 1110 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 1111 EMIT(OEOW, 0); 1112 NEXTn(6); 1113 return; 1114 } 1115 1116 if ((cs = allocset(p)) == NULL) 1117 return; 1118 1119 if (p->g->cflags®_ICASE) 1120 cs->icase = 1; 1121 if (EAT('^')) 1122 cs->invert = 1; 1123 if (EAT(']')) 1124 CHadd(p, cs, ']'); 1125 else if (EAT('-')) 1126 CHadd(p, cs, '-'); 1127 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 1128 p_b_term(p, cs); 1129 if (EAT('-')) 1130 CHadd(p, cs, '-'); 1131 (void)MUSTEAT(']', REG_EBRACK); 1132 1133 if (p->error != 0) /* don't mess things up further */ 1134 return; 1135 1136 if (cs->invert && p->g->cflags®_NEWLINE) 1137 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 1138 1139 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 1140 ordinary(p, ch); 1141 freeset(p, cs); 1142 } else 1143 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1144 } 1145 1146 static int 1147 p_range_cmp(wchar_t c1, wchar_t c2) 1148 { 1149 #ifdef REGEX_LIBC_COLLATE 1150 return __wcollate_range_cmp(c1, c2); 1151 #elif defined(NLS) 1152 /* Copied from libc/collate __wcollate_range_cmp */ 1153 wchar_t s1[2], s2[2]; 1154 1155 s1[0] = c1; 1156 s1[1] = L'\0'; 1157 s2[0] = c2; 1158 s2[1] = L'\0'; 1159 return wcscoll(s1, s2); 1160 #else 1161 char s1[2], s2[2]; 1162 1163 s1[0] = (char)c1; 1164 s1[1] = '\0'; 1165 s2[0] = (char)c2; 1166 s2[1] = '\0'; 1167 return strcoll(s1, s2); 1168 #endif 1169 } 1170 1171 /* 1172 - p_b_term - parse one term of a bracketed character list 1173 == static void p_b_term(struct parse *p, cset *cs); 1174 */ 1175 static void 1176 p_b_term(struct parse *p, cset *cs) 1177 { 1178 char c; 1179 wint_t start, finish; 1180 wint_t i; 1181 #ifdef REGEX_LIBC_COLLATE 1182 struct xlocale_collate *table = 1183 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 1184 #endif 1185 1186 _DIAGASSERT(p != NULL); 1187 _DIAGASSERT(cs != NULL); 1188 1189 /* classify what we've got */ 1190 switch ((MORE()) ? PEEK() : '\0') { 1191 case '[': 1192 c = (MORE2()) ? PEEK2() : '\0'; 1193 break; 1194 case '-': 1195 SETERROR(REG_ERANGE); 1196 return; /* NOTE RETURN */ 1197 default: 1198 c = '\0'; 1199 break; 1200 } 1201 1202 switch (c) { 1203 case ':': /* character class */ 1204 NEXT2(); 1205 (void)REQUIRE(MORE(), REG_EBRACK); 1206 c = PEEK(); 1207 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 1208 p_b_cclass(p, cs); 1209 (void)REQUIRE(MORE(), REG_EBRACK); 1210 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 1211 break; 1212 case '=': /* equivalence class */ 1213 NEXT2(); 1214 (void)REQUIRE(MORE(), REG_EBRACK); 1215 c = PEEK(); 1216 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 1217 p_b_eclass(p, cs); 1218 (void)REQUIRE(MORE(), REG_EBRACK); 1219 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1220 break; 1221 default: /* symbol, ordinary character, or range */ 1222 start = p_b_symbol(p); 1223 if (SEE('-') && MORE2() && PEEK2() != ']') { 1224 /* range */ 1225 NEXT(); 1226 if (EAT('-')) 1227 finish = '-'; 1228 else 1229 finish = p_b_symbol(p); 1230 } else 1231 finish = start; 1232 if (start == finish) 1233 CHadd(p, cs, start); 1234 else { 1235 #ifdef REGEX_LIBC_COLLATE 1236 if (table->__collate_load_error || MB_CUR_MAX > 1) { 1237 #else 1238 if (MB_CUR_MAX > 1) { 1239 #endif 1240 (void)REQUIRE(start <= finish, REG_ERANGE); 1241 CHaddrange(p, cs, start, finish); 1242 } else { 1243 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1244 for (i = 0; i <= UCHAR_MAX; i++) { 1245 if (p_range_cmp(start, i) <= 0 && 1246 p_range_cmp(i, finish) <= 0 ) 1247 CHadd(p, cs, i); 1248 } 1249 } 1250 } 1251 break; 1252 } 1253 } 1254 1255 #ifdef REGEX_GNU_EXTENSIONS 1256 /* 1257 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) 1258 == static int p_b_pseudoclass(struct parse *p, char c) 1259 */ 1260 static int 1261 p_b_pseudoclass(struct parse *p, char c) { 1262 cset *cs; 1263 1264 if ((cs = allocset(p)) == NULL) 1265 return(0); 1266 1267 if (p->g->cflags®_ICASE) 1268 cs->icase = 1; 1269 1270 switch (c) { 1271 case 'W': 1272 cs->invert = 1; 1273 /* FALLTHROUGH */ 1274 case 'w': 1275 p_b_cclass_named(p, cs, "alnum"); 1276 break; 1277 case 'S': 1278 cs->invert = 1; 1279 /* FALLTHROUGH */ 1280 case 's': 1281 p_b_cclass_named(p, cs, "space"); 1282 break; 1283 default: 1284 return(0); 1285 } 1286 1287 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1288 return(1); 1289 } 1290 #endif 1291 1292 /* 1293 - p_b_cclass - parse a character-class name and deal with it 1294 == static void p_b_cclass(struct parse *p, cset *cs); 1295 */ 1296 static void 1297 p_b_cclass(struct parse *p, cset *cs) 1298 { 1299 const char *sp = p->next; 1300 size_t len; 1301 char clname[16]; 1302 1303 while (MORE() && isalpha((uch)PEEK())) 1304 NEXT(); 1305 len = p->next - sp; 1306 if (len >= sizeof(clname) - 1) { 1307 SETERROR(REG_ECTYPE); 1308 return; 1309 } 1310 memcpy(clname, sp, len); 1311 clname[len] = '\0'; 1312 1313 p_b_cclass_named(p, cs, clname); 1314 } 1315 1316 /* 1317 - p_b_cclass_named - deal with a named character class 1318 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); 1319 */ 1320 static void 1321 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { 1322 wctype_t wct; 1323 1324 if ((wct = wctype(clname)) == 0) { 1325 SETERROR(REG_ECTYPE); 1326 return; 1327 } 1328 CHaddtype(p, cs, wct); 1329 } 1330 1331 /* 1332 - p_b_eclass - parse an equivalence-class name and deal with it 1333 == static void p_b_eclass(struct parse *p, cset *cs); 1334 * 1335 * This implementation is incomplete. xxx 1336 */ 1337 static void 1338 p_b_eclass(struct parse *p, cset *cs) 1339 { 1340 wint_t c; 1341 1342 _DIAGASSERT(p != NULL); 1343 _DIAGASSERT(cs != NULL); 1344 1345 c = p_b_coll_elem(p, '='); 1346 CHadd(p, cs, c); 1347 } 1348 1349 /* 1350 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1351 == static wint_t p_b_symbol(struct parse *p); 1352 */ 1353 static wint_t /* value of symbol */ 1354 p_b_symbol(struct parse *p) 1355 { 1356 wint_t value; 1357 1358 _DIAGASSERT(p != NULL); 1359 1360 (void)REQUIRE(MORE(), REG_EBRACK); 1361 if (!EATTWO('[', '.')) 1362 return(WGETNEXT()); 1363 1364 /* collating symbol */ 1365 value = p_b_coll_elem(p, '.'); 1366 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1367 return(value); 1368 } 1369 1370 /* 1371 - p_b_coll_elem - parse a collating-element name and look it up 1372 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1373 */ 1374 static wint_t /* value of collating element */ 1375 p_b_coll_elem(struct parse *p, 1376 wint_t endc) /* name ended by endc,']' */ 1377 { 1378 const char *sp = p->next; 1379 struct cname *cp; 1380 size_t len; 1381 1382 _DIAGASSERT(p != NULL); 1383 1384 while (MORE() && !SEETWO(endc, ']')) 1385 NEXT(); 1386 if (!MORE()) { 1387 SETERROR(REG_EBRACK); 1388 return(0); 1389 } 1390 len = p->next - sp; 1391 for (cp = cnames; cp->name != NULL; cp++) 1392 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1393 return(cp->code); /* known name */ 1394 #ifdef NLS 1395 mbstate_t mbs; 1396 wchar_t wc; 1397 size_t clen; 1398 1399 memset(&mbs, 0, sizeof(mbs)); 1400 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1401 return (wc); /* single character */ 1402 else if (clen == (size_t)-1 || clen == (size_t)-2) 1403 SETERROR(REG_ILLSEQ); 1404 else 1405 SETERROR(REG_ECOLLATE); /* neither */ 1406 return(0); 1407 #else 1408 if (len == 1) 1409 return *sp; /* single character */ 1410 SETERROR(REG_ECOLLATE); /* neither */ 1411 return 0; 1412 #endif 1413 } 1414 1415 /* 1416 - may_escape - determine whether 'ch' is escape-able in the current context 1417 == static int may_escape(struct parse *p, const wint_t ch) 1418 */ 1419 static bool 1420 may_escape(struct parse *p, const wint_t ch) 1421 { 1422 1423 if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1424 return (true); 1425 if (iswalpha(ch) || ch == '\'' || ch == '`') 1426 return (false); 1427 return (true); 1428 #ifdef NOTYET 1429 /* 1430 * Build a whitelist of characters that may be escaped to produce an 1431 * ordinary in the current context. This assumes that these have not 1432 * been otherwise interpreted as a special character. Escaping an 1433 * ordinary character yields undefined results according to 1434 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1435 * advantage of this and use escaped ordinary characters to provide 1436 * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1437 */ 1438 switch(ch) { 1439 case '|': 1440 case '+': 1441 case '?': 1442 /* The above characters may not be escaped in BREs */ 1443 if (!(p->g->cflags®_EXTENDED)) 1444 return (false); 1445 /* Fallthrough */ 1446 case '(': 1447 case ')': 1448 case '{': 1449 case '}': 1450 case '.': 1451 case '[': 1452 case ']': 1453 case '\\': 1454 case '*': 1455 case '^': 1456 case '$': 1457 return (true); 1458 default: 1459 return (false); 1460 } 1461 #endif 1462 } 1463 1464 /* 1465 - othercase - return the case counterpart of an alphabetic 1466 == static wint_t othercase(wint_t ch); 1467 */ 1468 static wint_t /* if no counterpart, return ch */ 1469 othercase(wint_t ch) 1470 { 1471 assert(iswalpha(ch)); 1472 if (iswupper(ch)) 1473 return(towlower(ch)); 1474 else if (iswlower(ch)) 1475 return(towupper(ch)); 1476 else /* peculiar, but could happen */ 1477 return(ch); 1478 } 1479 1480 /* 1481 - bothcases - emit a dualcase version of a two-case character 1482 == static void bothcases(struct parse *p, wint_t ch); 1483 * 1484 * Boy, is this implementation ever a kludge... 1485 */ 1486 static void 1487 bothcases(struct parse *p, wint_t ch) 1488 { 1489 const char *oldnext = p->next; 1490 const char *oldend = p->end; 1491 char bracket[3 + MB_LEN_MAX]; 1492 size_t n; 1493 1494 _DIAGASSERT(p != NULL); 1495 1496 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1497 p->next = bracket; 1498 #ifdef NLS 1499 mbstate_t mbs; 1500 memset(&mbs, 0, sizeof(mbs)); 1501 n = wcrtomb(bracket, ch, &mbs); 1502 assert(n != (size_t)-1); 1503 #else 1504 n = 0; 1505 bracket[n++] = ch; 1506 #endif 1507 bracket[n] = ']'; 1508 bracket[n + 1] = '\0'; 1509 p->end = bracket+n+1; 1510 p_bracket(p); 1511 assert(p->next == p->end); 1512 p->next = oldnext; 1513 p->end = oldend; 1514 } 1515 1516 /* 1517 - ordinary - emit an ordinary character 1518 == static void ordinary(struct parse *p, wint_t ch); 1519 */ 1520 static void 1521 ordinary(struct parse *p, wint_t ch) 1522 { 1523 cset *cs; 1524 1525 _DIAGASSERT(p != NULL); 1526 1527 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1528 bothcases(p, ch); 1529 else if ((wint_t)(ch & OPDMASK) == ch) 1530 EMIT(OCHAR, (size_t)ch); 1531 else { 1532 /* 1533 * Kludge: character is too big to fit into an OCHAR operand. 1534 * Emit a singleton set. 1535 */ 1536 if ((cs = allocset(p)) == NULL) 1537 return; 1538 CHadd(p, cs, ch); 1539 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1540 } 1541 } 1542 1543 /* 1544 - nonnewline - emit REG_NEWLINE version of OANY 1545 == static void nonnewline(struct parse *p); 1546 * 1547 * Boy, is this implementation ever a kludge... 1548 */ 1549 static void 1550 nonnewline(struct parse *p) 1551 { 1552 const char *oldnext = p->next; 1553 const char *oldend = p->end; 1554 char bracket[4]; 1555 1556 _DIAGASSERT(p != NULL); 1557 1558 p->next = bracket; 1559 p->end = bracket+3; 1560 bracket[0] = '^'; 1561 bracket[1] = '\n'; 1562 bracket[2] = ']'; 1563 bracket[3] = '\0'; 1564 p_bracket(p); 1565 assert(p->next == bracket+3); 1566 p->next = oldnext; 1567 p->end = oldend; 1568 } 1569 1570 /* 1571 - repeat - generate code for a bounded repetition, recursively if needed 1572 == static void repeat(struct parse *p, sopno start, int from, int to); 1573 */ 1574 static void 1575 repeat(struct parse *p, 1576 sopno start, /* operand from here to end of strip */ 1577 int from, /* repeated from this number */ 1578 int to) /* to this number of times (maybe INFINITY) */ 1579 { 1580 sopno finish = HERE(); 1581 # define N 2 1582 # define INF 3 1583 # define REP(f, t) ((f)*8 + (t)) 1584 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1585 sopno copy; 1586 1587 _DIAGASSERT(p != NULL); 1588 1589 if (p->error != 0) /* head off possible runaway recursion */ 1590 return; 1591 1592 assert(from <= to); 1593 1594 switch (REP(MAP(from), MAP(to))) { 1595 case REP(0, 0): /* must be user doing this */ 1596 DROP(finish-start); /* drop the operand */ 1597 break; 1598 case REP(0, 1): /* as x{1,1}? */ 1599 case REP(0, N): /* as x{1,n}? */ 1600 case REP(0, INF): /* as x{1,}? */ 1601 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1602 INSERT(OCH_, start); /* offset is wrong... */ 1603 repeat(p, start+1, 1, to); 1604 ASTERN(OOR1, start); 1605 AHEAD(start); /* ... fix it */ 1606 EMIT(OOR2, 0); 1607 AHEAD(THERE()); 1608 ASTERN(O_CH, THERETHERE()); 1609 break; 1610 case REP(1, 1): /* trivial case */ 1611 /* done */ 1612 break; 1613 case REP(1, N): /* as x?x{1,n-1} */ 1614 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1615 INSERT(OCH_, start); 1616 ASTERN(OOR1, start); 1617 AHEAD(start); 1618 EMIT(OOR2, 0); /* offset very wrong... */ 1619 AHEAD(THERE()); /* ...so fix it */ 1620 ASTERN(O_CH, THERETHERE()); 1621 copy = dupl(p, start+1, finish+1); 1622 assert(copy == finish+4); 1623 repeat(p, copy, 1, to-1); 1624 break; 1625 case REP(1, INF): /* as x+ */ 1626 INSERT(OPLUS_, start); 1627 ASTERN(O_PLUS, start); 1628 break; 1629 case REP(N, N): /* as xx{m-1,n-1} */ 1630 copy = dupl(p, start, finish); 1631 repeat(p, copy, from-1, to-1); 1632 break; 1633 case REP(N, INF): /* as xx{n-1,INF} */ 1634 copy = dupl(p, start, finish); 1635 repeat(p, copy, from-1, to); 1636 break; 1637 default: /* "can't happen" */ 1638 SETERROR(REG_ASSERT); /* just in case */ 1639 break; 1640 } 1641 } 1642 1643 /* 1644 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1645 - character from the parse struct, signals a REG_ILLSEQ error if the 1646 - character can't be converted. Returns the number of bytes consumed. 1647 */ 1648 static wint_t 1649 wgetnext(struct parse *p) 1650 { 1651 #ifdef NLS 1652 mbstate_t mbs; 1653 wchar_t wc; 1654 size_t n; 1655 1656 memset(&mbs, 0, sizeof(mbs)); 1657 n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs); 1658 if (n == (size_t)-1 || n == (size_t)-2) { 1659 SETERROR(REG_ILLSEQ); 1660 return (0); 1661 } 1662 if (n == 0) 1663 n = 1; 1664 p->next += n; 1665 return wc; 1666 #else 1667 return *p->next++; 1668 #endif 1669 } 1670 1671 /* 1672 - seterr - set an error condition 1673 == static int seterr(struct parse *p, int e); 1674 */ 1675 static int /* useless but makes type checking happy */ 1676 seterr(struct parse *p, int e) 1677 { 1678 1679 _DIAGASSERT(p != NULL); 1680 1681 if (p->error == 0) /* keep earliest error condition */ 1682 p->error = e; 1683 p->next = nuls; /* try to bring things to a halt */ 1684 p->end = nuls; 1685 return(0); /* make the return value well-defined */ 1686 } 1687 1688 /* 1689 - allocset - allocate a set of characters for [] 1690 == static cset *allocset(struct parse *p); 1691 */ 1692 static cset * 1693 allocset(struct parse *p) 1694 { 1695 cset *cs, *ncs; 1696 1697 _DIAGASSERT(p != NULL); 1698 1699 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1700 if (ncs == NULL) { 1701 SETERROR(REG_ESPACE); 1702 return (NULL); 1703 } 1704 p->g->sets = ncs; 1705 cs = &p->g->sets[p->g->ncsets++]; 1706 memset(cs, 0, sizeof(*cs)); 1707 1708 return(cs); 1709 } 1710 1711 /* 1712 - freeset - free a now-unused set 1713 == static void freeset(struct parse *p, cset *cs); 1714 */ 1715 static void 1716 freeset(struct parse *p, cset *cs) 1717 { 1718 cset *top; 1719 1720 _DIAGASSERT(p != NULL); 1721 _DIAGASSERT(cs != NULL); 1722 1723 top = &p->g->sets[p->g->ncsets]; 1724 1725 free(cs->wides); 1726 free(cs->ranges); 1727 free(cs->types); 1728 memset(cs, 0, sizeof(*cs)); 1729 if (cs == top-1) /* recover only the easy case */ 1730 p->g->ncsets--; 1731 } 1732 1733 /* 1734 - singleton - Determine whether a set contains only one character, 1735 - returning it if so, otherwise returning OUT. 1736 */ 1737 static wint_t 1738 singleton(cset *cs) 1739 { 1740 wint_t i, s, n; 1741 1742 for (i = n = 0; i < NC; i++) 1743 if (CHIN(cs, i)) { 1744 n++; 1745 s = i; 1746 } 1747 if (n == 1) 1748 return (s); 1749 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1750 cs->icase == 0) 1751 return (cs->wides[0]); 1752 /* Don't bother handling the other cases. */ 1753 return (OUT); 1754 } 1755 1756 /* 1757 - CHadd - add character to character set. 1758 */ 1759 static void 1760 CHadd(struct parse *p, cset *cs, wint_t ch) 1761 { 1762 wint_t nch, *newwides; 1763 1764 _DIAGASSERT(p != NULL); 1765 _DIAGASSERT(cs != NULL); 1766 1767 if ((unsigned)ch < NC) 1768 cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7); 1769 else { 1770 newwides = reallocarray(cs->wides, cs->nwides + 1, 1771 sizeof(*cs->wides)); 1772 if (newwides == NULL) { 1773 SETERROR(REG_ESPACE); 1774 return; 1775 } 1776 cs->wides = newwides; 1777 cs->wides[cs->nwides++] = ch; 1778 } 1779 if (cs->icase) { 1780 if ((unsigned)(nch = towlower(ch)) < NC) 1781 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1782 if ((unsigned)(nch = towupper(ch)) < NC) 1783 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1784 } 1785 } 1786 1787 /* 1788 - CHaddrange - add all characters in the range [min,max] to a character set. 1789 */ 1790 static void 1791 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1792 { 1793 crange *newranges; 1794 1795 _DIAGASSERT(p != NULL); 1796 _DIAGASSERT(cs != NULL); 1797 1798 for (; min < NC && min <= max; min++) 1799 CHadd(p, cs, min); 1800 if (min >= max) 1801 return; 1802 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1803 sizeof(*cs->ranges)); 1804 if (newranges == NULL) { 1805 SETERROR(REG_ESPACE); 1806 return; 1807 } 1808 cs->ranges = newranges; 1809 cs->ranges[cs->nranges].min = min; 1810 cs->ranges[cs->nranges].max = max; 1811 cs->nranges++; 1812 } 1813 1814 /* 1815 - CHaddtype - add all characters of a certain type to a character set. 1816 */ 1817 static void 1818 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1819 { 1820 wint_t i; 1821 wctype_t *newtypes; 1822 1823 _DIAGASSERT(p != NULL); 1824 _DIAGASSERT(cs != NULL); 1825 1826 for (i = 0; i < NC; i++) 1827 if (iswctype(i, wct)) 1828 CHadd(p, cs, i); 1829 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1830 sizeof(*cs->types)); 1831 if (newtypes == NULL) { 1832 SETERROR(REG_ESPACE); 1833 return; 1834 } 1835 cs->types = newtypes; 1836 cs->types[cs->ntypes++] = wct; 1837 } 1838 1839 /* 1840 - dupl - emit a duplicate of a bunch of sops 1841 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1842 */ 1843 static sopno /* start of duplicate */ 1844 dupl(struct parse *p, 1845 sopno start, /* from here */ 1846 sopno finish) /* to this less one */ 1847 { 1848 sopno ret = HERE(); 1849 sopno len = finish - start; 1850 1851 _DIAGASSERT(p != NULL); 1852 1853 assert(finish >= start); 1854 if (len == 0) 1855 return(ret); 1856 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1857 return(ret); 1858 (void) memcpy(p->strip + p->slen, 1859 p->strip + start, len * sizeof(*p->strip)); 1860 p->slen += len; 1861 return(ret); 1862 } 1863 1864 /* 1865 - doemit - emit a strip operator 1866 == static void doemit(struct parse *p, sop op, size_t opnd); 1867 * 1868 * It might seem better to implement this as a macro with a function as 1869 * hard-case backup, but it's just too big and messy unless there are 1870 * some changes to the data structures. Maybe later. 1871 */ 1872 static void 1873 doemit(struct parse *p, sop op, size_t opnd) 1874 { 1875 /* avoid making error situations worse */ 1876 if (p->error != 0) 1877 return; 1878 1879 _DIAGASSERT(p != NULL); 1880 1881 /* deal with oversize operands ("can't happen", more or less) */ 1882 assert(opnd < 1<<OPSHIFT); 1883 1884 /* deal with undersized strip */ 1885 if (p->slen >= p->ssize) 1886 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1887 return; 1888 1889 /* finally, it's all reduced to the easy case */ 1890 p->strip[p->slen++] = (sopno)SOP(op, opnd); 1891 } 1892 1893 /* 1894 - doinsert - insert a sop into the strip 1895 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1896 */ 1897 static void 1898 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1899 { 1900 sopno sn; 1901 sop s; 1902 int i; 1903 1904 _DIAGASSERT(p != NULL); 1905 1906 /* avoid making error situations worse */ 1907 if (p->error != 0) 1908 return; 1909 1910 sn = HERE(); 1911 EMIT(op, opnd); /* do checks, ensure space */ 1912 assert(HERE() == sn+1); 1913 s = p->strip[sn]; 1914 1915 /* adjust paren pointers */ 1916 assert(pos > 0); 1917 for (i = 1; i < NPAREN; i++) { 1918 if (p->pbegin[i] >= pos) { 1919 p->pbegin[i]++; 1920 } 1921 if (p->pend[i] >= pos) { 1922 p->pend[i]++; 1923 } 1924 } 1925 1926 memmove(&p->strip[pos+1], &p->strip[pos], 1927 (HERE()-pos-1)*sizeof(*p->strip)); 1928 p->strip[pos] = s; 1929 } 1930 1931 /* 1932 - dofwd - complete a forward reference 1933 == static void dofwd(struct parse *p, sopno pos, sop value); 1934 */ 1935 static void 1936 dofwd(struct parse *p, sopno pos, sop value) 1937 { 1938 1939 _DIAGASSERT(p != NULL); 1940 1941 /* avoid making error situations worse */ 1942 if (p->error != 0) 1943 return; 1944 1945 assert(value < 1<<OPSHIFT); 1946 p->strip[pos] = OP(p->strip[pos]) | value; 1947 } 1948 1949 /* 1950 - enlarge - enlarge the strip 1951 == static int enlarge(struct parse *p, sopno size); 1952 */ 1953 static int 1954 enlarge(struct parse *p, sopno size) 1955 { 1956 sop *sp; 1957 1958 _DIAGASSERT(p != NULL); 1959 1960 if (p->ssize >= size) 1961 return 1; 1962 1963 sp = reallocarray(p->strip, size, sizeof(*p->strip)); 1964 if (sp == NULL) { 1965 SETERROR(REG_ESPACE); 1966 return 0; 1967 } 1968 p->strip = sp; 1969 p->ssize = size; 1970 return 1; 1971 } 1972 1973 /* 1974 - stripsnug - compact the strip 1975 == static void stripsnug(struct parse *p, struct re_guts *g); 1976 */ 1977 static void 1978 stripsnug(struct parse *p, struct re_guts *g) 1979 { 1980 1981 _DIAGASSERT(p != NULL); 1982 _DIAGASSERT(g != NULL); 1983 1984 g->nstates = p->slen; 1985 g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip)); 1986 if (g->strip == NULL) { 1987 SETERROR(REG_ESPACE); 1988 g->strip = p->strip; 1989 } 1990 } 1991 1992 /* 1993 - findmust - fill in must and mlen with longest mandatory literal string 1994 == static void findmust(struct parse *p, struct re_guts *g); 1995 * 1996 * This algorithm could do fancy things like analyzing the operands of | 1997 * for common subsequences. Someday. This code is simple and finds most 1998 * of the interesting cases. 1999 * 2000 * Note that must and mlen got initialized during setup. 2001 */ 2002 static void 2003 findmust(struct parse *p, struct re_guts *g) 2004 { 2005 sop *scan; 2006 sop *start = NULL; 2007 sop *newstart = NULL; 2008 sopno newlen; 2009 sop s; 2010 char *cp; 2011 int offset; 2012 mbstate_t mbs; 2013 2014 _DIAGASSERT(p != NULL); 2015 _DIAGASSERT(g != NULL); 2016 2017 /* avoid making error situations worse */ 2018 if (p->error != 0) 2019 return; 2020 2021 #ifdef notyet 2022 /* 2023 * It's not generally safe to do a ``char'' substring search on 2024 * multibyte character strings, but it's safe for at least 2025 * UTF-8 (see RFC 3629). 2026 */ 2027 if (MB_CUR_MAX > 1 && 2028 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 2029 return; 2030 #endif 2031 2032 /* find the longest OCHAR sequence in strip */ 2033 newlen = 0; 2034 offset = 0; 2035 g->moffset = 0; 2036 scan = g->strip + 1; 2037 do { 2038 s = *scan++; 2039 switch (OP(s)) { 2040 case OCHAR: /* sequence member */ 2041 if (newlen == 0) { /* new sequence */ 2042 memset(&mbs, 0, sizeof(mbs)); 2043 newstart = scan - 1; 2044 } 2045 #ifdef NLS 2046 char buf[MB_LEN_MAX]; 2047 size_t clen = wcrtomb(buf, (int)OPND(s), &mbs); 2048 if (clen == (size_t)-1) 2049 goto toohard; 2050 newlen += (sopno)clen; 2051 #else 2052 newlen++; 2053 #endif 2054 break; 2055 case OPLUS_: /* things that don't break one */ 2056 case OLPAREN: 2057 case ORPAREN: 2058 break; 2059 case OQUEST_: /* things that must be skipped */ 2060 case OCH_: 2061 offset = altoffset(scan, offset); 2062 scan--; 2063 do { 2064 scan += OPND(s); 2065 s = *scan; 2066 /* assert() interferes w debug printouts */ 2067 if (OP(s) != O_QUEST && 2068 OP(s) != O_CH && OP(s) != OOR2) { 2069 g->iflags |= BAD; 2070 return; 2071 } 2072 } while (OP(s) != O_QUEST && OP(s) != O_CH); 2073 /* FALLTHROUGH */ 2074 case OBOW: /* things that break a sequence */ 2075 case OEOW: 2076 case OBOL: 2077 case OEOL: 2078 case OBOS: 2079 case OEOS: 2080 case OWBND: 2081 case ONWBND: 2082 case O_QUEST: 2083 case O_CH: 2084 case OEND: 2085 if (newlen > (sopno)g->mlen) { /* ends one */ 2086 start = newstart; 2087 g->mlen = newlen; 2088 if (offset > -1) { 2089 g->moffset += offset; 2090 offset = newlen; 2091 } else 2092 g->moffset = offset; 2093 } else { 2094 if (offset > -1) 2095 offset += newlen; 2096 } 2097 newlen = 0; 2098 break; 2099 case OANY: 2100 if (newlen > (sopno)g->mlen) { /* ends one */ 2101 start = newstart; 2102 g->mlen = newlen; 2103 if (offset > -1) { 2104 g->moffset += offset; 2105 offset = newlen; 2106 } else 2107 g->moffset = offset; 2108 } else { 2109 if (offset > -1) 2110 offset += newlen; 2111 } 2112 if (offset > -1) 2113 offset++; 2114 newlen = 0; 2115 break; 2116 case OANYOF: /* may or may not invalidate offset */ 2117 /* First, everything as OANY */ 2118 if (newlen > (sopno)g->mlen) { /* ends one */ 2119 start = newstart; 2120 g->mlen = newlen; 2121 if (offset > -1) { 2122 g->moffset += offset; 2123 offset = newlen; 2124 } else 2125 g->moffset = offset; 2126 } else { 2127 if (offset > -1) 2128 offset += newlen; 2129 } 2130 if (offset > -1) 2131 offset++; 2132 newlen = 0; 2133 break; 2134 #ifdef NLS 2135 toohard:/*FALLTHROUGH*/ 2136 #endif 2137 default: 2138 /* Anything here makes it impossible or too hard 2139 * to calculate the offset -- so we give up; 2140 * save the last known good offset, in case the 2141 * must sequence doesn't occur later. 2142 */ 2143 if (newlen > (sopno)g->mlen) { /* ends one */ 2144 start = newstart; 2145 g->mlen = newlen; 2146 if (offset > -1) 2147 g->moffset += offset; 2148 else 2149 g->moffset = offset; 2150 } 2151 offset = -1; 2152 newlen = 0; 2153 break; 2154 } 2155 } while (OP(s) != OEND); 2156 2157 if (g->mlen == 0) { /* there isn't one */ 2158 g->moffset = -1; 2159 return; 2160 } 2161 2162 /* turn it into a character string */ 2163 g->must = malloc((size_t)g->mlen + 1); 2164 if (g->must == NULL) { /* argh; just forget it */ 2165 g->mlen = 0; 2166 g->moffset = -1; 2167 return; 2168 } 2169 cp = g->must; 2170 scan = start; 2171 memset(&mbs, 0, sizeof(mbs)); 2172 while (cp < g->must + g->mlen) { 2173 while (OP(s = *scan++) != OCHAR) 2174 continue; 2175 #ifdef NLS 2176 size_t clen = wcrtomb(cp, (int)OPND(s), &mbs); 2177 assert(clen != (size_t)-1); 2178 cp += clen; 2179 #else 2180 *cp++ = OPND(s); 2181 #endif 2182 } 2183 assert(cp == g->must + g->mlen); 2184 *cp++ = '\0'; /* just on general principles */ 2185 } 2186 2187 /* 2188 - altoffset - choose biggest offset among multiple choices 2189 == static int altoffset(sop *scan, int offset); 2190 * 2191 * Compute, recursively if necessary, the largest offset among multiple 2192 * re paths. 2193 */ 2194 static int 2195 altoffset(sop *scan, int offset) 2196 { 2197 int largest; 2198 int try; 2199 sop s; 2200 2201 _DIAGASSERT(scan != NULL); 2202 2203 /* If we gave up already on offsets, return */ 2204 if (offset == -1) 2205 return -1; 2206 2207 largest = 0; 2208 try = 0; 2209 s = *scan++; 2210 while (OP(s) != O_QUEST && OP(s) != O_CH) { 2211 switch (OP(s)) { 2212 case OOR1: 2213 if (try > largest) 2214 largest = try; 2215 try = 0; 2216 break; 2217 case OQUEST_: 2218 case OCH_: 2219 try = altoffset(scan, try); 2220 if (try == -1) 2221 return -1; 2222 scan--; 2223 do { 2224 scan += OPND(s); 2225 s = *scan; 2226 if (OP(s) != O_QUEST && 2227 OP(s) != O_CH && OP(s) != OOR2) 2228 return -1; 2229 } while (OP(s) != O_QUEST && OP(s) != O_CH); 2230 /* We must skip to the next position, or we'll 2231 * leave altoffset() too early. 2232 */ 2233 scan++; 2234 break; 2235 case OANYOF: 2236 case OCHAR: 2237 case OANY: 2238 try++; 2239 /*FALLTHROUGH*/ 2240 case OBOW: 2241 case OEOW: 2242 case OWBND: 2243 case ONWBND: 2244 case OLPAREN: 2245 case ORPAREN: 2246 case OOR2: 2247 break; 2248 default: 2249 try = -1; 2250 break; 2251 } 2252 if (try == -1) 2253 return -1; 2254 s = *scan++; 2255 } 2256 2257 if (try > largest) 2258 largest = try; 2259 2260 return largest+offset; 2261 } 2262 2263 /* 2264 - computejumps - compute char jumps for BM scan 2265 == static void computejumps(struct parse *p, struct re_guts *g); 2266 * 2267 * This algorithm assumes g->must exists and is has size greater than 2268 * zero. It's based on the algorithm found on Computer Algorithms by 2269 * Sara Baase. 2270 * 2271 * A char jump is the number of characters one needs to jump based on 2272 * the value of the character from the text that was mismatched. 2273 */ 2274 static void 2275 computejumps(struct parse *p, struct re_guts *g) 2276 { 2277 int ch; 2278 size_t mindex; 2279 2280 _DIAGASSERT(p != NULL); 2281 _DIAGASSERT(g != NULL); 2282 2283 /* Avoid making errors worse */ 2284 if (p->error != 0) 2285 return; 2286 2287 g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump)); 2288 if (g->charjump == NULL) /* Not a fatal error */ 2289 return; 2290 /* Adjust for signed chars, if necessary */ 2291 g->charjump = &g->charjump[-(CHAR_MIN)]; 2292 2293 /* If the character does not exist in the pattern, the jump 2294 * is equal to the number of characters in the pattern. 2295 */ 2296 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 2297 g->charjump[ch] = g->mlen; 2298 2299 /* If the character does exist, compute the jump that would 2300 * take us to the last character in the pattern equal to it 2301 * (notice that we match right to left, so that last character 2302 * is the first one that would be matched). 2303 */ 2304 for (mindex = 0; mindex < g->mlen; mindex++) 2305 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 2306 } 2307 2308 /* 2309 - computematchjumps - compute match jumps for BM scan 2310 == static void computematchjumps(struct parse *p, struct re_guts *g); 2311 * 2312 * This algorithm assumes g->must exists and is has size greater than 2313 * zero. It's based on the algorithm found on Computer Algorithms by 2314 * Sara Baase. 2315 * 2316 * A match jump is the number of characters one needs to advance based 2317 * on the already-matched suffix. 2318 * Notice that all values here are minus (g->mlen-1), because of the way 2319 * the search algorithm works. 2320 */ 2321 static void 2322 computematchjumps(struct parse *p, struct re_guts *g) 2323 { 2324 size_t mindex; /* General "must" iterator */ 2325 size_t suffix; /* Keeps track of matching suffix */ 2326 size_t ssuffix; /* Keeps track of suffixes' suffix */ 2327 size_t* pmatches; /* pmatches[k] points to the next i 2328 * such that i+1...mlen is a substring 2329 * of k+1...k+mlen-i-1 2330 */ 2331 2332 _DIAGASSERT(p != NULL); 2333 _DIAGASSERT(g != NULL); 2334 2335 /* Avoid making errors worse */ 2336 if (p->error != 0) 2337 return; 2338 2339 pmatches = calloc(g->mlen, sizeof(*pmatches)); 2340 if (pmatches == NULL) { 2341 g->matchjump = NULL; 2342 return; 2343 } 2344 2345 g->matchjump = calloc(g->mlen, sizeof(*g->matchjump)); 2346 if (g->matchjump == NULL) { /* Not a fatal error */ 2347 free(pmatches); 2348 return; 2349 } 2350 2351 /* Set maximum possible jump for each character in the pattern */ 2352 for (mindex = 0; mindex < g->mlen; mindex++) 2353 g->matchjump[mindex] = 2 * g->mlen - mindex - 1; 2354 2355 /* Compute pmatches[] */ 2356 for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) { 2357 pmatches[mindex] = suffix; 2358 2359 /* If a mismatch is found, interrupting the substring, 2360 * compute the matchjump for that position. If no 2361 * mismatch is found, then a text substring mismatched 2362 * against the suffix will also mismatch against the 2363 * substring. 2364 */ 2365 while (suffix < g->mlen 2366 && g->must[mindex] != g->must[suffix]) { 2367 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2368 g->mlen - mindex - 1); 2369 suffix = pmatches[suffix]; 2370 } 2371 } 2372 2373 /* Compute the matchjump up to the last substring found to jump 2374 * to the beginning of the largest must pattern prefix matching 2375 * it's own suffix. 2376 */ 2377 for (mindex = 0; mindex <= suffix; mindex++) 2378 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2379 g->mlen + suffix - mindex); 2380 2381 ssuffix = pmatches[suffix]; 2382 while (suffix < g->mlen) { 2383 while (suffix <= ssuffix && suffix < g->mlen) { 2384 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2385 g->mlen + ssuffix - suffix); 2386 suffix++; 2387 } 2388 if (suffix < g->mlen) 2389 ssuffix = pmatches[ssuffix]; 2390 } 2391 2392 free(pmatches); 2393 } 2394 2395 /* 2396 - pluscount - count + nesting 2397 == static sopno pluscount(struct parse *p, struct re_guts *g); 2398 */ 2399 static sopno /* nesting depth */ 2400 pluscount(struct parse *p, struct re_guts *g) 2401 { 2402 sop *scan; 2403 sop s; 2404 sopno plusnest = 0; 2405 sopno maxnest = 0; 2406 2407 _DIAGASSERT(p != NULL); 2408 _DIAGASSERT(g != NULL); 2409 2410 if (p->error != 0) 2411 return(0); /* there may not be an OEND */ 2412 2413 scan = g->strip + 1; 2414 do { 2415 s = *scan++; 2416 switch (OP(s)) { 2417 case OPLUS_: 2418 plusnest++; 2419 break; 2420 case O_PLUS: 2421 if (plusnest > maxnest) 2422 maxnest = plusnest; 2423 plusnest--; 2424 break; 2425 } 2426 } while (OP(s) != OEND); 2427 if (plusnest != 0) 2428 g->iflags |= BAD; 2429 return(maxnest); 2430 } 2431