1 /* $NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-3-Clause 5 * 6 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 7 * Copyright (c) 1992, 1993, 1994 8 * The Regents of the University of California. All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * All rights reserved. 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * Henry Spencer. 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 1. Redistributions of source code must retain the above copyright 22 * notice, this list of conditions and the following disclaimer. 23 * 2. Redistributions in binary form must reproduce the above copyright 24 * notice, this list of conditions and the following disclaimer in the 25 * documentation and/or other materials provided with the distribution. 26 * 3. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 41 * 42 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 43 */ 44 45 #if HAVE_NBTOOL_CONFIG_H 46 #include "nbtool_config.h" 47 #endif 48 49 #include <sys/cdefs.h> 50 #if 0 51 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 52 __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $"); 53 #endif 54 __RCSID("$NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $"); 55 56 #define _OPENBSD_SOURCE 57 58 #ifndef LIBHACK 59 #define REGEX_GNU_EXTENSIONS 60 61 #include "namespace.h" 62 #endif 63 #include <sys/types.h> 64 #include <stdio.h> 65 #include <string.h> 66 #include <ctype.h> 67 #include <limits.h> 68 #include <stdlib.h> 69 #include <regex.h> 70 #include <stdbool.h> 71 72 #if defined(__weak_alias) && !defined(LIBHACK) 73 __weak_alias(regcomp,_regcomp) 74 #endif 75 76 #ifdef REGEX_LIBC_COLLATE 77 #include "collate.h" 78 #endif 79 80 #include "utils.h" 81 #include "regex2.h" 82 83 #include "cname.h" 84 85 /* 86 * Branching context, used to keep track of branch state for all of the branch- 87 * aware functions. In addition to keeping track of branch positions for the 88 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 89 * detection of whether ^ is acting as an anchor or being used erroneously and 90 * also for whether we're in a sub-expression or not. 91 */ 92 struct branchc { 93 sopno start; 94 sopno back; 95 sopno fwd; 96 97 int nbranch; 98 int nchain; 99 bool outer; 100 bool terminate; 101 }; 102 103 /* 104 * parse structure, passed up and down to avoid global variables and 105 * other clumsinesses 106 */ 107 struct parse { 108 const char *next; /* next character in RE */ 109 const char *end; /* end of string (-> NUL normally) */ 110 int error; /* has an error been seen? */ 111 int gnuext; 112 sop *strip; /* malloced strip */ 113 sopno ssize; /* malloced strip size (allocated) */ 114 sopno slen; /* malloced strip length (used) */ 115 size_t ncsalloc; /* number of csets allocated */ 116 struct re_guts *g; 117 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 118 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 119 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 120 bool allowbranch; /* can this expression branch? */ 121 bool bre; /* convenience; is this a BRE? */ 122 int pflags; /* other parsing flags -- legacy escapes? */ 123 bool (*parse_expr)(struct parse *, struct branchc *); 124 void (*pre_parse)(struct parse *, struct branchc *); 125 void (*post_parse)(struct parse *, struct branchc *); 126 }; 127 128 #define PFLAG_LEGACY_ESC 0x00000001 129 130 /* ========= begin header generated by ./mkh ========= */ 131 #ifdef __cplusplus 132 extern "C" { 133 #endif 134 135 /* === regcomp.c === */ 136 static bool p_ere_exp(struct parse *p, struct branchc *bc); 137 static void p_str(struct parse *p); 138 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 139 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 140 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 141 static bool p_branch_empty(struct parse *p, struct branchc *bc); 142 static bool p_branch_do(struct parse *p, struct branchc *bc); 143 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 144 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 145 static void p_re(struct parse *p, int end1, int end2); 146 static bool p_simp_re(struct parse *p, struct branchc *bc); 147 static int p_count(struct parse *p); 148 static void p_bracket(struct parse *p); 149 static int p_range_cmp(wchar_t c1, wchar_t c2); 150 static void p_b_term(struct parse *p, cset *cs); 151 #ifdef REGEX_GNU_EXTENSIONS 152 static int p_b_pseudoclass(struct parse *p, char c); 153 #endif 154 static void p_b_cclass(struct parse *p, cset *cs); 155 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); 156 static void p_b_eclass(struct parse *p, cset *cs); 157 static wint_t p_b_symbol(struct parse *p); 158 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 159 static bool may_escape(struct parse *p, const wint_t ch); 160 static wint_t othercase(wint_t ch); 161 static void bothcases(struct parse *p, wint_t ch); 162 static void ordinary(struct parse *p, wint_t ch); 163 static void nonnewline(struct parse *p); 164 static void repeat(struct parse *p, sopno start, int from, int to); 165 static int seterr(struct parse *p, int e); 166 static cset *allocset(struct parse *p); 167 static void freeset(struct parse *p, cset *cs); 168 static void CHadd(struct parse *p, cset *cs, wint_t ch); 169 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 170 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 171 static wint_t singleton(cset *cs); 172 static sopno dupl(struct parse *p, sopno start, sopno finish); 173 static void doemit(struct parse *p, sop op, size_t opnd); 174 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 175 static void dofwd(struct parse *p, sopno pos, sop value); 176 static int enlarge(struct parse *p, sopno size); 177 static void stripsnug(struct parse *p, struct re_guts *g); 178 static void findmust(struct parse *p, struct re_guts *g); 179 static int altoffset(sop *scan, int offset); 180 static void computejumps(struct parse *p, struct re_guts *g); 181 static void computematchjumps(struct parse *p, struct re_guts *g); 182 static sopno pluscount(struct parse *p, struct re_guts *g); 183 static wint_t wgetnext(struct parse *p); 184 185 #ifdef __cplusplus 186 } 187 #endif 188 /* ========= end header generated by ./mkh ========= */ 189 190 static char nuls[10]; /* place to point scanner in event of error */ 191 192 /* 193 * macros for use with parse structure 194 * BEWARE: these know that the parse structure is named `p' !!! 195 */ 196 #define PEEK() (*p->next) 197 #define PEEK2() (*(p->next+1)) 198 #define MORE() (p->next < p->end) 199 #define MORE2() (p->next+1 < p->end) 200 #define SEE(c) (MORE() && PEEK() == (c)) 201 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 202 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 203 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 204 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 205 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) 206 #define NEXT() (p->next++) 207 #define NEXT2() (p->next += 2) 208 #define NEXTn(n) (p->next += (n)) 209 #define GETNEXT() (*p->next++) 210 #define WGETNEXT() wgetnext(p) 211 #define SETERROR(e) seterr(p, (e)) 212 #define REQUIRE(co, e) ((co) || SETERROR(e)) 213 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 214 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 215 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 216 #define EMIT(op, sopnd) doemit(p, (op), (sopnd)) 217 #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos) 218 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 219 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 220 #define HERE() (p->slen) 221 #define THERE() (p->slen - 1) 222 #define THERETHERE() (p->slen - 2) 223 #define DROP(n) (p->slen -= (n)) 224 225 /* Macro used by computejump()/computematchjump() */ 226 #ifndef MIN 227 #define MIN(a,b) ((a)<(b)?(a):(b)) 228 #endif 229 230 #ifndef NLS 231 static const struct { 232 const char *name; 233 int (*func)(int); 234 } wctypes[] = { 235 #define ADD(x) { .name = # x, .func = is ## x } 236 ADD(alnum), 237 ADD(alpha), 238 ADD(blank), 239 ADD(cntrl), 240 ADD(digit), 241 ADD(graph), 242 ADD(lower), 243 ADD(print), 244 ADD(punct), 245 ADD(space), 246 ADD(upper), 247 ADD(xdigit), 248 #undef ADD 249 }; 250 251 wctype_t 252 __regex_wctype(const char *str) 253 { 254 for (size_t i = 0; i < __arraycount(wctypes); i++) { 255 if (strcmp(wctypes[i].name, str) == 0) 256 return (wctype_t)(i + 1); 257 } 258 return (wctype_t)0; 259 } 260 261 int 262 __regex_iswctype(wint_t c, wctype_t ct) 263 { 264 if (ct == 0) 265 return 0; 266 return (*wctypes[ct - 1].func)(c); 267 } 268 #endif 269 270 static int /* 0 success, otherwise REG_something */ 271 regcomp_internal(regex_t * __restrict preg, 272 const char * __restrict pattern, 273 int cflags, int pflags) 274 { 275 struct parse pa; 276 struct re_guts *g; 277 struct parse *p = &pa; 278 int i; 279 size_t len; 280 size_t maxlen; 281 #ifdef REDEBUG 282 # define GOODFLAGS(f) (f) 283 #else 284 # define GOODFLAGS(f) ((f)&~REG_DUMP) 285 #endif 286 287 _DIAGASSERT(preg != NULL); 288 _DIAGASSERT(pattern != NULL); 289 290 cflags = GOODFLAGS(cflags); 291 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 292 return(REG_INVARG); 293 294 if (cflags®_PEND) { 295 if (preg->re_endp < pattern) 296 return(REG_INVARG); 297 len = preg->re_endp - pattern; 298 } else 299 len = strlen(pattern); 300 301 /* do the mallocs early so failure handling is easy */ 302 g = malloc(sizeof(*g)); 303 if (g == NULL) 304 return(REG_ESPACE); 305 /* 306 * Limit the pattern space to avoid a 32-bit overflow on buffer 307 * extension. Also avoid any signed overflow in case of conversion 308 * so make the real limit based on a 31-bit overflow. 309 * 310 * Likely not applicable on 64-bit systems but handle the case 311 * generically (who are we to stop people from using ~715MB+ 312 * patterns?). 313 */ 314 maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3; 315 if (len >= maxlen) { 316 free(g); 317 return(REG_ESPACE); 318 } 319 p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */ 320 assert(p->ssize >= len); 321 322 p->strip = calloc(p->ssize, sizeof(*p->strip)); 323 p->slen = 0; 324 if (p->strip == NULL) { 325 free(g); 326 return(REG_ESPACE); 327 } 328 329 /* set things up */ 330 p->g = g; 331 p->next = pattern; /* convenience; we do not modify it */ 332 p->end = p->next + len; 333 p->error = 0; 334 p->ncsalloc = 0; 335 p->pflags = pflags; 336 for (i = 0; i < NPAREN; i++) { 337 p->pbegin[i] = 0; 338 p->pend[i] = 0; 339 } 340 #ifdef REGEX_GNU_EXTENSIONS 341 if ((cflags & REG_GNU) == 0) { 342 p->gnuext = false; 343 p->allowbranch = (cflags & REG_EXTENDED) != 0; 344 } else 345 p->gnuext = p->allowbranch = true; 346 #else 347 p->gnuext = false; 348 p->allowbranch = (cflags & REG_EXTENDED) != 0; 349 #endif 350 if (cflags & REG_EXTENDED) { 351 p->bre = false; 352 p->parse_expr = p_ere_exp; 353 p->pre_parse = NULL; 354 p->post_parse = NULL; 355 } else { 356 p->bre = true; 357 p->parse_expr = p_simp_re; 358 p->pre_parse = p_bre_pre_parse; 359 p->post_parse = p_bre_post_parse; 360 } 361 g->sets = NULL; 362 g->ncsets = 0; 363 g->cflags = cflags; 364 g->iflags = 0; 365 g->nbol = 0; 366 g->neol = 0; 367 g->must = NULL; 368 g->moffset = -1; 369 g->charjump = NULL; 370 g->matchjump = NULL; 371 g->mlen = 0; 372 g->nsub = 0; 373 g->backrefs = 0; 374 375 /* do it */ 376 EMIT(OEND, 0); 377 g->firststate = THERE(); 378 if (cflags & REG_NOSPEC) 379 p_str(p); 380 else 381 p_re(p, OUT, OUT); 382 EMIT(OEND, 0); 383 g->laststate = THERE(); 384 385 /* tidy up loose ends and fill things in */ 386 stripsnug(p, g); 387 findmust(p, g); 388 /* only use Boyer-Moore algorithm if the pattern is bigger 389 * than three characters 390 */ 391 if(g->mlen > 3) { 392 computejumps(p, g); 393 computematchjumps(p, g); 394 if(g->matchjump == NULL && g->charjump != NULL) { 395 free(g->charjump); 396 g->charjump = NULL; 397 } 398 } 399 g->nplus = pluscount(p, g); 400 g->magic = MAGIC2; 401 preg->re_nsub = g->nsub; 402 preg->re_g = g; 403 preg->re_magic = MAGIC1; 404 #ifndef REDEBUG 405 /* not debugging, so can't rely on the assert() in regexec() */ 406 if (g->iflags&BAD) 407 SETERROR(REG_ASSERT); 408 #endif 409 410 /* win or lose, we're done */ 411 if (p->error != 0) /* lose */ 412 regfree(preg); 413 return(p->error); 414 } 415 416 /* 417 - regcomp - interface for parser and compilation 418 = extern int regcomp(regex_t *, const char *, int); 419 = #define REG_BASIC 0000 420 = #define REG_EXTENDED 0001 421 = #define REG_ICASE 0002 422 = #define REG_NOSUB 0004 423 = #define REG_NEWLINE 0010 424 = #define REG_NOSPEC 0020 425 = #define REG_PEND 0040 426 = #define REG_DUMP 0200 427 */ 428 int /* 0 success, otherwise REG_something */ 429 regcomp(regex_t * __restrict preg, 430 const char * __restrict pattern, 431 int cflags) 432 { 433 434 return (regcomp_internal(preg, pattern, cflags, 0)); 435 } 436 437 /* 438 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 439 - return whether we should terminate or not 440 == static bool p_ere_exp(struct parse *p); 441 */ 442 static bool 443 p_ere_exp(struct parse *p, struct branchc *bc) 444 { 445 char c; 446 wint_t wc; 447 sopno pos; 448 int count; 449 int count2; 450 #ifdef REGEX_GNU_EXTENSIONS 451 size_t i; 452 int handled; 453 #endif 454 sopno subno; 455 int wascaret = 0; 456 457 _DIAGASSERT(p != NULL); 458 459 (void)bc; 460 assert(MORE()); /* caller should have ensured this */ 461 c = GETNEXT(); 462 463 #ifdef REGEX_GNU_EXTENSIONS 464 handled = 0; 465 #endif 466 pos = HERE(); 467 switch (c) { 468 case '(': 469 (void)REQUIRE(MORE(), REG_EPAREN); 470 p->g->nsub++; 471 subno = (sopno)p->g->nsub; 472 if (subno < NPAREN) 473 p->pbegin[subno] = HERE(); 474 EMIT(OLPAREN, subno); 475 if (!SEE(')')) 476 p_re(p, ')', IGN); 477 if (subno < NPAREN) { 478 p->pend[subno] = HERE(); 479 assert(p->pend[subno] != 0); 480 } 481 EMIT(ORPAREN, subno); 482 (void)MUSTEAT(')', REG_EPAREN); 483 break; 484 #ifndef POSIX_MISTAKE 485 case ')': /* happens only if no current unmatched ( */ 486 /* 487 * You may ask, why the ifndef? Because I didn't notice 488 * this until slightly too late for 1003.2, and none of the 489 * other 1003.2 regular-expression reviewers noticed it at 490 * all. So an unmatched ) is legal POSIX, at least until 491 * we can get it fixed. 492 */ 493 SETERROR(REG_EPAREN); 494 break; 495 #endif 496 case '^': 497 EMIT(OBOL, 0); 498 p->g->iflags |= USEBOL; 499 p->g->nbol++; 500 wascaret = 1; 501 break; 502 case '$': 503 EMIT(OEOL, 0); 504 p->g->iflags |= USEEOL; 505 p->g->neol++; 506 break; 507 case '|': 508 SETERROR(REG_EMPTY); 509 break; 510 case '*': 511 case '+': 512 case '?': 513 case '{': 514 SETERROR(REG_BADRPT); 515 break; 516 case '.': 517 if (p->g->cflags®_NEWLINE) 518 nonnewline(p); 519 else 520 EMIT(OANY, 0); 521 break; 522 case '[': 523 p_bracket(p); 524 break; 525 case '\\': 526 (void)REQUIRE(MORE(), REG_EESCAPE); 527 wc = WGETNEXT(); 528 #ifdef REGEX_GNU_EXTENSIONS 529 if (p->gnuext) { 530 handled = 1; 531 switch (wc) { 532 case '`': 533 EMIT(OBOS, 0); 534 break; 535 case '\'': 536 EMIT(OEOS, 0); 537 break; 538 case 'B': 539 EMIT(ONWBND, 0); 540 break; 541 case 'b': 542 EMIT(OWBND, 0); 543 break; 544 case 'W': 545 case 'w': 546 case 'S': 547 case 's': 548 p_b_pseudoclass(p, wc); 549 break; 550 case 'a': 551 ordinary(p, '\a'); 552 break; 553 case 'e': 554 ordinary(p, '\e'); 555 break; 556 case 'f': 557 ordinary(p, '\f'); 558 break; 559 case 'n': 560 ordinary(p, '\n'); 561 break; 562 case 'r': 563 ordinary(p, '\r'); 564 break; 565 case 't': 566 ordinary(p, '\t'); 567 break; 568 case 'v': 569 ordinary(p, '\v'); 570 break; 571 case '1': 572 case '2': 573 case '3': 574 case '4': 575 case '5': 576 case '6': 577 case '7': 578 case '8': 579 case '9': 580 i = wc - '0'; 581 assert(i < NPAREN); 582 if (p->pend[i] != 0) { 583 assert(i <= p->g->nsub); 584 EMIT(OBACK_, i); 585 assert(p->pbegin[i] != 0); 586 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 587 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 588 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 589 EMIT(O_BACK, i); 590 } else 591 SETERROR(REG_ESUBREG); 592 p->g->backrefs = 1; 593 break; 594 default: 595 handled = 0; 596 } 597 /* Don't proceed to the POSIX bits if we've already handled it */ 598 if (handled) 599 break; 600 } 601 #endif 602 switch (wc) { 603 case '<': 604 EMIT(OBOW, 0); 605 break; 606 case '>': 607 EMIT(OEOW, 0); 608 break; 609 default: 610 if (may_escape(p, wc)) 611 ordinary(p, wc); 612 else 613 SETERROR(REG_EESCAPE); 614 break; 615 } 616 break; 617 default: 618 if (p->error != 0) 619 return (false); 620 p->next--; 621 wc = WGETNEXT(); 622 ordinary(p, wc); 623 break; 624 } 625 626 if (!MORE()) 627 return (false); 628 c = PEEK(); 629 /* we call { a repetition if followed by a digit */ 630 if (!( c == '*' || c == '+' || c == '?' || c == '{')) 631 return (false); /* no repetition, we're done */ 632 else if (c == '{') 633 (void)REQUIRE(MORE2() && \ 634 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 635 NEXT(); 636 637 (void)REQUIRE(!wascaret, REG_BADRPT); 638 switch (c) { 639 case '*': /* implemented as +? */ 640 /* this case does not require the (y|) trick, noKLUDGE */ 641 INSERT(OPLUS_, pos); 642 ASTERN(O_PLUS, pos); 643 INSERT(OQUEST_, pos); 644 ASTERN(O_QUEST, pos); 645 break; 646 case '+': 647 INSERT(OPLUS_, pos); 648 ASTERN(O_PLUS, pos); 649 break; 650 case '?': 651 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 652 INSERT(OCH_, pos); /* offset slightly wrong */ 653 ASTERN(OOR1, pos); /* this one's right */ 654 AHEAD(pos); /* fix the OCH_ */ 655 EMIT(OOR2, 0); /* offset very wrong... */ 656 AHEAD(THERE()); /* ...so fix it */ 657 ASTERN(O_CH, THERETHERE()); 658 break; 659 case '{': 660 count = p_count(p); 661 if (EAT(',')) { 662 if (isdigit((uch)PEEK())) { 663 count2 = p_count(p); 664 (void)REQUIRE(count <= count2, REG_BADBR); 665 } else /* single number with comma */ 666 count2 = INFINITY; 667 } else /* just a single number */ 668 count2 = count; 669 repeat(p, pos, count, count2); 670 if (!EAT('}')) { /* error heuristics */ 671 while (MORE() && PEEK() != '}') 672 NEXT(); 673 (void)REQUIRE(MORE(), REG_EBRACE); 674 SETERROR(REG_BADBR); 675 } 676 break; 677 } 678 679 if (!MORE()) 680 return (false); 681 c = PEEK(); 682 if (!( c == '*' || c == '+' || c == '?' || 683 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 684 return (false); 685 SETERROR(REG_BADRPT); 686 return (false); 687 } 688 689 /* 690 - p_str - string (no metacharacters) "parser" 691 == static void p_str(struct parse *p); 692 */ 693 static void 694 p_str(struct parse *p) 695 { 696 (void)REQUIRE(MORE(), REG_EMPTY); 697 while (MORE()) 698 ordinary(p, WGETNEXT()); 699 } 700 701 /* 702 * Eat consecutive branch delimiters for the kind of expression that we are 703 * parsing, return the number of delimiters that we ate. 704 */ 705 static int 706 p_branch_eat_delim(struct parse *p, struct branchc *bc) 707 { 708 int nskip; 709 710 (void)bc; 711 nskip = 0; 712 while (EATSPEC('|')) 713 ++nskip; 714 return (nskip); 715 } 716 717 /* 718 * Insert necessary branch book-keeping operations. This emits a 719 * bogus 'next' offset, since we still have more to parse 720 */ 721 static void 722 p_branch_ins_offset(struct parse *p, struct branchc *bc) 723 { 724 725 if (bc->nbranch == 0) { 726 INSERT(OCH_, bc->start); /* offset is wrong */ 727 bc->fwd = bc->start; 728 bc->back = bc->start; 729 } 730 731 ASTERN(OOR1, bc->back); 732 bc->back = THERE(); 733 AHEAD(bc->fwd); /* fix previous offset */ 734 bc->fwd = HERE(); 735 EMIT(OOR2, 0); /* offset is very wrong */ 736 ++bc->nbranch; 737 } 738 739 /* 740 * Fix the offset of the tail branch, if we actually had any branches. 741 * This is to correct the bogus placeholder offset that we use. 742 */ 743 static void 744 p_branch_fix_tail(struct parse *p, struct branchc *bc) 745 { 746 747 /* Fix bogus offset at the tail if we actually have branches */ 748 if (bc->nbranch > 0) { 749 AHEAD(bc->fwd); 750 ASTERN(O_CH, bc->back); 751 } 752 } 753 754 /* 755 * Signal to the parser that an empty branch has been encountered; this will, 756 * in the future, be used to allow for more permissive behavior with empty 757 * branches. The return value should indicate whether parsing may continue 758 * or not. 759 */ 760 static bool 761 p_branch_empty(struct parse *p, struct branchc *bc) 762 { 763 764 (void)bc; 765 SETERROR(REG_EMPTY); 766 return (false); 767 } 768 769 /* 770 * Take care of any branching requirements. This includes inserting the 771 * appropriate branching instructions as well as eating all of the branch 772 * delimiters until we either run out of pattern or need to parse more pattern. 773 */ 774 static bool 775 p_branch_do(struct parse *p, struct branchc *bc) 776 { 777 int ate = 0; 778 779 ate = p_branch_eat_delim(p, bc); 780 if (ate == 0) 781 return (false); 782 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 783 /* 784 * Halt parsing only if we have an empty branch and p_branch_empty 785 * indicates that we must not continue. In the future, this will not 786 * necessarily be an error. 787 */ 788 return (false); 789 p_branch_ins_offset(p, bc); 790 791 return (true); 792 } 793 794 static void 795 p_bre_pre_parse(struct parse *p, struct branchc *bc) 796 { 797 798 (void)bc; 799 /* 800 * Does not move cleanly into expression parser because of 801 * ordinary interpration of * at the beginning position of 802 * an expression. 803 */ 804 if (EAT('^')) { 805 EMIT(OBOL, 0); 806 p->g->iflags |= USEBOL; 807 p->g->nbol++; 808 } 809 } 810 811 static void 812 p_bre_post_parse(struct parse *p, struct branchc *bc) 813 { 814 815 /* Expression is terminating due to EOL token */ 816 if (bc->terminate) { 817 DROP(1); 818 EMIT(OEOL, 0); 819 p->g->iflags |= USEEOL; 820 p->g->neol++; 821 } 822 } 823 824 /* 825 - p_re - Top level parser, concatenation and BRE anchoring 826 == static void p_re(struct parse *p, int end1, int end2); 827 * Giving end1 as OUT essentially eliminates the end1/end2 check. 828 * 829 * This implementation is a bit of a kludge, in that a trailing $ is first 830 * taken as an ordinary character and then revised to be an anchor. 831 * The amount of lookahead needed to avoid this kludge is excessive. 832 */ 833 static void 834 p_re(struct parse *p, 835 int end1, /* first terminating character */ 836 int end2) /* second terminating character; ignored for EREs */ 837 { 838 struct branchc bc; 839 840 bc.nbranch = 0; 841 if (end1 == OUT && end2 == OUT) 842 bc.outer = true; 843 else 844 bc.outer = false; 845 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 846 for (;;) { 847 bc.start = HERE(); 848 bc.nchain = 0; 849 bc.terminate = false; 850 if (p->pre_parse != NULL) 851 p->pre_parse(p, &bc); 852 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 853 bc.terminate = p->parse_expr(p, &bc); 854 ++bc.nchain; 855 } 856 if (p->post_parse != NULL) 857 p->post_parse(p, &bc); 858 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); 859 #ifdef REGEX_GNU_EXTENSIONS 860 if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc)) 861 break; 862 #endif 863 if (!p->allowbranch) 864 break; 865 /* 866 * p_branch_do's return value indicates whether we should 867 * continue parsing or not. This is both for correctness and 868 * a slight optimization, because it will check if we've 869 * encountered an empty branch or the end of the string 870 * immediately following a branch delimiter. 871 */ 872 if (!p_branch_do(p, &bc)) 873 break; 874 } 875 #undef SEE_END 876 if (p->allowbranch) 877 p_branch_fix_tail(p, &bc); 878 assert(!MORE() || SEE(end1)); 879 } 880 881 /* 882 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 883 == static bool p_simp_re(struct parse *p, struct branchc *bc); 884 */ 885 static bool /* was the simple RE an unbackslashed $? */ 886 p_simp_re(struct parse *p, struct branchc *bc) 887 { 888 int c; 889 int cc; /* convenient/control character */ 890 int count; 891 int count2; 892 sopno pos; 893 bool handled; 894 size_t i; 895 wint_t wc; 896 sopno subno; 897 # define BACKSL (1<<CHAR_BIT) 898 899 pos = HERE(); /* repetition op, if any, covers from here */ 900 handled = false; 901 902 assert(MORE()); /* caller should have ensured this */ 903 c = GETNEXT(); 904 if (c == '\\') { 905 (void)REQUIRE(MORE(), REG_EESCAPE); 906 cc = GETNEXT(); 907 c = BACKSL | cc; 908 #ifdef REGEX_GNU_EXTENSIONS 909 if (p->gnuext) { 910 handled = true; 911 switch (c) { 912 case BACKSL|'`': 913 EMIT(OBOS, 0); 914 break; 915 case BACKSL|'\'': 916 EMIT(OEOS, 0); 917 break; 918 case BACKSL|'B': 919 EMIT(ONWBND, 0); 920 break; 921 case BACKSL|'b': 922 EMIT(OWBND, 0); 923 break; 924 case BACKSL|'W': 925 case BACKSL|'w': 926 case BACKSL|'S': 927 case BACKSL|'s': 928 p_b_pseudoclass(p, cc); 929 break; 930 case BACKSL|'a': 931 ordinary(p, '\a'); 932 break; 933 case BACKSL|'e': 934 ordinary(p, '\e'); 935 break; 936 case BACKSL|'f': 937 ordinary(p, '\f'); 938 break; 939 case BACKSL|'n': 940 ordinary(p, '\n'); 941 break; 942 case BACKSL|'r': 943 ordinary(p, '\r'); 944 break; 945 case BACKSL|'t': 946 ordinary(p, '\t'); 947 break; 948 case BACKSL|'v': 949 ordinary(p, '\v'); 950 break; 951 default: 952 handled = false; 953 } 954 } 955 #endif 956 } 957 if (!handled) { 958 switch (c) { 959 case '.': 960 if (p->g->cflags®_NEWLINE) 961 nonnewline(p); 962 else 963 EMIT(OANY, 0); 964 break; 965 case '[': 966 p_bracket(p); 967 break; 968 case BACKSL|'<': 969 EMIT(OBOW, 0); 970 break; 971 case BACKSL|'>': 972 EMIT(OEOW, 0); 973 break; 974 case BACKSL|'{': 975 SETERROR(REG_BADRPT); 976 break; 977 case BACKSL|'(': 978 p->g->nsub++; 979 subno = (sopno)p->g->nsub; 980 if (subno < NPAREN) 981 p->pbegin[subno] = HERE(); 982 EMIT(OLPAREN, subno); 983 /* the MORE here is an error heuristic */ 984 if (MORE() && !SEETWO('\\', ')')) 985 p_re(p, '\\', ')'); 986 if (subno < NPAREN) { 987 p->pend[subno] = HERE(); 988 assert(p->pend[subno] != 0); 989 } 990 EMIT(ORPAREN, subno); 991 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 992 break; 993 case BACKSL|')': /* should not get here -- must be user */ 994 SETERROR(REG_EPAREN); 995 break; 996 case BACKSL|'1': 997 case BACKSL|'2': 998 case BACKSL|'3': 999 case BACKSL|'4': 1000 case BACKSL|'5': 1001 case BACKSL|'6': 1002 case BACKSL|'7': 1003 case BACKSL|'8': 1004 case BACKSL|'9': 1005 i = (c&~BACKSL) - '0'; 1006 assert(i < NPAREN); 1007 if (p->pend[i] != 0) { 1008 assert(i <= p->g->nsub); 1009 EMIT(OBACK_, i); 1010 assert(p->pbegin[i] != 0); 1011 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 1012 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 1013 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 1014 EMIT(O_BACK, i); 1015 } else 1016 SETERROR(REG_ESUBREG); 1017 p->g->backrefs = 1; 1018 break; 1019 case '*': 1020 /* 1021 * Ordinary if used as the first character beyond BOL anchor of 1022 * a (sub-)expression, counts as a bad repetition operator if it 1023 * appears otherwise. 1024 */ 1025 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 1026 /* FALLTHROUGH */ 1027 default: 1028 if (p->error != 0) 1029 return (false); /* Definitely not $... */ 1030 p->next--; 1031 wc = WGETNEXT(); 1032 if ((c & BACKSL) == 0 || may_escape(p, wc)) 1033 ordinary(p, wc); 1034 else 1035 SETERROR(REG_EESCAPE); 1036 break; 1037 } 1038 } 1039 1040 if (EAT('*')) { /* implemented as +? */ 1041 /* this case does not require the (y|) trick, noKLUDGE */ 1042 INSERT(OPLUS_, pos); 1043 ASTERN(O_PLUS, pos); 1044 INSERT(OQUEST_, pos); 1045 ASTERN(O_QUEST, pos); 1046 #ifdef REGEX_GNU_EXTENSIONS 1047 } else if (p->gnuext && EATTWO('\\', '?')) { 1048 INSERT(OQUEST_, pos); 1049 ASTERN(O_QUEST, pos); 1050 } else if (p->gnuext && EATTWO('\\', '+')) { 1051 INSERT(OPLUS_, pos); 1052 ASTERN(O_PLUS, pos); 1053 #endif 1054 } else if (EATTWO('\\', '{')) { 1055 count = p_count(p); 1056 if (EAT(',')) { 1057 if (MORE() && isdigit((uch)PEEK())) { 1058 count2 = p_count(p); 1059 (void)REQUIRE(count <= count2, REG_BADBR); 1060 } else /* single number with comma */ 1061 count2 = INFINITY; 1062 } else /* just a single number */ 1063 count2 = count; 1064 repeat(p, pos, count, count2); 1065 if (!EATTWO('\\', '}')) { /* error heuristics */ 1066 while (MORE() && !SEETWO('\\', '}')) 1067 NEXT(); 1068 (void)REQUIRE(MORE(), REG_EBRACE); 1069 SETERROR(REG_BADBR); 1070 } 1071 } else if (c == '$') /* $ (but not \$) ends it */ 1072 return (true); 1073 1074 return (false); 1075 } 1076 1077 /* 1078 - p_count - parse a repetition count 1079 == static int p_count(struct parse *p); 1080 */ 1081 static int /* the value */ 1082 p_count(struct parse *p) 1083 { 1084 int count = 0; 1085 int ndigits = 0; 1086 1087 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 1088 count = count*10 + (GETNEXT() - '0'); 1089 ndigits++; 1090 } 1091 1092 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 1093 return(count); 1094 } 1095 1096 /* 1097 - p_bracket - parse a bracketed character list 1098 == static void p_bracket(struct parse *p); 1099 */ 1100 static void 1101 p_bracket(struct parse *p) 1102 { 1103 cset *cs; 1104 wint_t ch; 1105 1106 /* Dept of Truly Sickening Special-Case Kludges */ 1107 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 1108 EMIT(OBOW, 0); 1109 NEXTn(6); 1110 return; 1111 } 1112 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 1113 EMIT(OEOW, 0); 1114 NEXTn(6); 1115 return; 1116 } 1117 1118 if ((cs = allocset(p)) == NULL) 1119 return; 1120 1121 if (p->g->cflags®_ICASE) 1122 cs->icase = 1; 1123 if (EAT('^')) 1124 cs->invert = 1; 1125 if (EAT(']')) 1126 CHadd(p, cs, ']'); 1127 else if (EAT('-')) 1128 CHadd(p, cs, '-'); 1129 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 1130 p_b_term(p, cs); 1131 if (EAT('-')) 1132 CHadd(p, cs, '-'); 1133 (void)MUSTEAT(']', REG_EBRACK); 1134 1135 if (p->error != 0) /* don't mess things up further */ 1136 return; 1137 1138 if (cs->invert && p->g->cflags®_NEWLINE) 1139 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 1140 1141 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 1142 ordinary(p, ch); 1143 freeset(p, cs); 1144 } else 1145 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1146 } 1147 1148 static int 1149 p_range_cmp(wchar_t c1, wchar_t c2) 1150 { 1151 #ifdef REGEX_LIBC_COLLATE 1152 return __wcollate_range_cmp(c1, c2); 1153 #elif defined(NLS) 1154 /* Copied from libc/collate __wcollate_range_cmp */ 1155 wchar_t s1[2], s2[2]; 1156 1157 s1[0] = c1; 1158 s1[1] = L'\0'; 1159 s2[0] = c2; 1160 s2[1] = L'\0'; 1161 return wcscoll(s1, s2); 1162 #else 1163 char s1[2], s2[2]; 1164 1165 s1[0] = (char)c1; 1166 s1[1] = '\0'; 1167 s2[0] = (char)c2; 1168 s2[1] = '\0'; 1169 return strcoll(s1, s2); 1170 #endif 1171 } 1172 1173 /* 1174 - p_b_term - parse one term of a bracketed character list 1175 == static void p_b_term(struct parse *p, cset *cs); 1176 */ 1177 static void 1178 p_b_term(struct parse *p, cset *cs) 1179 { 1180 char c; 1181 wint_t start, finish; 1182 wint_t i; 1183 #ifdef REGEX_LIBC_COLLATE 1184 struct xlocale_collate *table = 1185 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 1186 #endif 1187 1188 _DIAGASSERT(p != NULL); 1189 _DIAGASSERT(cs != NULL); 1190 1191 /* classify what we've got */ 1192 switch ((MORE()) ? PEEK() : '\0') { 1193 case '[': 1194 c = (MORE2()) ? PEEK2() : '\0'; 1195 break; 1196 case '-': 1197 SETERROR(REG_ERANGE); 1198 return; /* NOTE RETURN */ 1199 default: 1200 c = '\0'; 1201 break; 1202 } 1203 1204 switch (c) { 1205 case ':': /* character class */ 1206 NEXT2(); 1207 (void)REQUIRE(MORE(), REG_EBRACK); 1208 c = PEEK(); 1209 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 1210 p_b_cclass(p, cs); 1211 (void)REQUIRE(MORE(), REG_EBRACK); 1212 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 1213 break; 1214 case '=': /* equivalence class */ 1215 NEXT2(); 1216 (void)REQUIRE(MORE(), REG_EBRACK); 1217 c = PEEK(); 1218 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 1219 p_b_eclass(p, cs); 1220 (void)REQUIRE(MORE(), REG_EBRACK); 1221 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1222 break; 1223 default: /* symbol, ordinary character, or range */ 1224 start = p_b_symbol(p); 1225 if (SEE('-') && MORE2() && PEEK2() != ']') { 1226 /* range */ 1227 NEXT(); 1228 if (EAT('-')) 1229 finish = '-'; 1230 else 1231 finish = p_b_symbol(p); 1232 } else 1233 finish = start; 1234 if (start == finish) 1235 CHadd(p, cs, start); 1236 else { 1237 #ifdef REGEX_LIBC_COLLATE 1238 if (table->__collate_load_error || MB_CUR_MAX > 1) { 1239 #else 1240 if (MB_CUR_MAX > 1) { 1241 #endif 1242 (void)REQUIRE(start <= finish, REG_ERANGE); 1243 CHaddrange(p, cs, start, finish); 1244 } else { 1245 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1246 for (i = 0; i <= UCHAR_MAX; i++) { 1247 if (p_range_cmp(start, i) <= 0 && 1248 p_range_cmp(i, finish) <= 0 ) 1249 CHadd(p, cs, i); 1250 } 1251 } 1252 } 1253 break; 1254 } 1255 } 1256 1257 #ifdef REGEX_GNU_EXTENSIONS 1258 /* 1259 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) 1260 == static int p_b_pseudoclass(struct parse *p, char c) 1261 */ 1262 static int 1263 p_b_pseudoclass(struct parse *p, char c) { 1264 cset *cs; 1265 1266 if ((cs = allocset(p)) == NULL) 1267 return(0); 1268 1269 if (p->g->cflags®_ICASE) 1270 cs->icase = 1; 1271 1272 switch (c) { 1273 case 'W': 1274 cs->invert = 1; 1275 /* FALLTHROUGH */ 1276 case 'w': 1277 p_b_cclass_named(p, cs, "alnum"); 1278 break; 1279 case 'S': 1280 cs->invert = 1; 1281 /* FALLTHROUGH */ 1282 case 's': 1283 p_b_cclass_named(p, cs, "space"); 1284 break; 1285 default: 1286 return(0); 1287 } 1288 1289 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1290 return(1); 1291 } 1292 #endif 1293 1294 /* 1295 - p_b_cclass - parse a character-class name and deal with it 1296 == static void p_b_cclass(struct parse *p, cset *cs); 1297 */ 1298 static void 1299 p_b_cclass(struct parse *p, cset *cs) 1300 { 1301 const char *sp = p->next; 1302 size_t len; 1303 char clname[16]; 1304 1305 while (MORE() && isalpha((uch)PEEK())) 1306 NEXT(); 1307 len = p->next - sp; 1308 if (len >= sizeof(clname) - 1) { 1309 SETERROR(REG_ECTYPE); 1310 return; 1311 } 1312 memcpy(clname, sp, len); 1313 clname[len] = '\0'; 1314 1315 p_b_cclass_named(p, cs, clname); 1316 } 1317 1318 /* 1319 - p_b_cclass_named - deal with a named character class 1320 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); 1321 */ 1322 static void 1323 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { 1324 wctype_t wct; 1325 1326 if ((wct = wctype(clname)) == 0) { 1327 SETERROR(REG_ECTYPE); 1328 return; 1329 } 1330 CHaddtype(p, cs, wct); 1331 } 1332 1333 /* 1334 - p_b_eclass - parse an equivalence-class name and deal with it 1335 == static void p_b_eclass(struct parse *p, cset *cs); 1336 * 1337 * This implementation is incomplete. xxx 1338 */ 1339 static void 1340 p_b_eclass(struct parse *p, cset *cs) 1341 { 1342 wint_t c; 1343 1344 _DIAGASSERT(p != NULL); 1345 _DIAGASSERT(cs != NULL); 1346 1347 c = p_b_coll_elem(p, '='); 1348 CHadd(p, cs, c); 1349 } 1350 1351 /* 1352 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1353 == static wint_t p_b_symbol(struct parse *p); 1354 */ 1355 static wint_t /* value of symbol */ 1356 p_b_symbol(struct parse *p) 1357 { 1358 wint_t value; 1359 1360 _DIAGASSERT(p != NULL); 1361 1362 (void)REQUIRE(MORE(), REG_EBRACK); 1363 if (!EATTWO('[', '.')) 1364 return(WGETNEXT()); 1365 1366 /* collating symbol */ 1367 value = p_b_coll_elem(p, '.'); 1368 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1369 return(value); 1370 } 1371 1372 /* 1373 - p_b_coll_elem - parse a collating-element name and look it up 1374 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1375 */ 1376 static wint_t /* value of collating element */ 1377 p_b_coll_elem(struct parse *p, 1378 wint_t endc) /* name ended by endc,']' */ 1379 { 1380 const char *sp = p->next; 1381 struct cname *cp; 1382 size_t len; 1383 1384 _DIAGASSERT(p != NULL); 1385 1386 while (MORE() && !SEETWO(endc, ']')) 1387 NEXT(); 1388 if (!MORE()) { 1389 SETERROR(REG_EBRACK); 1390 return(0); 1391 } 1392 len = p->next - sp; 1393 for (cp = cnames; cp->name != NULL; cp++) 1394 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1395 return(cp->code); /* known name */ 1396 #ifdef NLS 1397 mbstate_t mbs; 1398 wchar_t wc; 1399 size_t clen; 1400 1401 memset(&mbs, 0, sizeof(mbs)); 1402 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1403 return (wc); /* single character */ 1404 else if (clen == (size_t)-1 || clen == (size_t)-2) 1405 SETERROR(REG_ILLSEQ); 1406 else 1407 SETERROR(REG_ECOLLATE); /* neither */ 1408 return(0); 1409 #else 1410 if (len == 1) 1411 return *sp; /* single character */ 1412 SETERROR(REG_ECOLLATE); /* neither */ 1413 return 0; 1414 #endif 1415 } 1416 1417 /* 1418 - may_escape - determine whether 'ch' is escape-able in the current context 1419 == static int may_escape(struct parse *p, const wint_t ch) 1420 */ 1421 static bool 1422 may_escape(struct parse *p, const wint_t ch) 1423 { 1424 1425 if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1426 return (true); 1427 if (isalpha(ch) || ch == '\'' || ch == '`') 1428 return (false); 1429 return (true); 1430 #ifdef NOTYET 1431 /* 1432 * Build a whitelist of characters that may be escaped to produce an 1433 * ordinary in the current context. This assumes that these have not 1434 * been otherwise interpreted as a special character. Escaping an 1435 * ordinary character yields undefined results according to 1436 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1437 * advantage of this and use escaped ordinary characters to provide 1438 * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1439 */ 1440 switch(ch) { 1441 case '|': 1442 case '+': 1443 case '?': 1444 /* The above characters may not be escaped in BREs */ 1445 if (!(p->g->cflags®_EXTENDED)) 1446 return (false); 1447 /* Fallthrough */ 1448 case '(': 1449 case ')': 1450 case '{': 1451 case '}': 1452 case '.': 1453 case '[': 1454 case ']': 1455 case '\\': 1456 case '*': 1457 case '^': 1458 case '$': 1459 return (true); 1460 default: 1461 return (false); 1462 } 1463 #endif 1464 } 1465 1466 /* 1467 - othercase - return the case counterpart of an alphabetic 1468 == static wint_t othercase(wint_t ch); 1469 */ 1470 static wint_t /* if no counterpart, return ch */ 1471 othercase(wint_t ch) 1472 { 1473 assert(iswalpha(ch)); 1474 if (iswupper(ch)) 1475 return(towlower(ch)); 1476 else if (iswlower(ch)) 1477 return(towupper(ch)); 1478 else /* peculiar, but could happen */ 1479 return(ch); 1480 } 1481 1482 /* 1483 - bothcases - emit a dualcase version of a two-case character 1484 == static void bothcases(struct parse *p, wint_t ch); 1485 * 1486 * Boy, is this implementation ever a kludge... 1487 */ 1488 static void 1489 bothcases(struct parse *p, wint_t ch) 1490 { 1491 const char *oldnext = p->next; 1492 const char *oldend = p->end; 1493 char bracket[3 + MB_LEN_MAX]; 1494 size_t n; 1495 1496 _DIAGASSERT(p != NULL); 1497 1498 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1499 p->next = bracket; 1500 #ifdef NLS 1501 mbstate_t mbs; 1502 memset(&mbs, 0, sizeof(mbs)); 1503 n = wcrtomb(bracket, ch, &mbs); 1504 assert(n != (size_t)-1); 1505 #else 1506 n = 0; 1507 bracket[n++] = ch; 1508 #endif 1509 bracket[n] = ']'; 1510 bracket[n + 1] = '\0'; 1511 p->end = bracket+n+1; 1512 p_bracket(p); 1513 assert(p->next == p->end); 1514 p->next = oldnext; 1515 p->end = oldend; 1516 } 1517 1518 /* 1519 - ordinary - emit an ordinary character 1520 == static void ordinary(struct parse *p, wint_t ch); 1521 */ 1522 static void 1523 ordinary(struct parse *p, wint_t ch) 1524 { 1525 cset *cs; 1526 1527 _DIAGASSERT(p != NULL); 1528 1529 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1530 bothcases(p, ch); 1531 else if ((wint_t)(ch & OPDMASK) == ch) 1532 EMIT(OCHAR, (size_t)ch); 1533 else { 1534 /* 1535 * Kludge: character is too big to fit into an OCHAR operand. 1536 * Emit a singleton set. 1537 */ 1538 if ((cs = allocset(p)) == NULL) 1539 return; 1540 CHadd(p, cs, ch); 1541 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1542 } 1543 } 1544 1545 /* 1546 - nonnewline - emit REG_NEWLINE version of OANY 1547 == static void nonnewline(struct parse *p); 1548 * 1549 * Boy, is this implementation ever a kludge... 1550 */ 1551 static void 1552 nonnewline(struct parse *p) 1553 { 1554 const char *oldnext = p->next; 1555 const char *oldend = p->end; 1556 char bracket[4]; 1557 1558 _DIAGASSERT(p != NULL); 1559 1560 p->next = bracket; 1561 p->end = bracket+3; 1562 bracket[0] = '^'; 1563 bracket[1] = '\n'; 1564 bracket[2] = ']'; 1565 bracket[3] = '\0'; 1566 p_bracket(p); 1567 assert(p->next == bracket+3); 1568 p->next = oldnext; 1569 p->end = oldend; 1570 } 1571 1572 /* 1573 - repeat - generate code for a bounded repetition, recursively if needed 1574 == static void repeat(struct parse *p, sopno start, int from, int to); 1575 */ 1576 static void 1577 repeat(struct parse *p, 1578 sopno start, /* operand from here to end of strip */ 1579 int from, /* repeated from this number */ 1580 int to) /* to this number of times (maybe INFINITY) */ 1581 { 1582 sopno finish = HERE(); 1583 # define N 2 1584 # define INF 3 1585 # define REP(f, t) ((f)*8 + (t)) 1586 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1587 sopno copy; 1588 1589 _DIAGASSERT(p != NULL); 1590 1591 if (p->error != 0) /* head off possible runaway recursion */ 1592 return; 1593 1594 assert(from <= to); 1595 1596 switch (REP(MAP(from), MAP(to))) { 1597 case REP(0, 0): /* must be user doing this */ 1598 DROP(finish-start); /* drop the operand */ 1599 break; 1600 case REP(0, 1): /* as x{1,1}? */ 1601 case REP(0, N): /* as x{1,n}? */ 1602 case REP(0, INF): /* as x{1,}? */ 1603 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1604 INSERT(OCH_, start); /* offset is wrong... */ 1605 repeat(p, start+1, 1, to); 1606 ASTERN(OOR1, start); 1607 AHEAD(start); /* ... fix it */ 1608 EMIT(OOR2, 0); 1609 AHEAD(THERE()); 1610 ASTERN(O_CH, THERETHERE()); 1611 break; 1612 case REP(1, 1): /* trivial case */ 1613 /* done */ 1614 break; 1615 case REP(1, N): /* as x?x{1,n-1} */ 1616 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1617 INSERT(OCH_, start); 1618 ASTERN(OOR1, start); 1619 AHEAD(start); 1620 EMIT(OOR2, 0); /* offset very wrong... */ 1621 AHEAD(THERE()); /* ...so fix it */ 1622 ASTERN(O_CH, THERETHERE()); 1623 copy = dupl(p, start+1, finish+1); 1624 assert(copy == finish+4); 1625 repeat(p, copy, 1, to-1); 1626 break; 1627 case REP(1, INF): /* as x+ */ 1628 INSERT(OPLUS_, start); 1629 ASTERN(O_PLUS, start); 1630 break; 1631 case REP(N, N): /* as xx{m-1,n-1} */ 1632 copy = dupl(p, start, finish); 1633 repeat(p, copy, from-1, to-1); 1634 break; 1635 case REP(N, INF): /* as xx{n-1,INF} */ 1636 copy = dupl(p, start, finish); 1637 repeat(p, copy, from-1, to); 1638 break; 1639 default: /* "can't happen" */ 1640 SETERROR(REG_ASSERT); /* just in case */ 1641 break; 1642 } 1643 } 1644 1645 /* 1646 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1647 - character from the parse struct, signals a REG_ILLSEQ error if the 1648 - character can't be converted. Returns the number of bytes consumed. 1649 */ 1650 static wint_t 1651 wgetnext(struct parse *p) 1652 { 1653 #ifdef NLS 1654 mbstate_t mbs; 1655 wchar_t wc; 1656 size_t n; 1657 1658 memset(&mbs, 0, sizeof(mbs)); 1659 n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs); 1660 if (n == (size_t)-1 || n == (size_t)-2) { 1661 SETERROR(REG_ILLSEQ); 1662 return (0); 1663 } 1664 if (n == 0) 1665 n = 1; 1666 p->next += n; 1667 return wc; 1668 #else 1669 return *p->next++; 1670 #endif 1671 } 1672 1673 /* 1674 - seterr - set an error condition 1675 == static int seterr(struct parse *p, int e); 1676 */ 1677 static int /* useless but makes type checking happy */ 1678 seterr(struct parse *p, int e) 1679 { 1680 1681 _DIAGASSERT(p != NULL); 1682 1683 if (p->error == 0) /* keep earliest error condition */ 1684 p->error = e; 1685 p->next = nuls; /* try to bring things to a halt */ 1686 p->end = nuls; 1687 return(0); /* make the return value well-defined */ 1688 } 1689 1690 /* 1691 - allocset - allocate a set of characters for [] 1692 == static cset *allocset(struct parse *p); 1693 */ 1694 static cset * 1695 allocset(struct parse *p) 1696 { 1697 cset *cs, *ncs; 1698 1699 _DIAGASSERT(p != NULL); 1700 1701 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1702 if (ncs == NULL) { 1703 SETERROR(REG_ESPACE); 1704 return (NULL); 1705 } 1706 p->g->sets = ncs; 1707 cs = &p->g->sets[p->g->ncsets++]; 1708 memset(cs, 0, sizeof(*cs)); 1709 1710 return(cs); 1711 } 1712 1713 /* 1714 - freeset - free a now-unused set 1715 == static void freeset(struct parse *p, cset *cs); 1716 */ 1717 static void 1718 freeset(struct parse *p, cset *cs) 1719 { 1720 cset *top; 1721 1722 _DIAGASSERT(p != NULL); 1723 _DIAGASSERT(cs != NULL); 1724 1725 top = &p->g->sets[p->g->ncsets]; 1726 1727 free(cs->wides); 1728 free(cs->ranges); 1729 free(cs->types); 1730 memset(cs, 0, sizeof(*cs)); 1731 if (cs == top-1) /* recover only the easy case */ 1732 p->g->ncsets--; 1733 } 1734 1735 /* 1736 - singleton - Determine whether a set contains only one character, 1737 - returning it if so, otherwise returning OUT. 1738 */ 1739 static wint_t 1740 singleton(cset *cs) 1741 { 1742 wint_t i, s, n; 1743 1744 for (i = n = 0; i < NC; i++) 1745 if (CHIN(cs, i)) { 1746 n++; 1747 s = i; 1748 } 1749 if (n == 1) 1750 return (s); 1751 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1752 cs->icase == 0) 1753 return (cs->wides[0]); 1754 /* Don't bother handling the other cases. */ 1755 return (OUT); 1756 } 1757 1758 /* 1759 - CHadd - add character to character set. 1760 */ 1761 static void 1762 CHadd(struct parse *p, cset *cs, wint_t ch) 1763 { 1764 wint_t nch, *newwides; 1765 1766 _DIAGASSERT(p != NULL); 1767 _DIAGASSERT(cs != NULL); 1768 1769 assert(ch >= 0); 1770 if (ch < NC) 1771 cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7); 1772 else { 1773 newwides = reallocarray(cs->wides, cs->nwides + 1, 1774 sizeof(*cs->wides)); 1775 if (newwides == NULL) { 1776 SETERROR(REG_ESPACE); 1777 return; 1778 } 1779 cs->wides = newwides; 1780 cs->wides[cs->nwides++] = ch; 1781 } 1782 if (cs->icase) { 1783 if ((nch = towlower(ch)) < NC) 1784 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1785 if ((nch = towupper(ch)) < NC) 1786 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1787 } 1788 } 1789 1790 /* 1791 - CHaddrange - add all characters in the range [min,max] to a character set. 1792 */ 1793 static void 1794 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1795 { 1796 crange *newranges; 1797 1798 _DIAGASSERT(p != NULL); 1799 _DIAGASSERT(cs != NULL); 1800 1801 for (; min < NC && min <= max; min++) 1802 CHadd(p, cs, min); 1803 if (min >= max) 1804 return; 1805 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1806 sizeof(*cs->ranges)); 1807 if (newranges == NULL) { 1808 SETERROR(REG_ESPACE); 1809 return; 1810 } 1811 cs->ranges = newranges; 1812 cs->ranges[cs->nranges].min = min; 1813 cs->ranges[cs->nranges].max = max; 1814 cs->nranges++; 1815 } 1816 1817 /* 1818 - CHaddtype - add all characters of a certain type to a character set. 1819 */ 1820 static void 1821 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1822 { 1823 wint_t i; 1824 wctype_t *newtypes; 1825 1826 _DIAGASSERT(p != NULL); 1827 _DIAGASSERT(cs != NULL); 1828 1829 for (i = 0; i < NC; i++) 1830 if (iswctype(i, wct)) 1831 CHadd(p, cs, i); 1832 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1833 sizeof(*cs->types)); 1834 if (newtypes == NULL) { 1835 SETERROR(REG_ESPACE); 1836 return; 1837 } 1838 cs->types = newtypes; 1839 cs->types[cs->ntypes++] = wct; 1840 } 1841 1842 /* 1843 - dupl - emit a duplicate of a bunch of sops 1844 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1845 */ 1846 static sopno /* start of duplicate */ 1847 dupl(struct parse *p, 1848 sopno start, /* from here */ 1849 sopno finish) /* to this less one */ 1850 { 1851 sopno ret = HERE(); 1852 sopno len = finish - start; 1853 1854 _DIAGASSERT(p != NULL); 1855 1856 assert(finish >= start); 1857 if (len == 0) 1858 return(ret); 1859 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1860 return(ret); 1861 (void) memcpy(p->strip + p->slen, 1862 p->strip + start, len * sizeof(*p->strip)); 1863 p->slen += len; 1864 return(ret); 1865 } 1866 1867 /* 1868 - doemit - emit a strip operator 1869 == static void doemit(struct parse *p, sop op, size_t opnd); 1870 * 1871 * It might seem better to implement this as a macro with a function as 1872 * hard-case backup, but it's just too big and messy unless there are 1873 * some changes to the data structures. Maybe later. 1874 */ 1875 static void 1876 doemit(struct parse *p, sop op, size_t opnd) 1877 { 1878 /* avoid making error situations worse */ 1879 if (p->error != 0) 1880 return; 1881 1882 _DIAGASSERT(p != NULL); 1883 1884 /* deal with oversize operands ("can't happen", more or less) */ 1885 assert(opnd < 1<<OPSHIFT); 1886 1887 /* deal with undersized strip */ 1888 if (p->slen >= p->ssize) 1889 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1890 return; 1891 1892 /* finally, it's all reduced to the easy case */ 1893 p->strip[p->slen++] = (sopno)SOP(op, opnd); 1894 } 1895 1896 /* 1897 - doinsert - insert a sop into the strip 1898 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1899 */ 1900 static void 1901 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1902 { 1903 sopno sn; 1904 sop s; 1905 int i; 1906 1907 _DIAGASSERT(p != NULL); 1908 1909 /* avoid making error situations worse */ 1910 if (p->error != 0) 1911 return; 1912 1913 sn = HERE(); 1914 EMIT(op, opnd); /* do checks, ensure space */ 1915 assert(HERE() == sn+1); 1916 s = p->strip[sn]; 1917 1918 /* adjust paren pointers */ 1919 assert(pos > 0); 1920 for (i = 1; i < NPAREN; i++) { 1921 if (p->pbegin[i] >= pos) { 1922 p->pbegin[i]++; 1923 } 1924 if (p->pend[i] >= pos) { 1925 p->pend[i]++; 1926 } 1927 } 1928 1929 memmove(&p->strip[pos+1], &p->strip[pos], 1930 (HERE()-pos-1)*sizeof(*p->strip)); 1931 p->strip[pos] = s; 1932 } 1933 1934 /* 1935 - dofwd - complete a forward reference 1936 == static void dofwd(struct parse *p, sopno pos, sop value); 1937 */ 1938 static void 1939 dofwd(struct parse *p, sopno pos, sop value) 1940 { 1941 1942 _DIAGASSERT(p != NULL); 1943 1944 /* avoid making error situations worse */ 1945 if (p->error != 0) 1946 return; 1947 1948 assert(value < 1<<OPSHIFT); 1949 p->strip[pos] = OP(p->strip[pos]) | value; 1950 } 1951 1952 /* 1953 - enlarge - enlarge the strip 1954 == static int enlarge(struct parse *p, sopno size); 1955 */ 1956 static int 1957 enlarge(struct parse *p, sopno size) 1958 { 1959 sop *sp; 1960 1961 _DIAGASSERT(p != NULL); 1962 1963 if (p->ssize >= size) 1964 return 1; 1965 1966 sp = reallocarray(p->strip, size, sizeof(*p->strip)); 1967 if (sp == NULL) { 1968 SETERROR(REG_ESPACE); 1969 return 0; 1970 } 1971 p->strip = sp; 1972 p->ssize = size; 1973 return 1; 1974 } 1975 1976 /* 1977 - stripsnug - compact the strip 1978 == static void stripsnug(struct parse *p, struct re_guts *g); 1979 */ 1980 static void 1981 stripsnug(struct parse *p, struct re_guts *g) 1982 { 1983 1984 _DIAGASSERT(p != NULL); 1985 _DIAGASSERT(g != NULL); 1986 1987 g->nstates = p->slen; 1988 g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip)); 1989 if (g->strip == NULL) { 1990 SETERROR(REG_ESPACE); 1991 g->strip = p->strip; 1992 } 1993 } 1994 1995 /* 1996 - findmust - fill in must and mlen with longest mandatory literal string 1997 == static void findmust(struct parse *p, struct re_guts *g); 1998 * 1999 * This algorithm could do fancy things like analyzing the operands of | 2000 * for common subsequences. Someday. This code is simple and finds most 2001 * of the interesting cases. 2002 * 2003 * Note that must and mlen got initialized during setup. 2004 */ 2005 static void 2006 findmust(struct parse *p, struct re_guts *g) 2007 { 2008 sop *scan; 2009 sop *start = NULL; 2010 sop *newstart = NULL; 2011 sopno newlen; 2012 sop s; 2013 char *cp; 2014 int offset; 2015 mbstate_t mbs; 2016 2017 _DIAGASSERT(p != NULL); 2018 _DIAGASSERT(g != NULL); 2019 2020 /* avoid making error situations worse */ 2021 if (p->error != 0) 2022 return; 2023 2024 #ifdef notyet 2025 /* 2026 * It's not generally safe to do a ``char'' substring search on 2027 * multibyte character strings, but it's safe for at least 2028 * UTF-8 (see RFC 3629). 2029 */ 2030 if (MB_CUR_MAX > 1 && 2031 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 2032 return; 2033 #endif 2034 2035 /* find the longest OCHAR sequence in strip */ 2036 newlen = 0; 2037 offset = 0; 2038 g->moffset = 0; 2039 scan = g->strip + 1; 2040 do { 2041 s = *scan++; 2042 switch (OP(s)) { 2043 case OCHAR: /* sequence member */ 2044 if (newlen == 0) { /* new sequence */ 2045 memset(&mbs, 0, sizeof(mbs)); 2046 newstart = scan - 1; 2047 } 2048 #ifdef NLS 2049 char buf[MB_LEN_MAX]; 2050 size_t clen = wcrtomb(buf, (int)OPND(s), &mbs); 2051 if (clen == (size_t)-1) 2052 goto toohard; 2053 newlen += (sopno)clen; 2054 #else 2055 newlen++; 2056 #endif 2057 break; 2058 case OPLUS_: /* things that don't break one */ 2059 case OLPAREN: 2060 case ORPAREN: 2061 break; 2062 case OQUEST_: /* things that must be skipped */ 2063 case OCH_: 2064 offset = altoffset(scan, offset); 2065 scan--; 2066 do { 2067 scan += OPND(s); 2068 s = *scan; 2069 /* assert() interferes w debug printouts */ 2070 if (OP(s) != O_QUEST && 2071 OP(s) != O_CH && OP(s) != OOR2) { 2072 g->iflags |= BAD; 2073 return; 2074 } 2075 } while (OP(s) != O_QUEST && OP(s) != O_CH); 2076 /* FALLTHROUGH */ 2077 case OBOW: /* things that break a sequence */ 2078 case OEOW: 2079 case OBOL: 2080 case OEOL: 2081 case OBOS: 2082 case OEOS: 2083 case OWBND: 2084 case ONWBND: 2085 case O_QUEST: 2086 case O_CH: 2087 case OEND: 2088 if (newlen > (sopno)g->mlen) { /* ends one */ 2089 start = newstart; 2090 g->mlen = newlen; 2091 if (offset > -1) { 2092 g->moffset += offset; 2093 offset = newlen; 2094 } else 2095 g->moffset = offset; 2096 } else { 2097 if (offset > -1) 2098 offset += newlen; 2099 } 2100 newlen = 0; 2101 break; 2102 case OANY: 2103 if (newlen > (sopno)g->mlen) { /* ends one */ 2104 start = newstart; 2105 g->mlen = newlen; 2106 if (offset > -1) { 2107 g->moffset += offset; 2108 offset = newlen; 2109 } else 2110 g->moffset = offset; 2111 } else { 2112 if (offset > -1) 2113 offset += newlen; 2114 } 2115 if (offset > -1) 2116 offset++; 2117 newlen = 0; 2118 break; 2119 case OANYOF: /* may or may not invalidate offset */ 2120 /* First, everything as OANY */ 2121 if (newlen > (sopno)g->mlen) { /* ends one */ 2122 start = newstart; 2123 g->mlen = newlen; 2124 if (offset > -1) { 2125 g->moffset += offset; 2126 offset = newlen; 2127 } else 2128 g->moffset = offset; 2129 } else { 2130 if (offset > -1) 2131 offset += newlen; 2132 } 2133 if (offset > -1) 2134 offset++; 2135 newlen = 0; 2136 break; 2137 #ifdef NLS 2138 toohard:/*FALLTHROUGH*/ 2139 #endif 2140 default: 2141 /* Anything here makes it impossible or too hard 2142 * to calculate the offset -- so we give up; 2143 * save the last known good offset, in case the 2144 * must sequence doesn't occur later. 2145 */ 2146 if (newlen > (sopno)g->mlen) { /* ends one */ 2147 start = newstart; 2148 g->mlen = newlen; 2149 if (offset > -1) 2150 g->moffset += offset; 2151 else 2152 g->moffset = offset; 2153 } 2154 offset = -1; 2155 newlen = 0; 2156 break; 2157 } 2158 } while (OP(s) != OEND); 2159 2160 if (g->mlen == 0) { /* there isn't one */ 2161 g->moffset = -1; 2162 return; 2163 } 2164 2165 /* turn it into a character string */ 2166 g->must = malloc((size_t)g->mlen + 1); 2167 if (g->must == NULL) { /* argh; just forget it */ 2168 g->mlen = 0; 2169 g->moffset = -1; 2170 return; 2171 } 2172 cp = g->must; 2173 scan = start; 2174 memset(&mbs, 0, sizeof(mbs)); 2175 while (cp < g->must + g->mlen) { 2176 while (OP(s = *scan++) != OCHAR) 2177 continue; 2178 #ifdef NLS 2179 size_t clen = wcrtomb(cp, (int)OPND(s), &mbs); 2180 assert(clen != (size_t)-1); 2181 cp += clen; 2182 #else 2183 *cp++ = OPND(s); 2184 #endif 2185 } 2186 assert(cp == g->must + g->mlen); 2187 *cp++ = '\0'; /* just on general principles */ 2188 } 2189 2190 /* 2191 - altoffset - choose biggest offset among multiple choices 2192 == static int altoffset(sop *scan, int offset); 2193 * 2194 * Compute, recursively if necessary, the largest offset among multiple 2195 * re paths. 2196 */ 2197 static int 2198 altoffset(sop *scan, int offset) 2199 { 2200 int largest; 2201 int try; 2202 sop s; 2203 2204 _DIAGASSERT(scan != NULL); 2205 2206 /* If we gave up already on offsets, return */ 2207 if (offset == -1) 2208 return -1; 2209 2210 largest = 0; 2211 try = 0; 2212 s = *scan++; 2213 while (OP(s) != O_QUEST && OP(s) != O_CH) { 2214 switch (OP(s)) { 2215 case OOR1: 2216 if (try > largest) 2217 largest = try; 2218 try = 0; 2219 break; 2220 case OQUEST_: 2221 case OCH_: 2222 try = altoffset(scan, try); 2223 if (try == -1) 2224 return -1; 2225 scan--; 2226 do { 2227 scan += OPND(s); 2228 s = *scan; 2229 if (OP(s) != O_QUEST && 2230 OP(s) != O_CH && OP(s) != OOR2) 2231 return -1; 2232 } while (OP(s) != O_QUEST && OP(s) != O_CH); 2233 /* We must skip to the next position, or we'll 2234 * leave altoffset() too early. 2235 */ 2236 scan++; 2237 break; 2238 case OANYOF: 2239 case OCHAR: 2240 case OANY: 2241 try++; 2242 /*FALLTHROUGH*/ 2243 case OBOW: 2244 case OEOW: 2245 case OWBND: 2246 case ONWBND: 2247 case OLPAREN: 2248 case ORPAREN: 2249 case OOR2: 2250 break; 2251 default: 2252 try = -1; 2253 break; 2254 } 2255 if (try == -1) 2256 return -1; 2257 s = *scan++; 2258 } 2259 2260 if (try > largest) 2261 largest = try; 2262 2263 return largest+offset; 2264 } 2265 2266 /* 2267 - computejumps - compute char jumps for BM scan 2268 == static void computejumps(struct parse *p, struct re_guts *g); 2269 * 2270 * This algorithm assumes g->must exists and is has size greater than 2271 * zero. It's based on the algorithm found on Computer Algorithms by 2272 * Sara Baase. 2273 * 2274 * A char jump is the number of characters one needs to jump based on 2275 * the value of the character from the text that was mismatched. 2276 */ 2277 static void 2278 computejumps(struct parse *p, struct re_guts *g) 2279 { 2280 int ch; 2281 size_t mindex; 2282 2283 _DIAGASSERT(p != NULL); 2284 _DIAGASSERT(g != NULL); 2285 2286 /* Avoid making errors worse */ 2287 if (p->error != 0) 2288 return; 2289 2290 g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump)); 2291 if (g->charjump == NULL) /* Not a fatal error */ 2292 return; 2293 /* Adjust for signed chars, if necessary */ 2294 g->charjump = &g->charjump[-(CHAR_MIN)]; 2295 2296 /* If the character does not exist in the pattern, the jump 2297 * is equal to the number of characters in the pattern. 2298 */ 2299 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 2300 g->charjump[ch] = g->mlen; 2301 2302 /* If the character does exist, compute the jump that would 2303 * take us to the last character in the pattern equal to it 2304 * (notice that we match right to left, so that last character 2305 * is the first one that would be matched). 2306 */ 2307 for (mindex = 0; mindex < g->mlen; mindex++) 2308 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 2309 } 2310 2311 /* 2312 - computematchjumps - compute match jumps for BM scan 2313 == static void computematchjumps(struct parse *p, struct re_guts *g); 2314 * 2315 * This algorithm assumes g->must exists and is has size greater than 2316 * zero. It's based on the algorithm found on Computer Algorithms by 2317 * Sara Baase. 2318 * 2319 * A match jump is the number of characters one needs to advance based 2320 * on the already-matched suffix. 2321 * Notice that all values here are minus (g->mlen-1), because of the way 2322 * the search algorithm works. 2323 */ 2324 static void 2325 computematchjumps(struct parse *p, struct re_guts *g) 2326 { 2327 size_t mindex; /* General "must" iterator */ 2328 size_t suffix; /* Keeps track of matching suffix */ 2329 size_t ssuffix; /* Keeps track of suffixes' suffix */ 2330 size_t* pmatches; /* pmatches[k] points to the next i 2331 * such that i+1...mlen is a substring 2332 * of k+1...k+mlen-i-1 2333 */ 2334 2335 _DIAGASSERT(p != NULL); 2336 _DIAGASSERT(g != NULL); 2337 2338 /* Avoid making errors worse */ 2339 if (p->error != 0) 2340 return; 2341 2342 pmatches = calloc(g->mlen, sizeof(*pmatches)); 2343 if (pmatches == NULL) { 2344 g->matchjump = NULL; 2345 return; 2346 } 2347 2348 g->matchjump = calloc(g->mlen, sizeof(*g->matchjump)); 2349 if (g->matchjump == NULL) { /* Not a fatal error */ 2350 free(pmatches); 2351 return; 2352 } 2353 2354 /* Set maximum possible jump for each character in the pattern */ 2355 for (mindex = 0; mindex < g->mlen; mindex++) 2356 g->matchjump[mindex] = 2 * g->mlen - mindex - 1; 2357 2358 /* Compute pmatches[] */ 2359 for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) { 2360 pmatches[mindex] = suffix; 2361 2362 /* If a mismatch is found, interrupting the substring, 2363 * compute the matchjump for that position. If no 2364 * mismatch is found, then a text substring mismatched 2365 * against the suffix will also mismatch against the 2366 * substring. 2367 */ 2368 while (suffix < g->mlen 2369 && g->must[mindex] != g->must[suffix]) { 2370 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2371 g->mlen - mindex - 1); 2372 suffix = pmatches[suffix]; 2373 } 2374 } 2375 2376 /* Compute the matchjump up to the last substring found to jump 2377 * to the beginning of the largest must pattern prefix matching 2378 * it's own suffix. 2379 */ 2380 for (mindex = 0; mindex <= suffix; mindex++) 2381 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2382 g->mlen + suffix - mindex); 2383 2384 ssuffix = pmatches[suffix]; 2385 while (suffix < g->mlen) { 2386 while (suffix <= ssuffix && suffix < g->mlen) { 2387 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2388 g->mlen + ssuffix - suffix); 2389 suffix++; 2390 } 2391 if (suffix < g->mlen) 2392 ssuffix = pmatches[ssuffix]; 2393 } 2394 2395 free(pmatches); 2396 } 2397 2398 /* 2399 - pluscount - count + nesting 2400 == static sopno pluscount(struct parse *p, struct re_guts *g); 2401 */ 2402 static sopno /* nesting depth */ 2403 pluscount(struct parse *p, struct re_guts *g) 2404 { 2405 sop *scan; 2406 sop s; 2407 sopno plusnest = 0; 2408 sopno maxnest = 0; 2409 2410 _DIAGASSERT(p != NULL); 2411 _DIAGASSERT(g != NULL); 2412 2413 if (p->error != 0) 2414 return(0); /* there may not be an OEND */ 2415 2416 scan = g->strip + 1; 2417 do { 2418 s = *scan++; 2419 switch (OP(s)) { 2420 case OPLUS_: 2421 plusnest++; 2422 break; 2423 case O_PLUS: 2424 if (plusnest > maxnest) 2425 maxnest = plusnest; 2426 plusnest--; 2427 break; 2428 } 2429 } while (OP(s) != OEND); 2430 if (plusnest != 0) 2431 g->iflags |= BAD; 2432 return(maxnest); 2433 } 2434