10b57cec5SDimitry Andric /*- 20b57cec5SDimitry Andric * This code is derived from OpenBSD's libc/regex, original license follows: 30b57cec5SDimitry Andric * 40b57cec5SDimitry Andric * Copyright (c) 1992, 1993, 1994 Henry Spencer. 50b57cec5SDimitry Andric * Copyright (c) 1992, 1993, 1994 60b57cec5SDimitry Andric * The Regents of the University of California. All rights reserved. 70b57cec5SDimitry Andric * 80b57cec5SDimitry Andric * This code is derived from software contributed to Berkeley by 90b57cec5SDimitry Andric * Henry Spencer. 100b57cec5SDimitry Andric * 110b57cec5SDimitry Andric * Redistribution and use in source and binary forms, with or without 120b57cec5SDimitry Andric * modification, are permitted provided that the following conditions 130b57cec5SDimitry Andric * are met: 140b57cec5SDimitry Andric * 1. Redistributions of source code must retain the above copyright 150b57cec5SDimitry Andric * notice, this list of conditions and the following disclaimer. 160b57cec5SDimitry Andric * 2. Redistributions in binary form must reproduce the above copyright 170b57cec5SDimitry Andric * notice, this list of conditions and the following disclaimer in the 180b57cec5SDimitry Andric * documentation and/or other materials provided with the distribution. 190b57cec5SDimitry Andric * 3. Neither the name of the University nor the names of its contributors 200b57cec5SDimitry Andric * may be used to endorse or promote products derived from this software 210b57cec5SDimitry Andric * without specific prior written permission. 220b57cec5SDimitry Andric * 230b57cec5SDimitry Andric * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 240b57cec5SDimitry Andric * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 250b57cec5SDimitry Andric * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 260b57cec5SDimitry Andric * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 270b57cec5SDimitry Andric * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 280b57cec5SDimitry Andric * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 290b57cec5SDimitry Andric * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 300b57cec5SDimitry Andric * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 310b57cec5SDimitry Andric * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 320b57cec5SDimitry Andric * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 330b57cec5SDimitry Andric * SUCH DAMAGE. 340b57cec5SDimitry Andric * 350b57cec5SDimitry Andric * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 360b57cec5SDimitry Andric */ 370b57cec5SDimitry Andric 380b57cec5SDimitry Andric #include <sys/types.h> 390b57cec5SDimitry Andric #include <stdint.h> 400b57cec5SDimitry Andric #include <stdio.h> 410b57cec5SDimitry Andric #include <string.h> 420b57cec5SDimitry Andric #include <ctype.h> 430b57cec5SDimitry Andric #include <limits.h> 440b57cec5SDimitry Andric #include <stdlib.h> 450b57cec5SDimitry Andric #include "regex_impl.h" 460b57cec5SDimitry Andric 470b57cec5SDimitry Andric #include "regutils.h" 480b57cec5SDimitry Andric #include "regex2.h" 490b57cec5SDimitry Andric 500b57cec5SDimitry Andric #include "llvm/Config/config.h" 518bcb0991SDimitry Andric #include "llvm/Support/Compiler.h" 520b57cec5SDimitry Andric 530b57cec5SDimitry Andric /* character-class table */ 540b57cec5SDimitry Andric static struct cclass { 550b57cec5SDimitry Andric const char *name; 560b57cec5SDimitry Andric const char *chars; 570b57cec5SDimitry Andric const char *multis; 580b57cec5SDimitry Andric } cclasses[] = { 590b57cec5SDimitry Andric { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ 600b57cec5SDimitry Andric 0123456789", ""} , 610b57cec5SDimitry Andric { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", 620b57cec5SDimitry Andric ""} , 630b57cec5SDimitry Andric { "blank", " \t", ""} , 640b57cec5SDimitry Andric { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ 650b57cec5SDimitry Andric \25\26\27\30\31\32\33\34\35\36\37\177", ""} , 660b57cec5SDimitry Andric { "digit", "0123456789", ""} , 670b57cec5SDimitry Andric { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ 680b57cec5SDimitry Andric 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", 690b57cec5SDimitry Andric ""} , 700b57cec5SDimitry Andric { "lower", "abcdefghijklmnopqrstuvwxyz", 710b57cec5SDimitry Andric ""} , 720b57cec5SDimitry Andric { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ 730b57cec5SDimitry Andric 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", 740b57cec5SDimitry Andric ""} , 750b57cec5SDimitry Andric { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", 760b57cec5SDimitry Andric ""} , 770b57cec5SDimitry Andric { "space", "\t\n\v\f\r ", ""} , 780b57cec5SDimitry Andric { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 790b57cec5SDimitry Andric ""} , 800b57cec5SDimitry Andric { "xdigit", "0123456789ABCDEFabcdef", 810b57cec5SDimitry Andric ""} , 820b57cec5SDimitry Andric { NULL, 0, "" } 830b57cec5SDimitry Andric }; 840b57cec5SDimitry Andric 850b57cec5SDimitry Andric /* character-name table */ 860b57cec5SDimitry Andric static struct cname { 870b57cec5SDimitry Andric const char *name; 880b57cec5SDimitry Andric char code; 890b57cec5SDimitry Andric } cnames[] = { 900b57cec5SDimitry Andric { "NUL", '\0' }, 910b57cec5SDimitry Andric { "SOH", '\001' }, 920b57cec5SDimitry Andric { "STX", '\002' }, 930b57cec5SDimitry Andric { "ETX", '\003' }, 940b57cec5SDimitry Andric { "EOT", '\004' }, 950b57cec5SDimitry Andric { "ENQ", '\005' }, 960b57cec5SDimitry Andric { "ACK", '\006' }, 970b57cec5SDimitry Andric { "BEL", '\007' }, 980b57cec5SDimitry Andric { "alert", '\007' }, 990b57cec5SDimitry Andric { "BS", '\010' }, 1000b57cec5SDimitry Andric { "backspace", '\b' }, 1010b57cec5SDimitry Andric { "HT", '\011' }, 1020b57cec5SDimitry Andric { "tab", '\t' }, 1030b57cec5SDimitry Andric { "LF", '\012' }, 1040b57cec5SDimitry Andric { "newline", '\n' }, 1050b57cec5SDimitry Andric { "VT", '\013' }, 1060b57cec5SDimitry Andric { "vertical-tab", '\v' }, 1070b57cec5SDimitry Andric { "FF", '\014' }, 1080b57cec5SDimitry Andric { "form-feed", '\f' }, 1090b57cec5SDimitry Andric { "CR", '\015' }, 1100b57cec5SDimitry Andric { "carriage-return", '\r' }, 1110b57cec5SDimitry Andric { "SO", '\016' }, 1120b57cec5SDimitry Andric { "SI", '\017' }, 1130b57cec5SDimitry Andric { "DLE", '\020' }, 1140b57cec5SDimitry Andric { "DC1", '\021' }, 1150b57cec5SDimitry Andric { "DC2", '\022' }, 1160b57cec5SDimitry Andric { "DC3", '\023' }, 1170b57cec5SDimitry Andric { "DC4", '\024' }, 1180b57cec5SDimitry Andric { "NAK", '\025' }, 1190b57cec5SDimitry Andric { "SYN", '\026' }, 1200b57cec5SDimitry Andric { "ETB", '\027' }, 1210b57cec5SDimitry Andric { "CAN", '\030' }, 1220b57cec5SDimitry Andric { "EM", '\031' }, 1230b57cec5SDimitry Andric { "SUB", '\032' }, 1240b57cec5SDimitry Andric { "ESC", '\033' }, 1250b57cec5SDimitry Andric { "IS4", '\034' }, 1260b57cec5SDimitry Andric { "FS", '\034' }, 1270b57cec5SDimitry Andric { "IS3", '\035' }, 1280b57cec5SDimitry Andric { "GS", '\035' }, 1290b57cec5SDimitry Andric { "IS2", '\036' }, 1300b57cec5SDimitry Andric { "RS", '\036' }, 1310b57cec5SDimitry Andric { "IS1", '\037' }, 1320b57cec5SDimitry Andric { "US", '\037' }, 1330b57cec5SDimitry Andric { "space", ' ' }, 1340b57cec5SDimitry Andric { "exclamation-mark", '!' }, 1350b57cec5SDimitry Andric { "quotation-mark", '"' }, 1360b57cec5SDimitry Andric { "number-sign", '#' }, 1370b57cec5SDimitry Andric { "dollar-sign", '$' }, 1380b57cec5SDimitry Andric { "percent-sign", '%' }, 1390b57cec5SDimitry Andric { "ampersand", '&' }, 1400b57cec5SDimitry Andric { "apostrophe", '\'' }, 1410b57cec5SDimitry Andric { "left-parenthesis", '(' }, 1420b57cec5SDimitry Andric { "right-parenthesis", ')' }, 1430b57cec5SDimitry Andric { "asterisk", '*' }, 1440b57cec5SDimitry Andric { "plus-sign", '+' }, 1450b57cec5SDimitry Andric { "comma", ',' }, 1460b57cec5SDimitry Andric { "hyphen", '-' }, 1470b57cec5SDimitry Andric { "hyphen-minus", '-' }, 1480b57cec5SDimitry Andric { "period", '.' }, 1490b57cec5SDimitry Andric { "full-stop", '.' }, 1500b57cec5SDimitry Andric { "slash", '/' }, 1510b57cec5SDimitry Andric { "solidus", '/' }, 1520b57cec5SDimitry Andric { "zero", '0' }, 1530b57cec5SDimitry Andric { "one", '1' }, 1540b57cec5SDimitry Andric { "two", '2' }, 1550b57cec5SDimitry Andric { "three", '3' }, 1560b57cec5SDimitry Andric { "four", '4' }, 1570b57cec5SDimitry Andric { "five", '5' }, 1580b57cec5SDimitry Andric { "six", '6' }, 1590b57cec5SDimitry Andric { "seven", '7' }, 1600b57cec5SDimitry Andric { "eight", '8' }, 1610b57cec5SDimitry Andric { "nine", '9' }, 1620b57cec5SDimitry Andric { "colon", ':' }, 1630b57cec5SDimitry Andric { "semicolon", ';' }, 1640b57cec5SDimitry Andric { "less-than-sign", '<' }, 1650b57cec5SDimitry Andric { "equals-sign", '=' }, 1660b57cec5SDimitry Andric { "greater-than-sign", '>' }, 1670b57cec5SDimitry Andric { "question-mark", '?' }, 1680b57cec5SDimitry Andric { "commercial-at", '@' }, 1690b57cec5SDimitry Andric { "left-square-bracket", '[' }, 1700b57cec5SDimitry Andric { "backslash", '\\' }, 1710b57cec5SDimitry Andric { "reverse-solidus", '\\' }, 1720b57cec5SDimitry Andric { "right-square-bracket", ']' }, 1730b57cec5SDimitry Andric { "circumflex", '^' }, 1740b57cec5SDimitry Andric { "circumflex-accent", '^' }, 1750b57cec5SDimitry Andric { "underscore", '_' }, 1760b57cec5SDimitry Andric { "low-line", '_' }, 1770b57cec5SDimitry Andric { "grave-accent", '`' }, 1780b57cec5SDimitry Andric { "left-brace", '{' }, 1790b57cec5SDimitry Andric { "left-curly-bracket", '{' }, 1800b57cec5SDimitry Andric { "vertical-line", '|' }, 1810b57cec5SDimitry Andric { "right-brace", '}' }, 1820b57cec5SDimitry Andric { "right-curly-bracket", '}' }, 1830b57cec5SDimitry Andric { "tilde", '~' }, 1840b57cec5SDimitry Andric { "DEL", '\177' }, 1850b57cec5SDimitry Andric { NULL, 0 } 1860b57cec5SDimitry Andric }; 1870b57cec5SDimitry Andric 1880b57cec5SDimitry Andric /* 1890b57cec5SDimitry Andric * parse structure, passed up and down to avoid global variables and 1900b57cec5SDimitry Andric * other clumsinesses 1910b57cec5SDimitry Andric */ 1920b57cec5SDimitry Andric struct parse { 1935f757f3fSDimitry Andric const char *next; /* next character in RE */ 1945f757f3fSDimitry Andric const char *end; /* end of string (-> NUL normally) */ 1950b57cec5SDimitry Andric int error; /* has an error been seen? */ 1960b57cec5SDimitry Andric sop *strip; /* malloced strip */ 1970b57cec5SDimitry Andric sopno ssize; /* malloced strip size (allocated) */ 1980b57cec5SDimitry Andric sopno slen; /* malloced strip length (used) */ 1990b57cec5SDimitry Andric int ncsalloc; /* number of csets allocated */ 2000b57cec5SDimitry Andric struct re_guts *g; 2010b57cec5SDimitry Andric # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 2020b57cec5SDimitry Andric sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 2030b57cec5SDimitry Andric sopno pend[NPAREN]; /* -> ) ([0] unused) */ 2040b57cec5SDimitry Andric }; 2050b57cec5SDimitry Andric 2060b57cec5SDimitry Andric static void p_ere(struct parse *, int); 2070b57cec5SDimitry Andric static void p_ere_exp(struct parse *); 2080b57cec5SDimitry Andric static void p_str(struct parse *); 2090b57cec5SDimitry Andric static void p_bre(struct parse *, int, int); 2100b57cec5SDimitry Andric static int p_simp_re(struct parse *, int); 2110b57cec5SDimitry Andric static int p_count(struct parse *); 2120b57cec5SDimitry Andric static void p_bracket(struct parse *); 2130b57cec5SDimitry Andric static void p_b_term(struct parse *, cset *); 2140b57cec5SDimitry Andric static void p_b_cclass(struct parse *, cset *); 2150b57cec5SDimitry Andric static void p_b_eclass(struct parse *, cset *); 2160b57cec5SDimitry Andric static char p_b_symbol(struct parse *); 2170b57cec5SDimitry Andric static char p_b_coll_elem(struct parse *, int); 2180b57cec5SDimitry Andric static char othercase(int); 2190b57cec5SDimitry Andric static void bothcases(struct parse *, int); 2200b57cec5SDimitry Andric static void ordinary(struct parse *, int); 2210b57cec5SDimitry Andric static void nonnewline(struct parse *); 2220b57cec5SDimitry Andric static void repeat(struct parse *, sopno, int, int); 2230b57cec5SDimitry Andric static int seterr(struct parse *, int); 2240b57cec5SDimitry Andric static cset *allocset(struct parse *); 2250b57cec5SDimitry Andric static void freeset(struct parse *, cset *); 2260b57cec5SDimitry Andric static int freezeset(struct parse *, cset *); 2270b57cec5SDimitry Andric static int firstch(struct parse *, cset *); 2280b57cec5SDimitry Andric static int nch(struct parse *, cset *); 2290b57cec5SDimitry Andric static void mcadd(struct parse *, cset *, const char *); 2300b57cec5SDimitry Andric static void mcinvert(struct parse *, cset *); 2310b57cec5SDimitry Andric static void mccase(struct parse *, cset *); 2320b57cec5SDimitry Andric static int isinsets(struct re_guts *, int); 2330b57cec5SDimitry Andric static int samesets(struct re_guts *, int, int); 2340b57cec5SDimitry Andric static void categorize(struct parse *, struct re_guts *); 2350b57cec5SDimitry Andric static sopno dupl(struct parse *, sopno, sopno); 2360b57cec5SDimitry Andric static void doemit(struct parse *, sop, size_t); 2370b57cec5SDimitry Andric static void doinsert(struct parse *, sop, size_t, sopno); 2380b57cec5SDimitry Andric static void dofwd(struct parse *, sopno, sop); 2390b57cec5SDimitry Andric static void enlarge(struct parse *, sopno); 2400b57cec5SDimitry Andric static void stripsnug(struct parse *, struct re_guts *); 2410b57cec5SDimitry Andric static void findmust(struct parse *, struct re_guts *); 2420b57cec5SDimitry Andric static sopno pluscount(struct parse *, struct re_guts *); 2430b57cec5SDimitry Andric 2440b57cec5SDimitry Andric static char nuls[10]; /* place to point scanner in event of error */ 2450b57cec5SDimitry Andric 2460b57cec5SDimitry Andric /* 2470b57cec5SDimitry Andric * macros for use with parse structure 2480b57cec5SDimitry Andric * BEWARE: these know that the parse structure is named `p' !!! 2490b57cec5SDimitry Andric */ 2500b57cec5SDimitry Andric #define PEEK() (*p->next) 2510b57cec5SDimitry Andric #define PEEK2() (*(p->next+1)) 25281ad6265SDimitry Andric #define MORE() (p->end - p->next > 0) 25381ad6265SDimitry Andric #define MORE2() (p->end - p->next > 1) 2540b57cec5SDimitry Andric #define SEE(c) (MORE() && PEEK() == (c)) 25581ad6265SDimitry Andric #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b)) 2560b57cec5SDimitry Andric #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 2570b57cec5SDimitry Andric #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 2580b57cec5SDimitry Andric #define NEXT() (p->next++) 2590b57cec5SDimitry Andric #define NEXT2() (p->next += 2) 2600b57cec5SDimitry Andric #define NEXTn(n) (p->next += (n)) 2610b57cec5SDimitry Andric #define GETNEXT() (*p->next++) 2620b57cec5SDimitry Andric #define SETERROR(e) seterr(p, (e)) 2630b57cec5SDimitry Andric #define REQUIRE(co, e) (void)((co) || SETERROR(e)) 2640b57cec5SDimitry Andric #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 2650b57cec5SDimitry Andric #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 2660b57cec5SDimitry Andric #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 2670b57cec5SDimitry Andric #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 2680b57cec5SDimitry Andric #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 2690b57cec5SDimitry Andric #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 2700b57cec5SDimitry Andric #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 2710b57cec5SDimitry Andric #define HERE() (p->slen) 2720b57cec5SDimitry Andric #define THERE() (p->slen - 1) 2730b57cec5SDimitry Andric #define THERETHERE() (p->slen - 2) 2740b57cec5SDimitry Andric #define DROP(n) (p->slen -= (n)) 2750b57cec5SDimitry Andric 2760b57cec5SDimitry Andric #ifdef _POSIX2_RE_DUP_MAX 2770b57cec5SDimitry Andric #define DUPMAX _POSIX2_RE_DUP_MAX 2780b57cec5SDimitry Andric #else 2790b57cec5SDimitry Andric #define DUPMAX 255 2800b57cec5SDimitry Andric #endif 281*62987288SDimitry Andric #define REGINFINITY (DUPMAX + 1) 2820b57cec5SDimitry Andric 2830b57cec5SDimitry Andric #ifndef NDEBUG 2840b57cec5SDimitry Andric static int never = 0; /* for use in asserts; shuts lint up */ 2850b57cec5SDimitry Andric #else 2860b57cec5SDimitry Andric #define never 0 /* some <assert.h>s have bugs too */ 2870b57cec5SDimitry Andric #endif 2880b57cec5SDimitry Andric 2890b57cec5SDimitry Andric /* 2900b57cec5SDimitry Andric - llvm_regcomp - interface for parser and compilation 2910b57cec5SDimitry Andric */ 2920b57cec5SDimitry Andric int /* 0 success, otherwise REG_something */ 2930b57cec5SDimitry Andric llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) 2940b57cec5SDimitry Andric { 2950b57cec5SDimitry Andric struct parse pa; 2960b57cec5SDimitry Andric struct re_guts *g; 2970b57cec5SDimitry Andric struct parse *p = &pa; 2980b57cec5SDimitry Andric int i; 2990b57cec5SDimitry Andric size_t len; 3000b57cec5SDimitry Andric #ifdef REDEBUG 3010b57cec5SDimitry Andric # define GOODFLAGS(f) (f) 3020b57cec5SDimitry Andric #else 3030b57cec5SDimitry Andric # define GOODFLAGS(f) ((f)&~REG_DUMP) 3040b57cec5SDimitry Andric #endif 3050b57cec5SDimitry Andric 3060b57cec5SDimitry Andric cflags = GOODFLAGS(cflags); 3070b57cec5SDimitry Andric if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 3080b57cec5SDimitry Andric return(REG_INVARG); 3090b57cec5SDimitry Andric 3100b57cec5SDimitry Andric if (cflags®_PEND) { 3110b57cec5SDimitry Andric if (preg->re_endp < pattern) 3120b57cec5SDimitry Andric return(REG_INVARG); 3130b57cec5SDimitry Andric len = preg->re_endp - pattern; 3140b57cec5SDimitry Andric } else 3150b57cec5SDimitry Andric len = strlen((const char *)pattern); 3160b57cec5SDimitry Andric 3170b57cec5SDimitry Andric /* do the mallocs early so failure handling is easy */ 3180b57cec5SDimitry Andric g = (struct re_guts *)malloc(sizeof(struct re_guts) + 3190b57cec5SDimitry Andric (NC-1)*sizeof(cat_t)); 3200b57cec5SDimitry Andric if (g == NULL) 3210b57cec5SDimitry Andric return(REG_ESPACE); 3220b57cec5SDimitry Andric p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 3230b57cec5SDimitry Andric p->strip = (sop *)calloc(p->ssize, sizeof(sop)); 3240b57cec5SDimitry Andric p->slen = 0; 3250b57cec5SDimitry Andric if (p->strip == NULL) { 3260b57cec5SDimitry Andric free((char *)g); 3270b57cec5SDimitry Andric return(REG_ESPACE); 3280b57cec5SDimitry Andric } 3290b57cec5SDimitry Andric 3300b57cec5SDimitry Andric /* set things up */ 3310b57cec5SDimitry Andric p->g = g; 3325f757f3fSDimitry Andric p->next = pattern; 3330b57cec5SDimitry Andric p->end = p->next + len; 3340b57cec5SDimitry Andric p->error = 0; 3350b57cec5SDimitry Andric p->ncsalloc = 0; 3360b57cec5SDimitry Andric for (i = 0; i < NPAREN; i++) { 3370b57cec5SDimitry Andric p->pbegin[i] = 0; 3380b57cec5SDimitry Andric p->pend[i] = 0; 3390b57cec5SDimitry Andric } 3400b57cec5SDimitry Andric g->csetsize = NC; 3410b57cec5SDimitry Andric g->sets = NULL; 3420b57cec5SDimitry Andric g->setbits = NULL; 3430b57cec5SDimitry Andric g->ncsets = 0; 3440b57cec5SDimitry Andric g->cflags = cflags; 3450b57cec5SDimitry Andric g->iflags = 0; 3460b57cec5SDimitry Andric g->nbol = 0; 3470b57cec5SDimitry Andric g->neol = 0; 3480b57cec5SDimitry Andric g->must = NULL; 3490b57cec5SDimitry Andric g->mlen = 0; 3500b57cec5SDimitry Andric g->nsub = 0; 3510b57cec5SDimitry Andric g->ncategories = 1; /* category 0 is "everything else" */ 3520b57cec5SDimitry Andric g->categories = &g->catspace[-(CHAR_MIN)]; 3530b57cec5SDimitry Andric (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 3540b57cec5SDimitry Andric g->backrefs = 0; 3550b57cec5SDimitry Andric 3560b57cec5SDimitry Andric /* do it */ 3570b57cec5SDimitry Andric EMIT(OEND, 0); 3580b57cec5SDimitry Andric g->firststate = THERE(); 3590b57cec5SDimitry Andric if (cflags®_EXTENDED) 3600b57cec5SDimitry Andric p_ere(p, OUT); 3610b57cec5SDimitry Andric else if (cflags®_NOSPEC) 3620b57cec5SDimitry Andric p_str(p); 3630b57cec5SDimitry Andric else 3640b57cec5SDimitry Andric p_bre(p, OUT, OUT); 3650b57cec5SDimitry Andric EMIT(OEND, 0); 3660b57cec5SDimitry Andric g->laststate = THERE(); 3670b57cec5SDimitry Andric 3680b57cec5SDimitry Andric /* tidy up loose ends and fill things in */ 3690b57cec5SDimitry Andric categorize(p, g); 3700b57cec5SDimitry Andric stripsnug(p, g); 3710b57cec5SDimitry Andric findmust(p, g); 3720b57cec5SDimitry Andric g->nplus = pluscount(p, g); 3730b57cec5SDimitry Andric g->magic = MAGIC2; 3740b57cec5SDimitry Andric preg->re_nsub = g->nsub; 3750b57cec5SDimitry Andric preg->re_g = g; 3760b57cec5SDimitry Andric preg->re_magic = MAGIC1; 3770b57cec5SDimitry Andric #ifndef REDEBUG 3780b57cec5SDimitry Andric /* not debugging, so can't rely on the assert() in llvm_regexec() */ 3790b57cec5SDimitry Andric if (g->iflags®EX_BAD) 3800b57cec5SDimitry Andric SETERROR(REG_ASSERT); 3810b57cec5SDimitry Andric #endif 3820b57cec5SDimitry Andric 3830b57cec5SDimitry Andric /* win or lose, we're done */ 3840b57cec5SDimitry Andric if (p->error != 0) /* lose */ 3850b57cec5SDimitry Andric llvm_regfree(preg); 3860b57cec5SDimitry Andric return(p->error); 3870b57cec5SDimitry Andric } 3880b57cec5SDimitry Andric 3890b57cec5SDimitry Andric /* 3900b57cec5SDimitry Andric - p_ere - ERE parser top level, concatenation and alternation 3910b57cec5SDimitry Andric */ 3920b57cec5SDimitry Andric static void 3930b57cec5SDimitry Andric p_ere(struct parse *p, int stop) /* character this ERE should end at */ 3940b57cec5SDimitry Andric { 3950b57cec5SDimitry Andric char c; 3960b57cec5SDimitry Andric sopno prevback = 0; 3970b57cec5SDimitry Andric sopno prevfwd = 0; 3980b57cec5SDimitry Andric sopno conc; 3990b57cec5SDimitry Andric int first = 1; /* is this the first alternative? */ 4000b57cec5SDimitry Andric 4010b57cec5SDimitry Andric for (;;) { 4020b57cec5SDimitry Andric /* do a bunch of concatenated expressions */ 4030b57cec5SDimitry Andric conc = HERE(); 4040b57cec5SDimitry Andric while (MORE() && (c = PEEK()) != '|' && c != stop) 4050b57cec5SDimitry Andric p_ere_exp(p); 4060b57cec5SDimitry Andric REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 4070b57cec5SDimitry Andric 4080b57cec5SDimitry Andric if (!EAT('|')) 4090b57cec5SDimitry Andric break; /* NOTE BREAK OUT */ 4100b57cec5SDimitry Andric 4110b57cec5SDimitry Andric if (first) { 4120b57cec5SDimitry Andric INSERT(OCH_, conc); /* offset is wrong */ 4130b57cec5SDimitry Andric prevfwd = conc; 4140b57cec5SDimitry Andric prevback = conc; 4150b57cec5SDimitry Andric first = 0; 4160b57cec5SDimitry Andric } 4170b57cec5SDimitry Andric ASTERN(OOR1, prevback); 4180b57cec5SDimitry Andric prevback = THERE(); 4190b57cec5SDimitry Andric AHEAD(prevfwd); /* fix previous offset */ 4200b57cec5SDimitry Andric prevfwd = HERE(); 4210b57cec5SDimitry Andric EMIT(OOR2, 0); /* offset is very wrong */ 4220b57cec5SDimitry Andric } 4230b57cec5SDimitry Andric 4240b57cec5SDimitry Andric if (!first) { /* tail-end fixups */ 4250b57cec5SDimitry Andric AHEAD(prevfwd); 4260b57cec5SDimitry Andric ASTERN(O_CH, prevback); 4270b57cec5SDimitry Andric } 4280b57cec5SDimitry Andric 4290b57cec5SDimitry Andric assert(!MORE() || SEE(stop)); 4300b57cec5SDimitry Andric } 4310b57cec5SDimitry Andric 4320b57cec5SDimitry Andric /* 4330b57cec5SDimitry Andric - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 4340b57cec5SDimitry Andric */ 4350b57cec5SDimitry Andric static void 4360b57cec5SDimitry Andric p_ere_exp(struct parse *p) 4370b57cec5SDimitry Andric { 4380b57cec5SDimitry Andric char c; 4390b57cec5SDimitry Andric sopno pos; 4400b57cec5SDimitry Andric int count; 4410b57cec5SDimitry Andric int count2; 4420b57cec5SDimitry Andric int backrefnum; 4430b57cec5SDimitry Andric sopno subno; 4440b57cec5SDimitry Andric int wascaret = 0; 4450b57cec5SDimitry Andric 4460b57cec5SDimitry Andric assert(MORE()); /* caller should have ensured this */ 4470b57cec5SDimitry Andric c = GETNEXT(); 4480b57cec5SDimitry Andric 4490b57cec5SDimitry Andric pos = HERE(); 4500b57cec5SDimitry Andric switch (c) { 4510b57cec5SDimitry Andric case '(': 4520b57cec5SDimitry Andric REQUIRE(MORE(), REG_EPAREN); 4530b57cec5SDimitry Andric p->g->nsub++; 4540b57cec5SDimitry Andric subno = p->g->nsub; 4550b57cec5SDimitry Andric if (subno < NPAREN) 4560b57cec5SDimitry Andric p->pbegin[subno] = HERE(); 4570b57cec5SDimitry Andric EMIT(OLPAREN, subno); 4580b57cec5SDimitry Andric if (!SEE(')')) 4590b57cec5SDimitry Andric p_ere(p, ')'); 4600b57cec5SDimitry Andric if (subno < NPAREN) { 4610b57cec5SDimitry Andric p->pend[subno] = HERE(); 4620b57cec5SDimitry Andric assert(p->pend[subno] != 0); 4630b57cec5SDimitry Andric } 4640b57cec5SDimitry Andric EMIT(ORPAREN, subno); 4650b57cec5SDimitry Andric MUSTEAT(')', REG_EPAREN); 4660b57cec5SDimitry Andric break; 4670b57cec5SDimitry Andric #ifndef POSIX_MISTAKE 4680b57cec5SDimitry Andric case ')': /* happens only if no current unmatched ( */ 4690b57cec5SDimitry Andric /* 4700b57cec5SDimitry Andric * You may ask, why the ifndef? Because I didn't notice 4710b57cec5SDimitry Andric * this until slightly too late for 1003.2, and none of the 4720b57cec5SDimitry Andric * other 1003.2 regular-expression reviewers noticed it at 4730b57cec5SDimitry Andric * all. So an unmatched ) is legal POSIX, at least until 4740b57cec5SDimitry Andric * we can get it fixed. 4750b57cec5SDimitry Andric */ 4760b57cec5SDimitry Andric SETERROR(REG_EPAREN); 4770b57cec5SDimitry Andric break; 4780b57cec5SDimitry Andric #endif 4790b57cec5SDimitry Andric case '^': 4800b57cec5SDimitry Andric EMIT(OBOL, 0); 4810b57cec5SDimitry Andric p->g->iflags |= USEBOL; 4820b57cec5SDimitry Andric p->g->nbol++; 4830b57cec5SDimitry Andric wascaret = 1; 4840b57cec5SDimitry Andric break; 4850b57cec5SDimitry Andric case '$': 4860b57cec5SDimitry Andric EMIT(OEOL, 0); 4870b57cec5SDimitry Andric p->g->iflags |= USEEOL; 4880b57cec5SDimitry Andric p->g->neol++; 4890b57cec5SDimitry Andric break; 4900b57cec5SDimitry Andric case '|': 4910b57cec5SDimitry Andric SETERROR(REG_EMPTY); 4920b57cec5SDimitry Andric break; 4930b57cec5SDimitry Andric case '*': 4940b57cec5SDimitry Andric case '+': 4950b57cec5SDimitry Andric case '?': 4960b57cec5SDimitry Andric SETERROR(REG_BADRPT); 4970b57cec5SDimitry Andric break; 4980b57cec5SDimitry Andric case '.': 4990b57cec5SDimitry Andric if (p->g->cflags®_NEWLINE) 5000b57cec5SDimitry Andric nonnewline(p); 5010b57cec5SDimitry Andric else 5020b57cec5SDimitry Andric EMIT(OANY, 0); 5030b57cec5SDimitry Andric break; 5040b57cec5SDimitry Andric case '[': 5050b57cec5SDimitry Andric p_bracket(p); 5060b57cec5SDimitry Andric break; 5070b57cec5SDimitry Andric case '\\': 5080b57cec5SDimitry Andric REQUIRE(MORE(), REG_EESCAPE); 5090b57cec5SDimitry Andric c = GETNEXT(); 5100b57cec5SDimitry Andric if (c >= '1' && c <= '9') { 5110b57cec5SDimitry Andric /* \[0-9] is taken to be a back-reference to a previously specified 5120b57cec5SDimitry Andric * matching group. backrefnum will hold the number. The matching 5130b57cec5SDimitry Andric * group must exist (i.e. if \4 is found there must have been at 5140b57cec5SDimitry Andric * least 4 matching groups specified in the pattern previously). 5150b57cec5SDimitry Andric */ 5160b57cec5SDimitry Andric backrefnum = c - '0'; 5170b57cec5SDimitry Andric if (p->pend[backrefnum] == 0) { 5180b57cec5SDimitry Andric SETERROR(REG_ESUBREG); 5190b57cec5SDimitry Andric break; 5200b57cec5SDimitry Andric } 5210b57cec5SDimitry Andric 5220b57cec5SDimitry Andric /* Make sure everything checks out and emit the sequence 5230b57cec5SDimitry Andric * that marks a back-reference to the parse structure. 5240b57cec5SDimitry Andric */ 5250b57cec5SDimitry Andric assert(backrefnum <= p->g->nsub); 5260b57cec5SDimitry Andric EMIT(OBACK_, backrefnum); 5270b57cec5SDimitry Andric assert(p->pbegin[backrefnum] != 0); 528bdd1243dSDimitry Andric assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN); 529bdd1243dSDimitry Andric assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN); 5300b57cec5SDimitry Andric (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]); 5310b57cec5SDimitry Andric EMIT(O_BACK, backrefnum); 5320b57cec5SDimitry Andric p->g->backrefs = 1; 5330b57cec5SDimitry Andric } else { 5340b57cec5SDimitry Andric /* Other chars are simply themselves when escaped with a backslash. 5350b57cec5SDimitry Andric */ 5360b57cec5SDimitry Andric ordinary(p, c); 5370b57cec5SDimitry Andric } 5380b57cec5SDimitry Andric break; 5390b57cec5SDimitry Andric case '{': /* okay as ordinary except if digit follows */ 5400b57cec5SDimitry Andric REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 5418bcb0991SDimitry Andric LLVM_FALLTHROUGH; 5420b57cec5SDimitry Andric default: 5430b57cec5SDimitry Andric ordinary(p, c); 5440b57cec5SDimitry Andric break; 5450b57cec5SDimitry Andric } 5460b57cec5SDimitry Andric 5470b57cec5SDimitry Andric if (!MORE()) 5480b57cec5SDimitry Andric return; 5490b57cec5SDimitry Andric c = PEEK(); 5500b57cec5SDimitry Andric /* we call { a repetition if followed by a digit */ 5510b57cec5SDimitry Andric if (!( c == '*' || c == '+' || c == '?' || 5520b57cec5SDimitry Andric (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 5530b57cec5SDimitry Andric return; /* no repetition, we're done */ 5540b57cec5SDimitry Andric NEXT(); 5550b57cec5SDimitry Andric 5560b57cec5SDimitry Andric REQUIRE(!wascaret, REG_BADRPT); 5570b57cec5SDimitry Andric switch (c) { 5580b57cec5SDimitry Andric case '*': /* implemented as +? */ 5590b57cec5SDimitry Andric /* this case does not require the (y|) trick, noKLUDGE */ 5600b57cec5SDimitry Andric INSERT(OPLUS_, pos); 5610b57cec5SDimitry Andric ASTERN(O_PLUS, pos); 5620b57cec5SDimitry Andric INSERT(OQUEST_, pos); 5630b57cec5SDimitry Andric ASTERN(O_QUEST, pos); 5640b57cec5SDimitry Andric break; 5650b57cec5SDimitry Andric case '+': 5660b57cec5SDimitry Andric INSERT(OPLUS_, pos); 5670b57cec5SDimitry Andric ASTERN(O_PLUS, pos); 5680b57cec5SDimitry Andric break; 5690b57cec5SDimitry Andric case '?': 5700b57cec5SDimitry Andric /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 5710b57cec5SDimitry Andric INSERT(OCH_, pos); /* offset slightly wrong */ 5720b57cec5SDimitry Andric ASTERN(OOR1, pos); /* this one's right */ 5730b57cec5SDimitry Andric AHEAD(pos); /* fix the OCH_ */ 5740b57cec5SDimitry Andric EMIT(OOR2, 0); /* offset very wrong... */ 5750b57cec5SDimitry Andric AHEAD(THERE()); /* ...so fix it */ 5760b57cec5SDimitry Andric ASTERN(O_CH, THERETHERE()); 5770b57cec5SDimitry Andric break; 5780b57cec5SDimitry Andric case '{': 5790b57cec5SDimitry Andric count = p_count(p); 5800b57cec5SDimitry Andric if (EAT(',')) { 5810b57cec5SDimitry Andric if (isdigit((uch)PEEK())) { 5820b57cec5SDimitry Andric count2 = p_count(p); 5830b57cec5SDimitry Andric REQUIRE(count <= count2, REG_BADBR); 5840b57cec5SDimitry Andric } else /* single number with comma */ 585*62987288SDimitry Andric count2 = REGINFINITY; 5860b57cec5SDimitry Andric } else /* just a single number */ 5870b57cec5SDimitry Andric count2 = count; 5880b57cec5SDimitry Andric repeat(p, pos, count, count2); 5890b57cec5SDimitry Andric if (!EAT('}')) { /* error heuristics */ 5900b57cec5SDimitry Andric while (MORE() && PEEK() != '}') 5910b57cec5SDimitry Andric NEXT(); 5920b57cec5SDimitry Andric REQUIRE(MORE(), REG_EBRACE); 5930b57cec5SDimitry Andric SETERROR(REG_BADBR); 5940b57cec5SDimitry Andric } 5950b57cec5SDimitry Andric break; 5960b57cec5SDimitry Andric } 5970b57cec5SDimitry Andric 5980b57cec5SDimitry Andric if (!MORE()) 5990b57cec5SDimitry Andric return; 6000b57cec5SDimitry Andric c = PEEK(); 6010b57cec5SDimitry Andric if (!( c == '*' || c == '+' || c == '?' || 6020b57cec5SDimitry Andric (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 6030b57cec5SDimitry Andric return; 6040b57cec5SDimitry Andric SETERROR(REG_BADRPT); 6050b57cec5SDimitry Andric } 6060b57cec5SDimitry Andric 6070b57cec5SDimitry Andric /* 6080b57cec5SDimitry Andric - p_str - string (no metacharacters) "parser" 6090b57cec5SDimitry Andric */ 6100b57cec5SDimitry Andric static void 6110b57cec5SDimitry Andric p_str(struct parse *p) 6120b57cec5SDimitry Andric { 6130b57cec5SDimitry Andric REQUIRE(MORE(), REG_EMPTY); 6140b57cec5SDimitry Andric while (MORE()) 6150b57cec5SDimitry Andric ordinary(p, GETNEXT()); 6160b57cec5SDimitry Andric } 6170b57cec5SDimitry Andric 6180b57cec5SDimitry Andric /* 6190b57cec5SDimitry Andric - p_bre - BRE parser top level, anchoring and concatenation 6200b57cec5SDimitry Andric * Giving end1 as OUT essentially eliminates the end1/end2 check. 6210b57cec5SDimitry Andric * 6220b57cec5SDimitry Andric * This implementation is a bit of a kludge, in that a trailing $ is first 6230b57cec5SDimitry Andric * taken as an ordinary character and then revised to be an anchor. The 6240b57cec5SDimitry Andric * only undesirable side effect is that '$' gets included as a character 6250b57cec5SDimitry Andric * category in such cases. This is fairly harmless; not worth fixing. 6260b57cec5SDimitry Andric * The amount of lookahead needed to avoid this kludge is excessive. 6270b57cec5SDimitry Andric */ 6280b57cec5SDimitry Andric static void 6290b57cec5SDimitry Andric p_bre(struct parse *p, 6300b57cec5SDimitry Andric int end1, /* first terminating character */ 6310b57cec5SDimitry Andric int end2) /* second terminating character */ 6320b57cec5SDimitry Andric { 6330b57cec5SDimitry Andric sopno start = HERE(); 6340b57cec5SDimitry Andric int first = 1; /* first subexpression? */ 6350b57cec5SDimitry Andric int wasdollar = 0; 6360b57cec5SDimitry Andric 6370b57cec5SDimitry Andric if (EAT('^')) { 6380b57cec5SDimitry Andric EMIT(OBOL, 0); 6390b57cec5SDimitry Andric p->g->iflags |= USEBOL; 6400b57cec5SDimitry Andric p->g->nbol++; 6410b57cec5SDimitry Andric } 6420b57cec5SDimitry Andric while (MORE() && !SEETWO(end1, end2)) { 6430b57cec5SDimitry Andric wasdollar = p_simp_re(p, first); 6440b57cec5SDimitry Andric first = 0; 6450b57cec5SDimitry Andric } 6460b57cec5SDimitry Andric if (wasdollar) { /* oops, that was a trailing anchor */ 6470b57cec5SDimitry Andric DROP(1); 6480b57cec5SDimitry Andric EMIT(OEOL, 0); 6490b57cec5SDimitry Andric p->g->iflags |= USEEOL; 6500b57cec5SDimitry Andric p->g->neol++; 6510b57cec5SDimitry Andric } 6520b57cec5SDimitry Andric 6530b57cec5SDimitry Andric REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 6540b57cec5SDimitry Andric } 6550b57cec5SDimitry Andric 6560b57cec5SDimitry Andric /* 6570b57cec5SDimitry Andric - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 6580b57cec5SDimitry Andric */ 6590b57cec5SDimitry Andric static int /* was the simple RE an unbackslashed $? */ 6600b57cec5SDimitry Andric p_simp_re(struct parse *p, 6610b57cec5SDimitry Andric int starordinary) /* is a leading * an ordinary character? */ 6620b57cec5SDimitry Andric { 6630b57cec5SDimitry Andric int c; 6640b57cec5SDimitry Andric int count; 6650b57cec5SDimitry Andric int count2; 6660b57cec5SDimitry Andric sopno pos; 6670b57cec5SDimitry Andric int i; 6680b57cec5SDimitry Andric sopno subno; 6690b57cec5SDimitry Andric # define BACKSL (1<<CHAR_BIT) 6700b57cec5SDimitry Andric 6710b57cec5SDimitry Andric pos = HERE(); /* repetition op, if any, covers from here */ 6720b57cec5SDimitry Andric 6730b57cec5SDimitry Andric assert(MORE()); /* caller should have ensured this */ 6740b57cec5SDimitry Andric c = GETNEXT(); 6750b57cec5SDimitry Andric if (c == '\\') { 6760b57cec5SDimitry Andric REQUIRE(MORE(), REG_EESCAPE); 6770b57cec5SDimitry Andric c = BACKSL | GETNEXT(); 6780b57cec5SDimitry Andric } 6790b57cec5SDimitry Andric switch (c) { 6800b57cec5SDimitry Andric case '.': 6810b57cec5SDimitry Andric if (p->g->cflags®_NEWLINE) 6820b57cec5SDimitry Andric nonnewline(p); 6830b57cec5SDimitry Andric else 6840b57cec5SDimitry Andric EMIT(OANY, 0); 6850b57cec5SDimitry Andric break; 6860b57cec5SDimitry Andric case '[': 6870b57cec5SDimitry Andric p_bracket(p); 6880b57cec5SDimitry Andric break; 6890b57cec5SDimitry Andric case BACKSL|'{': 6900b57cec5SDimitry Andric SETERROR(REG_BADRPT); 6910b57cec5SDimitry Andric break; 6920b57cec5SDimitry Andric case BACKSL|'(': 6930b57cec5SDimitry Andric p->g->nsub++; 6940b57cec5SDimitry Andric subno = p->g->nsub; 6950b57cec5SDimitry Andric if (subno < NPAREN) 6960b57cec5SDimitry Andric p->pbegin[subno] = HERE(); 6970b57cec5SDimitry Andric EMIT(OLPAREN, subno); 6980b57cec5SDimitry Andric /* the MORE here is an error heuristic */ 6990b57cec5SDimitry Andric if (MORE() && !SEETWO('\\', ')')) 7000b57cec5SDimitry Andric p_bre(p, '\\', ')'); 7010b57cec5SDimitry Andric if (subno < NPAREN) { 7020b57cec5SDimitry Andric p->pend[subno] = HERE(); 7030b57cec5SDimitry Andric assert(p->pend[subno] != 0); 7040b57cec5SDimitry Andric } 7050b57cec5SDimitry Andric EMIT(ORPAREN, subno); 7060b57cec5SDimitry Andric REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 7070b57cec5SDimitry Andric break; 7080b57cec5SDimitry Andric case BACKSL|')': /* should not get here -- must be user */ 7090b57cec5SDimitry Andric case BACKSL|'}': 7100b57cec5SDimitry Andric SETERROR(REG_EPAREN); 7110b57cec5SDimitry Andric break; 7120b57cec5SDimitry Andric case BACKSL|'1': 7130b57cec5SDimitry Andric case BACKSL|'2': 7140b57cec5SDimitry Andric case BACKSL|'3': 7150b57cec5SDimitry Andric case BACKSL|'4': 7160b57cec5SDimitry Andric case BACKSL|'5': 7170b57cec5SDimitry Andric case BACKSL|'6': 7180b57cec5SDimitry Andric case BACKSL|'7': 7190b57cec5SDimitry Andric case BACKSL|'8': 7200b57cec5SDimitry Andric case BACKSL|'9': 7210b57cec5SDimitry Andric i = (c&~BACKSL) - '0'; 7220b57cec5SDimitry Andric assert(i < NPAREN); 7230b57cec5SDimitry Andric if (p->pend[i] != 0) { 7240b57cec5SDimitry Andric assert(i <= p->g->nsub); 7250b57cec5SDimitry Andric EMIT(OBACK_, i); 7260b57cec5SDimitry Andric assert(p->pbegin[i] != 0); 7270b57cec5SDimitry Andric assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 7280b57cec5SDimitry Andric assert(OP(p->strip[p->pend[i]]) == ORPAREN); 7290b57cec5SDimitry Andric (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 7300b57cec5SDimitry Andric EMIT(O_BACK, i); 7310b57cec5SDimitry Andric } else 7320b57cec5SDimitry Andric SETERROR(REG_ESUBREG); 7330b57cec5SDimitry Andric p->g->backrefs = 1; 7340b57cec5SDimitry Andric break; 7350b57cec5SDimitry Andric case '*': 7360b57cec5SDimitry Andric REQUIRE(starordinary, REG_BADRPT); 7378bcb0991SDimitry Andric LLVM_FALLTHROUGH; 7380b57cec5SDimitry Andric default: 7390b57cec5SDimitry Andric ordinary(p, (char)c); 7400b57cec5SDimitry Andric break; 7410b57cec5SDimitry Andric } 7420b57cec5SDimitry Andric 7430b57cec5SDimitry Andric if (EAT('*')) { /* implemented as +? */ 7440b57cec5SDimitry Andric /* this case does not require the (y|) trick, noKLUDGE */ 7450b57cec5SDimitry Andric INSERT(OPLUS_, pos); 7460b57cec5SDimitry Andric ASTERN(O_PLUS, pos); 7470b57cec5SDimitry Andric INSERT(OQUEST_, pos); 7480b57cec5SDimitry Andric ASTERN(O_QUEST, pos); 7490b57cec5SDimitry Andric } else if (EATTWO('\\', '{')) { 7500b57cec5SDimitry Andric count = p_count(p); 7510b57cec5SDimitry Andric if (EAT(',')) { 7520b57cec5SDimitry Andric if (MORE() && isdigit((uch)PEEK())) { 7530b57cec5SDimitry Andric count2 = p_count(p); 7540b57cec5SDimitry Andric REQUIRE(count <= count2, REG_BADBR); 7550b57cec5SDimitry Andric } else /* single number with comma */ 756*62987288SDimitry Andric count2 = REGINFINITY; 7570b57cec5SDimitry Andric } else /* just a single number */ 7580b57cec5SDimitry Andric count2 = count; 7590b57cec5SDimitry Andric repeat(p, pos, count, count2); 7600b57cec5SDimitry Andric if (!EATTWO('\\', '}')) { /* error heuristics */ 7610b57cec5SDimitry Andric while (MORE() && !SEETWO('\\', '}')) 7620b57cec5SDimitry Andric NEXT(); 7630b57cec5SDimitry Andric REQUIRE(MORE(), REG_EBRACE); 7640b57cec5SDimitry Andric SETERROR(REG_BADBR); 7650b57cec5SDimitry Andric } 7660b57cec5SDimitry Andric } else if (c == '$') /* $ (but not \$) ends it */ 7670b57cec5SDimitry Andric return(1); 7680b57cec5SDimitry Andric 7690b57cec5SDimitry Andric return(0); 7700b57cec5SDimitry Andric } 7710b57cec5SDimitry Andric 7720b57cec5SDimitry Andric /* 7730b57cec5SDimitry Andric - p_count - parse a repetition count 7740b57cec5SDimitry Andric */ 7750b57cec5SDimitry Andric static int /* the value */ 7760b57cec5SDimitry Andric p_count(struct parse *p) 7770b57cec5SDimitry Andric { 7780b57cec5SDimitry Andric int count = 0; 7790b57cec5SDimitry Andric int ndigits = 0; 7800b57cec5SDimitry Andric 7810b57cec5SDimitry Andric while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 7820b57cec5SDimitry Andric count = count*10 + (GETNEXT() - '0'); 7830b57cec5SDimitry Andric ndigits++; 7840b57cec5SDimitry Andric } 7850b57cec5SDimitry Andric 7860b57cec5SDimitry Andric REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 7870b57cec5SDimitry Andric return(count); 7880b57cec5SDimitry Andric } 7890b57cec5SDimitry Andric 7900b57cec5SDimitry Andric /* 7910b57cec5SDimitry Andric - p_bracket - parse a bracketed character list 7920b57cec5SDimitry Andric * 7930b57cec5SDimitry Andric * Note a significant property of this code: if the allocset() did SETERROR, 7940b57cec5SDimitry Andric * no set operations are done. 7950b57cec5SDimitry Andric */ 7960b57cec5SDimitry Andric static void 7970b57cec5SDimitry Andric p_bracket(struct parse *p) 7980b57cec5SDimitry Andric { 7990b57cec5SDimitry Andric cset *cs; 8000b57cec5SDimitry Andric int invert = 0; 8010b57cec5SDimitry Andric 8020b57cec5SDimitry Andric /* Dept of Truly Sickening Special-Case Kludges */ 80381ad6265SDimitry Andric if (p->end - p->next > 5) { 80481ad6265SDimitry Andric if (strncmp(p->next, "[:<:]]", 6) == 0) { 8050b57cec5SDimitry Andric EMIT(OBOW, 0); 8060b57cec5SDimitry Andric NEXTn(6); 8070b57cec5SDimitry Andric return; 8080b57cec5SDimitry Andric } 80981ad6265SDimitry Andric if (strncmp(p->next, "[:>:]]", 6) == 0) { 8100b57cec5SDimitry Andric EMIT(OEOW, 0); 8110b57cec5SDimitry Andric NEXTn(6); 8120b57cec5SDimitry Andric return; 8130b57cec5SDimitry Andric } 81481ad6265SDimitry Andric } 8150b57cec5SDimitry Andric 8160b57cec5SDimitry Andric if ((cs = allocset(p)) == NULL) { 8170b57cec5SDimitry Andric /* allocset did set error status in p */ 8180b57cec5SDimitry Andric return; 8190b57cec5SDimitry Andric } 8200b57cec5SDimitry Andric 8210b57cec5SDimitry Andric if (EAT('^')) 8220b57cec5SDimitry Andric invert++; /* make note to invert set at end */ 8230b57cec5SDimitry Andric if (EAT(']')) 8240b57cec5SDimitry Andric CHadd(cs, ']'); 8250b57cec5SDimitry Andric else if (EAT('-')) 8260b57cec5SDimitry Andric CHadd(cs, '-'); 8270b57cec5SDimitry Andric while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 8280b57cec5SDimitry Andric p_b_term(p, cs); 8290b57cec5SDimitry Andric if (EAT('-')) 8300b57cec5SDimitry Andric CHadd(cs, '-'); 8310b57cec5SDimitry Andric MUSTEAT(']', REG_EBRACK); 8320b57cec5SDimitry Andric 8330b57cec5SDimitry Andric if (p->error != 0) { /* don't mess things up further */ 8340b57cec5SDimitry Andric freeset(p, cs); 8350b57cec5SDimitry Andric return; 8360b57cec5SDimitry Andric } 8370b57cec5SDimitry Andric 8380b57cec5SDimitry Andric if (p->g->cflags®_ICASE) { 8390b57cec5SDimitry Andric int i; 8400b57cec5SDimitry Andric int ci; 8410b57cec5SDimitry Andric 8420b57cec5SDimitry Andric for (i = p->g->csetsize - 1; i >= 0; i--) 8430b57cec5SDimitry Andric if (CHIN(cs, i) && isalpha(i)) { 8440b57cec5SDimitry Andric ci = othercase(i); 8450b57cec5SDimitry Andric if (ci != i) 8460b57cec5SDimitry Andric CHadd(cs, ci); 8470b57cec5SDimitry Andric } 8480b57cec5SDimitry Andric if (cs->multis != NULL) 8490b57cec5SDimitry Andric mccase(p, cs); 8500b57cec5SDimitry Andric } 8510b57cec5SDimitry Andric if (invert) { 8520b57cec5SDimitry Andric int i; 8530b57cec5SDimitry Andric 8540b57cec5SDimitry Andric for (i = p->g->csetsize - 1; i >= 0; i--) 8550b57cec5SDimitry Andric if (CHIN(cs, i)) 8560b57cec5SDimitry Andric CHsub(cs, i); 8570b57cec5SDimitry Andric else 8580b57cec5SDimitry Andric CHadd(cs, i); 8590b57cec5SDimitry Andric if (p->g->cflags®_NEWLINE) 8600b57cec5SDimitry Andric CHsub(cs, '\n'); 8610b57cec5SDimitry Andric if (cs->multis != NULL) 8620b57cec5SDimitry Andric mcinvert(p, cs); 8630b57cec5SDimitry Andric } 8640b57cec5SDimitry Andric 8650b57cec5SDimitry Andric assert(cs->multis == NULL); /* xxx */ 8660b57cec5SDimitry Andric 8670b57cec5SDimitry Andric if (nch(p, cs) == 1) { /* optimize singleton sets */ 8680b57cec5SDimitry Andric ordinary(p, firstch(p, cs)); 8690b57cec5SDimitry Andric freeset(p, cs); 8700b57cec5SDimitry Andric } else 8710b57cec5SDimitry Andric EMIT(OANYOF, freezeset(p, cs)); 8720b57cec5SDimitry Andric } 8730b57cec5SDimitry Andric 8740b57cec5SDimitry Andric /* 8750b57cec5SDimitry Andric - p_b_term - parse one term of a bracketed character list 8760b57cec5SDimitry Andric */ 8770b57cec5SDimitry Andric static void 8780b57cec5SDimitry Andric p_b_term(struct parse *p, cset *cs) 8790b57cec5SDimitry Andric { 8800b57cec5SDimitry Andric char c; 8810b57cec5SDimitry Andric char start, finish; 8820b57cec5SDimitry Andric int i; 8830b57cec5SDimitry Andric 8840b57cec5SDimitry Andric /* classify what we've got */ 8850b57cec5SDimitry Andric switch ((MORE()) ? PEEK() : '\0') { 8860b57cec5SDimitry Andric case '[': 8870b57cec5SDimitry Andric c = (MORE2()) ? PEEK2() : '\0'; 8880b57cec5SDimitry Andric break; 8890b57cec5SDimitry Andric case '-': 8900b57cec5SDimitry Andric SETERROR(REG_ERANGE); 8910b57cec5SDimitry Andric return; /* NOTE RETURN */ 8920b57cec5SDimitry Andric break; 8930b57cec5SDimitry Andric default: 8940b57cec5SDimitry Andric c = '\0'; 8950b57cec5SDimitry Andric break; 8960b57cec5SDimitry Andric } 8970b57cec5SDimitry Andric 8980b57cec5SDimitry Andric switch (c) { 8990b57cec5SDimitry Andric case ':': /* character class */ 9000b57cec5SDimitry Andric NEXT2(); 9010b57cec5SDimitry Andric REQUIRE(MORE(), REG_EBRACK); 9020b57cec5SDimitry Andric c = PEEK(); 9030b57cec5SDimitry Andric REQUIRE(c != '-' && c != ']', REG_ECTYPE); 9040b57cec5SDimitry Andric p_b_cclass(p, cs); 9050b57cec5SDimitry Andric REQUIRE(MORE(), REG_EBRACK); 9060b57cec5SDimitry Andric REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 9070b57cec5SDimitry Andric break; 9080b57cec5SDimitry Andric case '=': /* equivalence class */ 9090b57cec5SDimitry Andric NEXT2(); 9100b57cec5SDimitry Andric REQUIRE(MORE(), REG_EBRACK); 9110b57cec5SDimitry Andric c = PEEK(); 9120b57cec5SDimitry Andric REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 9130b57cec5SDimitry Andric p_b_eclass(p, cs); 9140b57cec5SDimitry Andric REQUIRE(MORE(), REG_EBRACK); 9150b57cec5SDimitry Andric REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 9160b57cec5SDimitry Andric break; 9170b57cec5SDimitry Andric default: /* symbol, ordinary character, or range */ 9180b57cec5SDimitry Andric /* xxx revision needed for multichar stuff */ 9190b57cec5SDimitry Andric start = p_b_symbol(p); 9200b57cec5SDimitry Andric if (SEE('-') && MORE2() && PEEK2() != ']') { 9210b57cec5SDimitry Andric /* range */ 9220b57cec5SDimitry Andric NEXT(); 9230b57cec5SDimitry Andric if (EAT('-')) 9240b57cec5SDimitry Andric finish = '-'; 9250b57cec5SDimitry Andric else 9260b57cec5SDimitry Andric finish = p_b_symbol(p); 9270b57cec5SDimitry Andric } else 9280b57cec5SDimitry Andric finish = start; 9290b57cec5SDimitry Andric /* xxx what about signed chars here... */ 9300b57cec5SDimitry Andric REQUIRE(start <= finish, REG_ERANGE); 9310b57cec5SDimitry Andric for (i = start; i <= finish; i++) 9320b57cec5SDimitry Andric CHadd(cs, i); 9330b57cec5SDimitry Andric break; 9340b57cec5SDimitry Andric } 9350b57cec5SDimitry Andric } 9360b57cec5SDimitry Andric 9370b57cec5SDimitry Andric /* 9380b57cec5SDimitry Andric - p_b_cclass - parse a character-class name and deal with it 9390b57cec5SDimitry Andric */ 9400b57cec5SDimitry Andric static void 9410b57cec5SDimitry Andric p_b_cclass(struct parse *p, cset *cs) 9420b57cec5SDimitry Andric { 9435f757f3fSDimitry Andric const char *sp = p->next; 9440b57cec5SDimitry Andric struct cclass *cp; 9450b57cec5SDimitry Andric size_t len; 9460b57cec5SDimitry Andric const char *u; 9470b57cec5SDimitry Andric char c; 9480b57cec5SDimitry Andric 9490b57cec5SDimitry Andric while (MORE() && isalpha((uch)PEEK())) 9500b57cec5SDimitry Andric NEXT(); 9510b57cec5SDimitry Andric len = p->next - sp; 9520b57cec5SDimitry Andric for (cp = cclasses; cp->name != NULL; cp++) 9530b57cec5SDimitry Andric if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 9540b57cec5SDimitry Andric break; 9550b57cec5SDimitry Andric if (cp->name == NULL) { 9560b57cec5SDimitry Andric /* oops, didn't find it */ 9570b57cec5SDimitry Andric SETERROR(REG_ECTYPE); 9580b57cec5SDimitry Andric return; 9590b57cec5SDimitry Andric } 9600b57cec5SDimitry Andric 9610b57cec5SDimitry Andric u = cp->chars; 9620b57cec5SDimitry Andric while ((c = *u++) != '\0') 9630b57cec5SDimitry Andric CHadd(cs, c); 9640b57cec5SDimitry Andric for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 9650b57cec5SDimitry Andric MCadd(p, cs, u); 9660b57cec5SDimitry Andric } 9670b57cec5SDimitry Andric 9680b57cec5SDimitry Andric /* 9690b57cec5SDimitry Andric - p_b_eclass - parse an equivalence-class name and deal with it 9700b57cec5SDimitry Andric * 9710b57cec5SDimitry Andric * This implementation is incomplete. xxx 9720b57cec5SDimitry Andric */ 9730b57cec5SDimitry Andric static void 9740b57cec5SDimitry Andric p_b_eclass(struct parse *p, cset *cs) 9750b57cec5SDimitry Andric { 9760b57cec5SDimitry Andric char c; 9770b57cec5SDimitry Andric 9780b57cec5SDimitry Andric c = p_b_coll_elem(p, '='); 9790b57cec5SDimitry Andric CHadd(cs, c); 9800b57cec5SDimitry Andric } 9810b57cec5SDimitry Andric 9820b57cec5SDimitry Andric /* 9830b57cec5SDimitry Andric - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 9840b57cec5SDimitry Andric */ 9850b57cec5SDimitry Andric static char /* value of symbol */ 9860b57cec5SDimitry Andric p_b_symbol(struct parse *p) 9870b57cec5SDimitry Andric { 9880b57cec5SDimitry Andric char value; 9890b57cec5SDimitry Andric 9900b57cec5SDimitry Andric REQUIRE(MORE(), REG_EBRACK); 9910b57cec5SDimitry Andric if (!EATTWO('[', '.')) 9920b57cec5SDimitry Andric return(GETNEXT()); 9930b57cec5SDimitry Andric 9940b57cec5SDimitry Andric /* collating symbol */ 9950b57cec5SDimitry Andric value = p_b_coll_elem(p, '.'); 9960b57cec5SDimitry Andric REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 9970b57cec5SDimitry Andric return(value); 9980b57cec5SDimitry Andric } 9990b57cec5SDimitry Andric 10000b57cec5SDimitry Andric /* 10010b57cec5SDimitry Andric - p_b_coll_elem - parse a collating-element name and look it up 10020b57cec5SDimitry Andric */ 10030b57cec5SDimitry Andric static char /* value of collating element */ 10040b57cec5SDimitry Andric p_b_coll_elem(struct parse *p, 10050b57cec5SDimitry Andric int endc) /* name ended by endc,']' */ 10060b57cec5SDimitry Andric { 10075f757f3fSDimitry Andric const char *sp = p->next; 10080b57cec5SDimitry Andric struct cname *cp; 10090b57cec5SDimitry Andric size_t len; 10100b57cec5SDimitry Andric 10110b57cec5SDimitry Andric while (MORE() && !SEETWO(endc, ']')) 10120b57cec5SDimitry Andric NEXT(); 10130b57cec5SDimitry Andric if (!MORE()) { 10140b57cec5SDimitry Andric SETERROR(REG_EBRACK); 10150b57cec5SDimitry Andric return(0); 10160b57cec5SDimitry Andric } 10170b57cec5SDimitry Andric len = p->next - sp; 10180b57cec5SDimitry Andric for (cp = cnames; cp->name != NULL; cp++) 10190b57cec5SDimitry Andric if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 10200b57cec5SDimitry Andric return(cp->code); /* known name */ 10210b57cec5SDimitry Andric if (len == 1) 10220b57cec5SDimitry Andric return(*sp); /* single character */ 10230b57cec5SDimitry Andric SETERROR(REG_ECOLLATE); /* neither */ 10240b57cec5SDimitry Andric return(0); 10250b57cec5SDimitry Andric } 10260b57cec5SDimitry Andric 10270b57cec5SDimitry Andric /* 10280b57cec5SDimitry Andric - othercase - return the case counterpart of an alphabetic 10290b57cec5SDimitry Andric */ 10300b57cec5SDimitry Andric static char /* if no counterpart, return ch */ 10310b57cec5SDimitry Andric othercase(int ch) 10320b57cec5SDimitry Andric { 10330b57cec5SDimitry Andric ch = (uch)ch; 10340b57cec5SDimitry Andric assert(isalpha(ch)); 10350b57cec5SDimitry Andric if (isupper(ch)) 10360b57cec5SDimitry Andric return ((uch)tolower(ch)); 10370b57cec5SDimitry Andric else if (islower(ch)) 10380b57cec5SDimitry Andric return ((uch)toupper(ch)); 10390b57cec5SDimitry Andric else /* peculiar, but could happen */ 10400b57cec5SDimitry Andric return(ch); 10410b57cec5SDimitry Andric } 10420b57cec5SDimitry Andric 10430b57cec5SDimitry Andric /* 10440b57cec5SDimitry Andric - bothcases - emit a dualcase version of a two-case character 10450b57cec5SDimitry Andric * 10460b57cec5SDimitry Andric * Boy, is this implementation ever a kludge... 10470b57cec5SDimitry Andric */ 10480b57cec5SDimitry Andric static void 10490b57cec5SDimitry Andric bothcases(struct parse *p, int ch) 10500b57cec5SDimitry Andric { 10515f757f3fSDimitry Andric const char *oldnext = p->next; 10525f757f3fSDimitry Andric const char *oldend = p->end; 10530b57cec5SDimitry Andric char bracket[3]; 10540b57cec5SDimitry Andric 10550b57cec5SDimitry Andric ch = (uch)ch; 10560b57cec5SDimitry Andric assert(othercase(ch) != ch); /* p_bracket() would recurse */ 10570b57cec5SDimitry Andric p->next = bracket; 10580b57cec5SDimitry Andric p->end = bracket+2; 10590b57cec5SDimitry Andric bracket[0] = ch; 10600b57cec5SDimitry Andric bracket[1] = ']'; 10610b57cec5SDimitry Andric bracket[2] = '\0'; 10620b57cec5SDimitry Andric p_bracket(p); 10630b57cec5SDimitry Andric assert(p->next == bracket+2); 10640b57cec5SDimitry Andric p->next = oldnext; 10650b57cec5SDimitry Andric p->end = oldend; 10660b57cec5SDimitry Andric } 10670b57cec5SDimitry Andric 10680b57cec5SDimitry Andric /* 10690b57cec5SDimitry Andric - ordinary - emit an ordinary character 10700b57cec5SDimitry Andric */ 10710b57cec5SDimitry Andric static void 10720b57cec5SDimitry Andric ordinary(struct parse *p, int ch) 10730b57cec5SDimitry Andric { 10740b57cec5SDimitry Andric cat_t *cap = p->g->categories; 10750b57cec5SDimitry Andric 10760b57cec5SDimitry Andric if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 10770b57cec5SDimitry Andric bothcases(p, ch); 10780b57cec5SDimitry Andric else { 10790b57cec5SDimitry Andric EMIT(OCHAR, (uch)ch); 10800b57cec5SDimitry Andric if (cap[ch] == 0) 10810b57cec5SDimitry Andric cap[ch] = p->g->ncategories++; 10820b57cec5SDimitry Andric } 10830b57cec5SDimitry Andric } 10840b57cec5SDimitry Andric 10850b57cec5SDimitry Andric /* 10860b57cec5SDimitry Andric - nonnewline - emit REG_NEWLINE version of OANY 10870b57cec5SDimitry Andric * 10880b57cec5SDimitry Andric * Boy, is this implementation ever a kludge... 10890b57cec5SDimitry Andric */ 10900b57cec5SDimitry Andric static void 10910b57cec5SDimitry Andric nonnewline(struct parse *p) 10920b57cec5SDimitry Andric { 10935f757f3fSDimitry Andric const char *oldnext = p->next; 10945f757f3fSDimitry Andric const char *oldend = p->end; 10955f757f3fSDimitry Andric static const char bracket[4] = {'^', '\n', ']', '\0'}; 10960b57cec5SDimitry Andric 10970b57cec5SDimitry Andric p->next = bracket; 10980b57cec5SDimitry Andric p->end = bracket+3; 10990b57cec5SDimitry Andric p_bracket(p); 11000b57cec5SDimitry Andric assert(p->next == bracket+3); 11010b57cec5SDimitry Andric p->next = oldnext; 11020b57cec5SDimitry Andric p->end = oldend; 11030b57cec5SDimitry Andric } 11040b57cec5SDimitry Andric 11050b57cec5SDimitry Andric /* 11060b57cec5SDimitry Andric - repeat - generate code for a bounded repetition, recursively if needed 11070b57cec5SDimitry Andric */ 11080b57cec5SDimitry Andric static void 11090b57cec5SDimitry Andric repeat(struct parse *p, 11100b57cec5SDimitry Andric sopno start, /* operand from here to end of strip */ 11110b57cec5SDimitry Andric int from, /* repeated from this number */ 11120b57cec5SDimitry Andric int to) /* to this number of times (maybe INFINITY) */ 11130b57cec5SDimitry Andric { 11140b57cec5SDimitry Andric sopno finish = HERE(); 11150b57cec5SDimitry Andric # define N 2 11160b57cec5SDimitry Andric # define INF 3 11170b57cec5SDimitry Andric # define REP(f, t) ((f)*8 + (t)) 1118*62987288SDimitry Andric # define MAP(n) (((n) <= 1) ? (n) : ((n) == REGINFINITY) ? INF : N) 11190b57cec5SDimitry Andric sopno copy; 11200b57cec5SDimitry Andric 11210b57cec5SDimitry Andric if (p->error != 0) /* head off possible runaway recursion */ 11220b57cec5SDimitry Andric return; 11230b57cec5SDimitry Andric 11240b57cec5SDimitry Andric assert(from <= to); 11250b57cec5SDimitry Andric 11260b57cec5SDimitry Andric switch (REP(MAP(from), MAP(to))) { 11270b57cec5SDimitry Andric case REP(0, 0): /* must be user doing this */ 11280b57cec5SDimitry Andric DROP(finish-start); /* drop the operand */ 11290b57cec5SDimitry Andric break; 11300b57cec5SDimitry Andric case REP(0, 1): /* as x{1,1}? */ 11310b57cec5SDimitry Andric case REP(0, N): /* as x{1,n}? */ 11320b57cec5SDimitry Andric case REP(0, INF): /* as x{1,}? */ 11330b57cec5SDimitry Andric /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 11340b57cec5SDimitry Andric INSERT(OCH_, start); /* offset is wrong... */ 11350b57cec5SDimitry Andric repeat(p, start+1, 1, to); 11360b57cec5SDimitry Andric ASTERN(OOR1, start); 11370b57cec5SDimitry Andric AHEAD(start); /* ... fix it */ 11380b57cec5SDimitry Andric EMIT(OOR2, 0); 11390b57cec5SDimitry Andric AHEAD(THERE()); 11400b57cec5SDimitry Andric ASTERN(O_CH, THERETHERE()); 11410b57cec5SDimitry Andric break; 11420b57cec5SDimitry Andric case REP(1, 1): /* trivial case */ 11430b57cec5SDimitry Andric /* done */ 11440b57cec5SDimitry Andric break; 11450b57cec5SDimitry Andric case REP(1, N): /* as x?x{1,n-1} */ 11460b57cec5SDimitry Andric /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 11470b57cec5SDimitry Andric INSERT(OCH_, start); 11480b57cec5SDimitry Andric ASTERN(OOR1, start); 11490b57cec5SDimitry Andric AHEAD(start); 11500b57cec5SDimitry Andric EMIT(OOR2, 0); /* offset very wrong... */ 11510b57cec5SDimitry Andric AHEAD(THERE()); /* ...so fix it */ 11520b57cec5SDimitry Andric ASTERN(O_CH, THERETHERE()); 11530b57cec5SDimitry Andric copy = dupl(p, start+1, finish+1); 11540b57cec5SDimitry Andric assert(copy == finish+4); 11550b57cec5SDimitry Andric repeat(p, copy, 1, to-1); 11560b57cec5SDimitry Andric break; 11570b57cec5SDimitry Andric case REP(1, INF): /* as x+ */ 11580b57cec5SDimitry Andric INSERT(OPLUS_, start); 11590b57cec5SDimitry Andric ASTERN(O_PLUS, start); 11600b57cec5SDimitry Andric break; 11610b57cec5SDimitry Andric case REP(N, N): /* as xx{m-1,n-1} */ 11620b57cec5SDimitry Andric copy = dupl(p, start, finish); 11630b57cec5SDimitry Andric repeat(p, copy, from-1, to-1); 11640b57cec5SDimitry Andric break; 11650b57cec5SDimitry Andric case REP(N, INF): /* as xx{n-1,INF} */ 11660b57cec5SDimitry Andric copy = dupl(p, start, finish); 11670b57cec5SDimitry Andric repeat(p, copy, from-1, to); 11680b57cec5SDimitry Andric break; 11690b57cec5SDimitry Andric default: /* "can't happen" */ 11700b57cec5SDimitry Andric SETERROR(REG_ASSERT); /* just in case */ 11710b57cec5SDimitry Andric break; 11720b57cec5SDimitry Andric } 11730b57cec5SDimitry Andric } 11740b57cec5SDimitry Andric 11750b57cec5SDimitry Andric /* 11760b57cec5SDimitry Andric - seterr - set an error condition 11770b57cec5SDimitry Andric */ 11780b57cec5SDimitry Andric static int /* useless but makes type checking happy */ 11790b57cec5SDimitry Andric seterr(struct parse *p, int e) 11800b57cec5SDimitry Andric { 11810b57cec5SDimitry Andric if (p->error == 0) /* keep earliest error condition */ 11820b57cec5SDimitry Andric p->error = e; 11830b57cec5SDimitry Andric p->next = nuls; /* try to bring things to a halt */ 11840b57cec5SDimitry Andric p->end = nuls; 11850b57cec5SDimitry Andric return(0); /* make the return value well-defined */ 11860b57cec5SDimitry Andric } 11870b57cec5SDimitry Andric 11880b57cec5SDimitry Andric /* 11890b57cec5SDimitry Andric - allocset - allocate a set of characters for [] 11900b57cec5SDimitry Andric */ 11910b57cec5SDimitry Andric static cset * 11920b57cec5SDimitry Andric allocset(struct parse *p) 11930b57cec5SDimitry Andric { 11940b57cec5SDimitry Andric int no = p->g->ncsets++; 11950b57cec5SDimitry Andric size_t nc; 11960b57cec5SDimitry Andric size_t nbytes; 11970b57cec5SDimitry Andric cset *cs; 11980b57cec5SDimitry Andric size_t css = (size_t)p->g->csetsize; 11990b57cec5SDimitry Andric int i; 12000b57cec5SDimitry Andric 12010b57cec5SDimitry Andric if (no >= p->ncsalloc) { /* need another column of space */ 12020b57cec5SDimitry Andric void *ptr; 12030b57cec5SDimitry Andric 12040b57cec5SDimitry Andric p->ncsalloc += CHAR_BIT; 12050b57cec5SDimitry Andric nc = p->ncsalloc; 12060b57cec5SDimitry Andric if (nc > SIZE_MAX / sizeof(cset)) 12070b57cec5SDimitry Andric goto nomem; 12080b57cec5SDimitry Andric assert(nc % CHAR_BIT == 0); 12090b57cec5SDimitry Andric nbytes = nc / CHAR_BIT * css; 12100b57cec5SDimitry Andric 12110b57cec5SDimitry Andric ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); 12120b57cec5SDimitry Andric if (ptr == NULL) 12130b57cec5SDimitry Andric goto nomem; 12140b57cec5SDimitry Andric p->g->sets = ptr; 12150b57cec5SDimitry Andric 12160b57cec5SDimitry Andric ptr = (uch *)realloc((char *)p->g->setbits, nbytes); 12170b57cec5SDimitry Andric if (ptr == NULL) 12180b57cec5SDimitry Andric goto nomem; 12190b57cec5SDimitry Andric p->g->setbits = ptr; 12200b57cec5SDimitry Andric 12210b57cec5SDimitry Andric for (i = 0; i < no; i++) 12220b57cec5SDimitry Andric p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 12230b57cec5SDimitry Andric 12240b57cec5SDimitry Andric (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); 12250b57cec5SDimitry Andric } 12260b57cec5SDimitry Andric /* XXX should not happen */ 12270b57cec5SDimitry Andric if (p->g->sets == NULL || p->g->setbits == NULL) 12280b57cec5SDimitry Andric goto nomem; 12290b57cec5SDimitry Andric 12300b57cec5SDimitry Andric cs = &p->g->sets[no]; 12310b57cec5SDimitry Andric cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 12320b57cec5SDimitry Andric cs->mask = 1 << ((no) % CHAR_BIT); 12330b57cec5SDimitry Andric cs->hash = 0; 12340b57cec5SDimitry Andric cs->smultis = 0; 12350b57cec5SDimitry Andric cs->multis = NULL; 12360b57cec5SDimitry Andric 12370b57cec5SDimitry Andric return(cs); 12380b57cec5SDimitry Andric nomem: 12390b57cec5SDimitry Andric free(p->g->sets); 12400b57cec5SDimitry Andric p->g->sets = NULL; 12410b57cec5SDimitry Andric free(p->g->setbits); 12420b57cec5SDimitry Andric p->g->setbits = NULL; 12430b57cec5SDimitry Andric 12440b57cec5SDimitry Andric SETERROR(REG_ESPACE); 12450b57cec5SDimitry Andric /* caller's responsibility not to do set ops */ 12460b57cec5SDimitry Andric return(NULL); 12470b57cec5SDimitry Andric } 12480b57cec5SDimitry Andric 12490b57cec5SDimitry Andric /* 12500b57cec5SDimitry Andric - freeset - free a now-unused set 12510b57cec5SDimitry Andric */ 12520b57cec5SDimitry Andric static void 12530b57cec5SDimitry Andric freeset(struct parse *p, cset *cs) 12540b57cec5SDimitry Andric { 12550b57cec5SDimitry Andric size_t i; 12560b57cec5SDimitry Andric cset *top = &p->g->sets[p->g->ncsets]; 12570b57cec5SDimitry Andric size_t css = (size_t)p->g->csetsize; 12580b57cec5SDimitry Andric 12590b57cec5SDimitry Andric for (i = 0; i < css; i++) 12600b57cec5SDimitry Andric CHsub(cs, i); 12610b57cec5SDimitry Andric if (cs == top-1) /* recover only the easy case */ 12620b57cec5SDimitry Andric p->g->ncsets--; 12630b57cec5SDimitry Andric } 12640b57cec5SDimitry Andric 12650b57cec5SDimitry Andric /* 12660b57cec5SDimitry Andric - freezeset - final processing on a set of characters 12670b57cec5SDimitry Andric * 12680b57cec5SDimitry Andric * The main task here is merging identical sets. This is usually a waste 12690b57cec5SDimitry Andric * of time (although the hash code minimizes the overhead), but can win 12700b57cec5SDimitry Andric * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 12710b57cec5SDimitry Andric * is done using addition rather than xor -- all ASCII [aA] sets xor to 12720b57cec5SDimitry Andric * the same value! 12730b57cec5SDimitry Andric */ 12740b57cec5SDimitry Andric static int /* set number */ 12750b57cec5SDimitry Andric freezeset(struct parse *p, cset *cs) 12760b57cec5SDimitry Andric { 12770b57cec5SDimitry Andric uch h = cs->hash; 12780b57cec5SDimitry Andric size_t i; 12790b57cec5SDimitry Andric cset *top = &p->g->sets[p->g->ncsets]; 12800b57cec5SDimitry Andric cset *cs2; 12810b57cec5SDimitry Andric size_t css = (size_t)p->g->csetsize; 12820b57cec5SDimitry Andric 12830b57cec5SDimitry Andric /* look for an earlier one which is the same */ 12840b57cec5SDimitry Andric for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 12850b57cec5SDimitry Andric if (cs2->hash == h && cs2 != cs) { 12860b57cec5SDimitry Andric /* maybe */ 12870b57cec5SDimitry Andric for (i = 0; i < css; i++) 12880b57cec5SDimitry Andric if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 12890b57cec5SDimitry Andric break; /* no */ 12900b57cec5SDimitry Andric if (i == css) 12910b57cec5SDimitry Andric break; /* yes */ 12920b57cec5SDimitry Andric } 12930b57cec5SDimitry Andric 12940b57cec5SDimitry Andric if (cs2 < top) { /* found one */ 12950b57cec5SDimitry Andric freeset(p, cs); 12960b57cec5SDimitry Andric cs = cs2; 12970b57cec5SDimitry Andric } 12980b57cec5SDimitry Andric 12990b57cec5SDimitry Andric return((int)(cs - p->g->sets)); 13000b57cec5SDimitry Andric } 13010b57cec5SDimitry Andric 13020b57cec5SDimitry Andric /* 13030b57cec5SDimitry Andric - firstch - return first character in a set (which must have at least one) 13040b57cec5SDimitry Andric */ 13050b57cec5SDimitry Andric static int /* character; there is no "none" value */ 13060b57cec5SDimitry Andric firstch(struct parse *p, cset *cs) 13070b57cec5SDimitry Andric { 13080b57cec5SDimitry Andric size_t i; 13090b57cec5SDimitry Andric size_t css = (size_t)p->g->csetsize; 13100b57cec5SDimitry Andric 13110b57cec5SDimitry Andric for (i = 0; i < css; i++) 13120b57cec5SDimitry Andric if (CHIN(cs, i)) 13130b57cec5SDimitry Andric return((char)i); 13140b57cec5SDimitry Andric assert(never); 13150b57cec5SDimitry Andric return(0); /* arbitrary */ 13160b57cec5SDimitry Andric } 13170b57cec5SDimitry Andric 13180b57cec5SDimitry Andric /* 13190b57cec5SDimitry Andric - nch - number of characters in a set 13200b57cec5SDimitry Andric */ 13210b57cec5SDimitry Andric static int 13220b57cec5SDimitry Andric nch(struct parse *p, cset *cs) 13230b57cec5SDimitry Andric { 13240b57cec5SDimitry Andric size_t i; 13250b57cec5SDimitry Andric size_t css = (size_t)p->g->csetsize; 13260b57cec5SDimitry Andric int n = 0; 13270b57cec5SDimitry Andric 13280b57cec5SDimitry Andric for (i = 0; i < css; i++) 13290b57cec5SDimitry Andric if (CHIN(cs, i)) 13300b57cec5SDimitry Andric n++; 13310b57cec5SDimitry Andric return(n); 13320b57cec5SDimitry Andric } 13330b57cec5SDimitry Andric 13340b57cec5SDimitry Andric /* 13350b57cec5SDimitry Andric - mcadd - add a collating element to a cset 13360b57cec5SDimitry Andric */ 13370b57cec5SDimitry Andric static void 13380b57cec5SDimitry Andric mcadd( struct parse *p, cset *cs, const char *cp) 13390b57cec5SDimitry Andric { 13400b57cec5SDimitry Andric size_t oldend = cs->smultis; 13410b57cec5SDimitry Andric void *np; 13420b57cec5SDimitry Andric 13430b57cec5SDimitry Andric cs->smultis += strlen(cp) + 1; 13440b57cec5SDimitry Andric np = realloc(cs->multis, cs->smultis); 13450b57cec5SDimitry Andric if (np == NULL) { 13460b57cec5SDimitry Andric if (cs->multis) 13470b57cec5SDimitry Andric free(cs->multis); 13480b57cec5SDimitry Andric cs->multis = NULL; 13490b57cec5SDimitry Andric SETERROR(REG_ESPACE); 13500b57cec5SDimitry Andric return; 13510b57cec5SDimitry Andric } 13520b57cec5SDimitry Andric cs->multis = np; 13530b57cec5SDimitry Andric 13540b57cec5SDimitry Andric llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); 13550b57cec5SDimitry Andric } 13560b57cec5SDimitry Andric 13570b57cec5SDimitry Andric /* 13580b57cec5SDimitry Andric - mcinvert - invert the list of collating elements in a cset 13590b57cec5SDimitry Andric * 13600b57cec5SDimitry Andric * This would have to know the set of possibilities. Implementation 13610b57cec5SDimitry Andric * is deferred. 13620b57cec5SDimitry Andric */ 13630b57cec5SDimitry Andric /* ARGSUSED */ 13640b57cec5SDimitry Andric static void 13650b57cec5SDimitry Andric mcinvert(struct parse *p, cset *cs) 13660b57cec5SDimitry Andric { 13670b57cec5SDimitry Andric assert(cs->multis == NULL); /* xxx */ 13680b57cec5SDimitry Andric } 13690b57cec5SDimitry Andric 13700b57cec5SDimitry Andric /* 13710b57cec5SDimitry Andric - mccase - add case counterparts of the list of collating elements in a cset 13720b57cec5SDimitry Andric * 13730b57cec5SDimitry Andric * This would have to know the set of possibilities. Implementation 13740b57cec5SDimitry Andric * is deferred. 13750b57cec5SDimitry Andric */ 13760b57cec5SDimitry Andric /* ARGSUSED */ 13770b57cec5SDimitry Andric static void 13780b57cec5SDimitry Andric mccase(struct parse *p, cset *cs) 13790b57cec5SDimitry Andric { 13800b57cec5SDimitry Andric assert(cs->multis == NULL); /* xxx */ 13810b57cec5SDimitry Andric } 13820b57cec5SDimitry Andric 13830b57cec5SDimitry Andric /* 13840b57cec5SDimitry Andric - isinsets - is this character in any sets? 13850b57cec5SDimitry Andric */ 13860b57cec5SDimitry Andric static int /* predicate */ 13870b57cec5SDimitry Andric isinsets(struct re_guts *g, int c) 13880b57cec5SDimitry Andric { 13890b57cec5SDimitry Andric uch *col; 13900b57cec5SDimitry Andric int i; 13910b57cec5SDimitry Andric int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 13920b57cec5SDimitry Andric unsigned uc = (uch)c; 13930b57cec5SDimitry Andric 13940b57cec5SDimitry Andric for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 13950b57cec5SDimitry Andric if (col[uc] != 0) 13960b57cec5SDimitry Andric return(1); 13970b57cec5SDimitry Andric return(0); 13980b57cec5SDimitry Andric } 13990b57cec5SDimitry Andric 14000b57cec5SDimitry Andric /* 14010b57cec5SDimitry Andric - samesets - are these two characters in exactly the same sets? 14020b57cec5SDimitry Andric */ 14030b57cec5SDimitry Andric static int /* predicate */ 14040b57cec5SDimitry Andric samesets(struct re_guts *g, int c1, int c2) 14050b57cec5SDimitry Andric { 14060b57cec5SDimitry Andric uch *col; 14070b57cec5SDimitry Andric int i; 14080b57cec5SDimitry Andric int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 14090b57cec5SDimitry Andric unsigned uc1 = (uch)c1; 14100b57cec5SDimitry Andric unsigned uc2 = (uch)c2; 14110b57cec5SDimitry Andric 14120b57cec5SDimitry Andric for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 14130b57cec5SDimitry Andric if (col[uc1] != col[uc2]) 14140b57cec5SDimitry Andric return(0); 14150b57cec5SDimitry Andric return(1); 14160b57cec5SDimitry Andric } 14170b57cec5SDimitry Andric 14180b57cec5SDimitry Andric /* 14190b57cec5SDimitry Andric - categorize - sort out character categories 14200b57cec5SDimitry Andric */ 14210b57cec5SDimitry Andric static void 14220b57cec5SDimitry Andric categorize(struct parse *p, struct re_guts *g) 14230b57cec5SDimitry Andric { 14240b57cec5SDimitry Andric cat_t *cats = g->categories; 14250b57cec5SDimitry Andric int c; 14260b57cec5SDimitry Andric int c2; 14270b57cec5SDimitry Andric cat_t cat; 14280b57cec5SDimitry Andric 14290b57cec5SDimitry Andric /* avoid making error situations worse */ 14300b57cec5SDimitry Andric if (p->error != 0) 14310b57cec5SDimitry Andric return; 14320b57cec5SDimitry Andric 14330b57cec5SDimitry Andric for (c = CHAR_MIN; c <= CHAR_MAX; c++) 14340b57cec5SDimitry Andric if (cats[c] == 0 && isinsets(g, c)) { 14350b57cec5SDimitry Andric cat = g->ncategories++; 14360b57cec5SDimitry Andric cats[c] = cat; 14370b57cec5SDimitry Andric for (c2 = c+1; c2 <= CHAR_MAX; c2++) 14380b57cec5SDimitry Andric if (cats[c2] == 0 && samesets(g, c, c2)) 14390b57cec5SDimitry Andric cats[c2] = cat; 14400b57cec5SDimitry Andric } 14410b57cec5SDimitry Andric } 14420b57cec5SDimitry Andric 14430b57cec5SDimitry Andric /* 14440b57cec5SDimitry Andric - dupl - emit a duplicate of a bunch of sops 14450b57cec5SDimitry Andric */ 14460b57cec5SDimitry Andric static sopno /* start of duplicate */ 14470b57cec5SDimitry Andric dupl(struct parse *p, 14480b57cec5SDimitry Andric sopno start, /* from here */ 14490b57cec5SDimitry Andric sopno finish) /* to this less one */ 14500b57cec5SDimitry Andric { 14510b57cec5SDimitry Andric sopno ret = HERE(); 14520b57cec5SDimitry Andric sopno len = finish - start; 14530b57cec5SDimitry Andric 14540b57cec5SDimitry Andric assert(finish >= start); 14550b57cec5SDimitry Andric if (len == 0) 14560b57cec5SDimitry Andric return(ret); 14570b57cec5SDimitry Andric enlarge(p, p->ssize + len); /* this many unexpected additions */ 14580b57cec5SDimitry Andric assert(p->ssize >= p->slen + len); 14590b57cec5SDimitry Andric (void) memmove((char *)(p->strip + p->slen), 14600b57cec5SDimitry Andric (char *)(p->strip + start), (size_t)len*sizeof(sop)); 14610b57cec5SDimitry Andric p->slen += len; 14620b57cec5SDimitry Andric return(ret); 14630b57cec5SDimitry Andric } 14640b57cec5SDimitry Andric 14650b57cec5SDimitry Andric /* 14660b57cec5SDimitry Andric - doemit - emit a strip operator 14670b57cec5SDimitry Andric * 14680b57cec5SDimitry Andric * It might seem better to implement this as a macro with a function as 14690b57cec5SDimitry Andric * hard-case backup, but it's just too big and messy unless there are 14700b57cec5SDimitry Andric * some changes to the data structures. Maybe later. 14710b57cec5SDimitry Andric */ 14720b57cec5SDimitry Andric static void 14730b57cec5SDimitry Andric doemit(struct parse *p, sop op, size_t opnd) 14740b57cec5SDimitry Andric { 14750b57cec5SDimitry Andric /* avoid making error situations worse */ 14760b57cec5SDimitry Andric if (p->error != 0) 14770b57cec5SDimitry Andric return; 14780b57cec5SDimitry Andric 14790b57cec5SDimitry Andric /* deal with oversize operands ("can't happen", more or less) */ 14800b57cec5SDimitry Andric assert(opnd < 1<<OPSHIFT); 14810b57cec5SDimitry Andric 14820b57cec5SDimitry Andric /* deal with undersized strip */ 14830b57cec5SDimitry Andric if (p->slen >= p->ssize) 14840b57cec5SDimitry Andric enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 14850b57cec5SDimitry Andric assert(p->slen < p->ssize); 14860b57cec5SDimitry Andric 14870b57cec5SDimitry Andric /* finally, it's all reduced to the easy case */ 14880b57cec5SDimitry Andric p->strip[p->slen++] = SOP(op, opnd); 14890b57cec5SDimitry Andric } 14900b57cec5SDimitry Andric 14910b57cec5SDimitry Andric /* 14920b57cec5SDimitry Andric - doinsert - insert a sop into the strip 14930b57cec5SDimitry Andric */ 14940b57cec5SDimitry Andric static void 14950b57cec5SDimitry Andric doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 14960b57cec5SDimitry Andric { 14970b57cec5SDimitry Andric sopno sn; 14980b57cec5SDimitry Andric sop s; 14990b57cec5SDimitry Andric int i; 15000b57cec5SDimitry Andric 15010b57cec5SDimitry Andric /* avoid making error situations worse */ 15020b57cec5SDimitry Andric if (p->error != 0) 15030b57cec5SDimitry Andric return; 15040b57cec5SDimitry Andric 15050b57cec5SDimitry Andric sn = HERE(); 15060b57cec5SDimitry Andric EMIT(op, opnd); /* do checks, ensure space */ 15070b57cec5SDimitry Andric assert(HERE() == sn+1); 15080b57cec5SDimitry Andric s = p->strip[sn]; 15090b57cec5SDimitry Andric 15100b57cec5SDimitry Andric /* adjust paren pointers */ 15110b57cec5SDimitry Andric assert(pos > 0); 15120b57cec5SDimitry Andric for (i = 1; i < NPAREN; i++) { 15130b57cec5SDimitry Andric if (p->pbegin[i] >= pos) { 15140b57cec5SDimitry Andric p->pbegin[i]++; 15150b57cec5SDimitry Andric } 15160b57cec5SDimitry Andric if (p->pend[i] >= pos) { 15170b57cec5SDimitry Andric p->pend[i]++; 15180b57cec5SDimitry Andric } 15190b57cec5SDimitry Andric } 15200b57cec5SDimitry Andric 15210b57cec5SDimitry Andric memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 15220b57cec5SDimitry Andric (HERE()-pos-1)*sizeof(sop)); 15230b57cec5SDimitry Andric p->strip[pos] = s; 15240b57cec5SDimitry Andric } 15250b57cec5SDimitry Andric 15260b57cec5SDimitry Andric /* 15270b57cec5SDimitry Andric - dofwd - complete a forward reference 15280b57cec5SDimitry Andric */ 15290b57cec5SDimitry Andric static void 15300b57cec5SDimitry Andric dofwd(struct parse *p, sopno pos, sop value) 15310b57cec5SDimitry Andric { 15320b57cec5SDimitry Andric /* avoid making error situations worse */ 15330b57cec5SDimitry Andric if (p->error != 0) 15340b57cec5SDimitry Andric return; 15350b57cec5SDimitry Andric 15360b57cec5SDimitry Andric assert(value < 1<<OPSHIFT); 15370b57cec5SDimitry Andric p->strip[pos] = OP(p->strip[pos]) | value; 15380b57cec5SDimitry Andric } 15390b57cec5SDimitry Andric 15400b57cec5SDimitry Andric /* 15410b57cec5SDimitry Andric - enlarge - enlarge the strip 15420b57cec5SDimitry Andric */ 15430b57cec5SDimitry Andric static void 15440b57cec5SDimitry Andric enlarge(struct parse *p, sopno size) 15450b57cec5SDimitry Andric { 15460b57cec5SDimitry Andric sop *sp; 15470b57cec5SDimitry Andric 15480b57cec5SDimitry Andric if (p->ssize >= size) 15490b57cec5SDimitry Andric return; 15500b57cec5SDimitry Andric 15510b57cec5SDimitry Andric if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) { 15520b57cec5SDimitry Andric SETERROR(REG_ESPACE); 15530b57cec5SDimitry Andric return; 15540b57cec5SDimitry Andric } 15550b57cec5SDimitry Andric 15560b57cec5SDimitry Andric sp = (sop *)realloc(p->strip, size*sizeof(sop)); 15570b57cec5SDimitry Andric if (sp == NULL) { 15580b57cec5SDimitry Andric SETERROR(REG_ESPACE); 15590b57cec5SDimitry Andric return; 15600b57cec5SDimitry Andric } 15610b57cec5SDimitry Andric p->strip = sp; 15620b57cec5SDimitry Andric p->ssize = size; 15630b57cec5SDimitry Andric } 15640b57cec5SDimitry Andric 15650b57cec5SDimitry Andric /* 15660b57cec5SDimitry Andric - stripsnug - compact the strip 15670b57cec5SDimitry Andric */ 15680b57cec5SDimitry Andric static void 15690b57cec5SDimitry Andric stripsnug(struct parse *p, struct re_guts *g) 15700b57cec5SDimitry Andric { 15710b57cec5SDimitry Andric g->nstates = p->slen; 15720b57cec5SDimitry Andric if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) { 15730b57cec5SDimitry Andric g->strip = p->strip; 15740b57cec5SDimitry Andric SETERROR(REG_ESPACE); 15750b57cec5SDimitry Andric return; 15760b57cec5SDimitry Andric } 15770b57cec5SDimitry Andric 15780b57cec5SDimitry Andric g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 15790b57cec5SDimitry Andric if (g->strip == NULL) { 15800b57cec5SDimitry Andric SETERROR(REG_ESPACE); 15810b57cec5SDimitry Andric g->strip = p->strip; 15820b57cec5SDimitry Andric } 15830b57cec5SDimitry Andric } 15840b57cec5SDimitry Andric 15850b57cec5SDimitry Andric /* 15860b57cec5SDimitry Andric - findmust - fill in must and mlen with longest mandatory literal string 15870b57cec5SDimitry Andric * 15880b57cec5SDimitry Andric * This algorithm could do fancy things like analyzing the operands of | 15890b57cec5SDimitry Andric * for common subsequences. Someday. This code is simple and finds most 15900b57cec5SDimitry Andric * of the interesting cases. 15910b57cec5SDimitry Andric * 15920b57cec5SDimitry Andric * Note that must and mlen got initialized during setup. 15930b57cec5SDimitry Andric */ 15940b57cec5SDimitry Andric static void 15950b57cec5SDimitry Andric findmust(struct parse *p, struct re_guts *g) 15960b57cec5SDimitry Andric { 15970b57cec5SDimitry Andric sop *scan; 15980b57cec5SDimitry Andric sop *start = 0; /* start initialized in the default case, after that */ 15990b57cec5SDimitry Andric sop *newstart = 0; /* newstart was initialized in the OCHAR case */ 16000b57cec5SDimitry Andric sopno newlen; 16010b57cec5SDimitry Andric sop s; 16020b57cec5SDimitry Andric char *cp; 16030b57cec5SDimitry Andric sopno i; 16040b57cec5SDimitry Andric 16050b57cec5SDimitry Andric /* avoid making error situations worse */ 16060b57cec5SDimitry Andric if (p->error != 0) 16070b57cec5SDimitry Andric return; 16080b57cec5SDimitry Andric 16090b57cec5SDimitry Andric /* find the longest OCHAR sequence in strip */ 16100b57cec5SDimitry Andric newlen = 0; 16110b57cec5SDimitry Andric scan = g->strip + 1; 16120b57cec5SDimitry Andric do { 16130b57cec5SDimitry Andric s = *scan++; 16140b57cec5SDimitry Andric switch (OP(s)) { 16150b57cec5SDimitry Andric case OCHAR: /* sequence member */ 16160b57cec5SDimitry Andric if (newlen == 0) /* new sequence */ 16170b57cec5SDimitry Andric newstart = scan - 1; 16180b57cec5SDimitry Andric newlen++; 16190b57cec5SDimitry Andric break; 16200b57cec5SDimitry Andric case OPLUS_: /* things that don't break one */ 16210b57cec5SDimitry Andric case OLPAREN: 16220b57cec5SDimitry Andric case ORPAREN: 16230b57cec5SDimitry Andric break; 16240b57cec5SDimitry Andric case OQUEST_: /* things that must be skipped */ 16250b57cec5SDimitry Andric case OCH_: 16260b57cec5SDimitry Andric scan--; 16270b57cec5SDimitry Andric do { 16280b57cec5SDimitry Andric scan += OPND(s); 16290b57cec5SDimitry Andric s = *scan; 16300b57cec5SDimitry Andric /* assert() interferes w debug printouts */ 16310b57cec5SDimitry Andric if (OP(s) != O_QUEST && OP(s) != O_CH && 16320b57cec5SDimitry Andric OP(s) != OOR2) { 16330b57cec5SDimitry Andric g->iflags |= REGEX_BAD; 16340b57cec5SDimitry Andric return; 16350b57cec5SDimitry Andric } 16360b57cec5SDimitry Andric } while (OP(s) != O_QUEST && OP(s) != O_CH); 16378bcb0991SDimitry Andric LLVM_FALLTHROUGH; 16380b57cec5SDimitry Andric default: /* things that break a sequence */ 16390b57cec5SDimitry Andric if (newlen > g->mlen) { /* ends one */ 16400b57cec5SDimitry Andric start = newstart; 16410b57cec5SDimitry Andric g->mlen = newlen; 16420b57cec5SDimitry Andric } 16430b57cec5SDimitry Andric newlen = 0; 16440b57cec5SDimitry Andric break; 16450b57cec5SDimitry Andric } 16460b57cec5SDimitry Andric } while (OP(s) != OEND); 16470b57cec5SDimitry Andric 16480b57cec5SDimitry Andric if (g->mlen == 0) /* there isn't one */ 16490b57cec5SDimitry Andric return; 16500b57cec5SDimitry Andric 16510b57cec5SDimitry Andric /* turn it into a character string */ 16520b57cec5SDimitry Andric g->must = malloc((size_t)g->mlen + 1); 16530b57cec5SDimitry Andric if (g->must == NULL) { /* argh; just forget it */ 16540b57cec5SDimitry Andric g->mlen = 0; 16550b57cec5SDimitry Andric return; 16560b57cec5SDimitry Andric } 16570b57cec5SDimitry Andric cp = g->must; 16580b57cec5SDimitry Andric scan = start; 16590b57cec5SDimitry Andric for (i = g->mlen; i > 0; i--) { 16600b57cec5SDimitry Andric while (OP(s = *scan++) != OCHAR) 16610b57cec5SDimitry Andric continue; 16620b57cec5SDimitry Andric assert(cp < g->must + g->mlen); 16630b57cec5SDimitry Andric *cp++ = (char)OPND(s); 16640b57cec5SDimitry Andric } 16650b57cec5SDimitry Andric assert(cp == g->must + g->mlen); 16660b57cec5SDimitry Andric *cp++ = '\0'; /* just on general principles */ 16670b57cec5SDimitry Andric } 16680b57cec5SDimitry Andric 16690b57cec5SDimitry Andric /* 16700b57cec5SDimitry Andric - pluscount - count + nesting 16710b57cec5SDimitry Andric */ 16720b57cec5SDimitry Andric static sopno /* nesting depth */ 16730b57cec5SDimitry Andric pluscount(struct parse *p, struct re_guts *g) 16740b57cec5SDimitry Andric { 16750b57cec5SDimitry Andric sop *scan; 16760b57cec5SDimitry Andric sop s; 16770b57cec5SDimitry Andric sopno plusnest = 0; 16780b57cec5SDimitry Andric sopno maxnest = 0; 16790b57cec5SDimitry Andric 16800b57cec5SDimitry Andric if (p->error != 0) 16810b57cec5SDimitry Andric return(0); /* there may not be an OEND */ 16820b57cec5SDimitry Andric 16830b57cec5SDimitry Andric scan = g->strip + 1; 16840b57cec5SDimitry Andric do { 16850b57cec5SDimitry Andric s = *scan++; 16860b57cec5SDimitry Andric switch (OP(s)) { 16870b57cec5SDimitry Andric case OPLUS_: 16880b57cec5SDimitry Andric plusnest++; 16890b57cec5SDimitry Andric break; 16900b57cec5SDimitry Andric case O_PLUS: 16910b57cec5SDimitry Andric if (plusnest > maxnest) 16920b57cec5SDimitry Andric maxnest = plusnest; 16930b57cec5SDimitry Andric plusnest--; 16940b57cec5SDimitry Andric break; 16950b57cec5SDimitry Andric } 16960b57cec5SDimitry Andric } while (OP(s) != OEND); 16970b57cec5SDimitry Andric if (plusnest != 0) 16980b57cec5SDimitry Andric g->iflags |= REGEX_BAD; 16990b57cec5SDimitry Andric return(maxnest); 17000b57cec5SDimitry Andric } 1701