155847Sbostic /*- 255847Sbostic * Copyright (c) 1992 Henry Spencer. 355847Sbostic * Copyright (c) 1992 The Regents of the University of California. 455847Sbostic * All rights reserved. 555847Sbostic * 655847Sbostic * This code is derived from software contributed to Berkeley by 755847Sbostic * Henry Spencer of the University of Toronto. 855847Sbostic * 955847Sbostic * %sccs.include.redist.c% 1055847Sbostic * 11*56357Sbostic * @(#)regcomp.c 5.3 (Berkeley) 09/30/92 1255847Sbostic */ 1355847Sbostic 1455847Sbostic #if defined(LIBC_SCCS) && !defined(lint) 15*56357Sbostic static char sccsid[] = "@(#)regcomp.c 5.3 (Berkeley) 09/30/92"; 1655847Sbostic #endif /* LIBC_SCCS and not lint */ 1755847Sbostic 1855847Sbostic #include <sys/types.h> 1955847Sbostic #include <stdio.h> 2055847Sbostic #include <string.h> 2155847Sbostic #include <ctype.h> 2255847Sbostic #include <limits.h> 2355847Sbostic #include <stdlib.h> 2455847Sbostic #include <regex.h> 2555847Sbostic 2655847Sbostic #include "utils.h" 2755847Sbostic #include "regex2.h" 2855847Sbostic 2955847Sbostic #include "cclass.h" 3055847Sbostic #include "cname.h" 3155847Sbostic 3255847Sbostic /* 3355847Sbostic * parse structure, passed up and down to avoid global variables and 3455847Sbostic * other clumsinesses 3555847Sbostic */ 3655847Sbostic struct parse { 3755847Sbostic uchar *next; /* next character in RE */ 3855847Sbostic int error; /* has an error been seen? */ 3955847Sbostic sop *strip; /* malloced strip */ 4055847Sbostic sopno ssize; /* malloced strip size (allocated) */ 4155847Sbostic sopno slen; /* malloced strip length (used) */ 4255847Sbostic int ncsalloc; /* number of csets allocated */ 4355847Sbostic struct re_guts *g; 4455847Sbostic # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 4555847Sbostic sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 4655847Sbostic sopno pend[NPAREN]; /* -> ) ([0] unused) */ 4755847Sbostic }; 4855847Sbostic 4956355Sbostic static uchar nuls[10]; /* place to point scanner in event of error */ 5056355Sbostic 5155847Sbostic /* 5255847Sbostic * macros for use with parse structure 5355847Sbostic * BEWARE: these know that the parse structure is named `p' !!! 5455847Sbostic */ 5555847Sbostic #define PEEK() ((uchar)*p->next) 5655847Sbostic #define PEEK2() ((uchar)*(p->next+1)) 5755847Sbostic #define SEE(c) (PEEK() == (c)) 5855847Sbostic #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 5955847Sbostic #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 6055847Sbostic #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 6155847Sbostic #define NEXT() (p->next++) 6255847Sbostic #define NEXT2() (p->next += 2) 6355847Sbostic #define NEXTn(n) (p->next += (n)) 6455847Sbostic #define GETNEXT() ((uchar)*p->next++) 6555847Sbostic #define SETERROR(e) seterr(p, (e)) 6655847Sbostic #define REQUIRE(co, e) ((co) || SETERROR(e)) 6755847Sbostic #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 6855847Sbostic #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 6955847Sbostic #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 7055847Sbostic #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 7155847Sbostic #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 7255847Sbostic #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 7355847Sbostic #define BACK(sop, pos) EMIT(sop, HERE()-pos) 7455847Sbostic #define HERE() (p->slen) 7555847Sbostic #define THERE() (p->slen - 1) 7655847Sbostic #define DROP(n) (p->slen -= (n)) 7755847Sbostic 78*56357Sbostic static cset *allocset __P((struct parse *)); 79*56357Sbostic static void bothcases __P((struct parse *, u_int)); 80*56357Sbostic static void categorize __P((struct parse *, struct re_guts *)); 81*56357Sbostic static void doemit __P((struct parse *, sop, size_t)); 82*56357Sbostic static void dofwd __P((struct parse *, sopno, sop)); 83*56357Sbostic static void doinsert __P((struct parse *, sop, size_t, sopno)); 84*56357Sbostic static sopno dupl __P((struct parse *, sopno, sopno)); 85*56357Sbostic static void enlarge __P((struct parse *, sopno)); 86*56357Sbostic static void findmust __P((struct parse *, struct re_guts *)); 87*56357Sbostic static int freezeset __P((struct parse *, cset *)); 88*56357Sbostic static int isinsets __P((struct re_guts *, u_int)); 89*56357Sbostic static void mcadd __P((struct parse *, cset *, uchar *)); 90*56357Sbostic static uchar *mcfind __P((cset *, u_int *)); 91*56357Sbostic static int mcin __P((struct parse *, cset *, u_int *)); 92*56357Sbostic static void mcinvert __P((struct parse *, cset *)); 93*56357Sbostic static void mcsub __P((struct parse *, cset *, u_int *)); 94*56357Sbostic static void nonnewline __P((struct parse *)); 95*56357Sbostic static void ordinary __P((struct parse *, u_int)); 96*56357Sbostic static uchar othercase __P((u_int)); 97*56357Sbostic static void p_b_cclass __P((struct parse *, cset *)); 98*56357Sbostic static uchar p_b_coll_elem __P((struct parse *, u_int)); 99*56357Sbostic static void p_b_eclass __P((struct parse *, cset *)); 100*56357Sbostic static uchar p_b_symbol __P((struct parse *)); 101*56357Sbostic static void p_b_term __P((struct parse *, cset *)); 102*56357Sbostic static void p_bracket __P((struct parse *)); 103*56357Sbostic static void p_bre __P((struct parse *, u_int, u_int)); 104*56357Sbostic static int p_count __P((struct parse *)); 105*56357Sbostic static void p_ere __P((struct parse *, u_int)); 106*56357Sbostic static void p_ere_exp __P((struct parse *)); 107*56357Sbostic static int p_simp_re __P((struct parse *, int)); 108*56357Sbostic static sopno pluscount __P((struct parse *, struct re_guts *)); 109*56357Sbostic static void repeat __P((struct parse *, sopno, int, int)); 110*56357Sbostic static int samesets __P((struct re_guts *, u_int, u_int)); 111*56357Sbostic static int seterr __P((struct parse *, int)); 112*56357Sbostic static void stripsnug __P((struct parse *, struct re_guts *)); 113*56357Sbostic 11455847Sbostic /* 11555847Sbostic - regcomp - interface for parser and compilation 11655847Sbostic */ 11755847Sbostic int /* 0 success, otherwise REG_something */ 11855847Sbostic regcomp(preg, pattern, cflags) 11955847Sbostic regex_t *preg; 12055847Sbostic const char *pattern; 12155847Sbostic int cflags; 12255847Sbostic { 12355847Sbostic struct parse pa; 12455847Sbostic register struct re_guts *g; 12555847Sbostic register struct parse *p = &pa; 12655847Sbostic register int i; 12755847Sbostic 12855847Sbostic /* do the mallocs early so failure handling is easy */ 12955847Sbostic /* the +NUC here is for the category table */ 13055847Sbostic g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 13155847Sbostic if (g == NULL) 13255847Sbostic return(REG_ESPACE); 13355847Sbostic p->ssize = strlen(pattern)/2*3 + 1; 13455847Sbostic p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 13555847Sbostic p->slen = 0; 13655847Sbostic if (p->strip == NULL) { 13755847Sbostic free((char *)g); 13855847Sbostic return(REG_ESPACE); 13955847Sbostic } 14055847Sbostic 14155847Sbostic /* set things up */ 14255847Sbostic p->g = g; 14355847Sbostic p->next = (uchar *)pattern; 14455847Sbostic p->error = 0; 14555847Sbostic p->ncsalloc = 0; 14655847Sbostic for (i = 0; i < NPAREN; i++) { 14755847Sbostic p->pbegin[i] = 0; 14855847Sbostic p->pend[i] = 0; 14955847Sbostic } 15055847Sbostic g->csetsize = NUC; 15155847Sbostic g->sets = NULL; 15255847Sbostic g->setbits = NULL; 15355847Sbostic g->ncsets = 0; 15455847Sbostic g->cflags = cflags; 15555847Sbostic g->iflags = 0; 15655847Sbostic g->must = NULL; 15755847Sbostic g->mlen = 0; 15855847Sbostic g->nsub = 0; 15955847Sbostic g->ncategories = 1; /* category 0 is "everything else" */ 16055847Sbostic g->categories = (uchar *)g + sizeof(struct re_guts); 16155847Sbostic (void) memset((char *)g->categories, 0, NUC); 16255847Sbostic g->backrefs = 0; 16355847Sbostic g->nplus = 0; 16455847Sbostic 16555847Sbostic /* do it */ 16655847Sbostic EMIT(OEND, 0); 16755847Sbostic g->firststate = THERE(); 16855847Sbostic if (cflags®_EXTENDED) 16955847Sbostic p_ere(p, '\0'); 17055847Sbostic else 17155847Sbostic p_bre(p, '\0', '\0'); 17255847Sbostic EMIT(OEND, 0); 17355847Sbostic g->laststate = THERE(); 17455847Sbostic 17555847Sbostic /* tidy up loose ends and fill things in */ 17655847Sbostic categorize(p, g); 17755847Sbostic stripsnug(p, g); 17855847Sbostic findmust(p, g); 17955847Sbostic g->nplus = pluscount(p, g); 18055847Sbostic g->magic = MAGIC2; 18155847Sbostic preg->re_nsub = g->nsub; 18255847Sbostic preg->re_g = g; 18355847Sbostic preg->re_magic = MAGIC1; 18456355Sbostic #ifndef REDEBUG 18555847Sbostic /* not debugging, so can't rely on the assert() in regexec() */ 18655847Sbostic if (g->iflags&BAD) 18755847Sbostic SETERROR(REG_ASSERT); 18855847Sbostic #endif 18955847Sbostic 19055847Sbostic /* win or lose, we're done */ 19155847Sbostic if (p->error != 0) /* lose */ 19255847Sbostic regfree(preg); 19355847Sbostic return(p->error); 19455847Sbostic } 19555847Sbostic 19655847Sbostic /* 19755847Sbostic - p_ere - ERE parser top level, concatenation and alternation 19855847Sbostic */ 19955847Sbostic static void 20055847Sbostic p_ere(p, stop) 20155847Sbostic register struct parse *p; 202*56357Sbostic u_int stop; /* character this ERE should end at */ 20355847Sbostic { 20455847Sbostic register uchar c; 20555847Sbostic register sopno prevback; 20655847Sbostic register sopno prevfwd; 20755847Sbostic register sopno conc; 20855847Sbostic register int first = 1; /* is this the first alternative? */ 20955847Sbostic 21055847Sbostic for (;;) { 21155847Sbostic /* do a bunch of concatenated expressions */ 21255847Sbostic conc = HERE(); 21355847Sbostic while ((c = PEEK()) != '|' && c != stop && c != '\0') 21455847Sbostic p_ere_exp(p); 21555847Sbostic REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 21655847Sbostic 21755847Sbostic if (!EAT('|')) 21855847Sbostic break; /* NOTE BREAK OUT */ 21955847Sbostic 22055847Sbostic if (first) { 22155847Sbostic INSERT(OCH_, conc); /* offset is wrong */ 22255847Sbostic prevfwd = conc; 22355847Sbostic prevback = conc; 22455847Sbostic first = 0; 22555847Sbostic } 22655847Sbostic BACK(OOR1, prevback); 22755847Sbostic prevback = THERE(); 22855847Sbostic FWD(prevfwd); /* fix previous offset */ 22955847Sbostic prevfwd = HERE(); 23055847Sbostic EMIT(OOR2, 0); /* offset is very wrong */ 23155847Sbostic } 23255847Sbostic 23355847Sbostic if (!first) { /* tail-end fixups */ 23455847Sbostic FWD(prevfwd); 23555847Sbostic BACK(O_CH, prevback); 23655847Sbostic } 23755847Sbostic 23855847Sbostic assert(SEE(stop) || SEE('\0')); 23955847Sbostic } 24055847Sbostic 24155847Sbostic /* 24255847Sbostic - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 24355847Sbostic */ 24455847Sbostic static void 24555847Sbostic p_ere_exp(p) 24655847Sbostic register struct parse *p; 24755847Sbostic { 24855847Sbostic register uchar c; 24955847Sbostic register sopno pos; 25055847Sbostic register int count; 25155847Sbostic register int count2; 25255847Sbostic register sopno subno; 25355847Sbostic int wascaret = 0; 25455847Sbostic /* we call { a repetition if followed by a digit */ 25555847Sbostic # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 25655847Sbostic (c1 == '{' && isdigit(c2))) 25755847Sbostic 25855847Sbostic c = GETNEXT(); 25955847Sbostic assert(c != '\0'); /* caller should have ensured this */ 26055847Sbostic 26155847Sbostic pos = HERE(); 26255847Sbostic switch (c) { 26355847Sbostic case '(': 26455847Sbostic MUSTNOTSEE('\0', REG_EPAREN); 26555847Sbostic p->g->nsub++; 26655847Sbostic subno = p->g->nsub; 26755847Sbostic if (subno < NPAREN) 26855847Sbostic p->pbegin[subno] = HERE(); 26955847Sbostic EMIT(OLPAREN, subno); 27055847Sbostic if (!SEE(')')) 27155847Sbostic p_ere(p, ')'); 27255847Sbostic if (subno < NPAREN) { 27355847Sbostic p->pend[subno] = HERE(); 27455847Sbostic assert(p->pend[subno] != 0); 27555847Sbostic } 27655847Sbostic EMIT(ORPAREN, subno); 27755847Sbostic MUSTEAT(')', REG_EPAREN); 27855847Sbostic break; 27955847Sbostic #ifndef POSIX_MISTAKE 28055847Sbostic case ')': /* happens only if no current unmatched ( */ 28155847Sbostic /* 28255847Sbostic * You may ask, why the ifndef? Because I didn't notice 28355847Sbostic * this until slightly too late for 1003.2, and none of the 28455847Sbostic * other 1003.2 regular-expression reviewers noticed it at 28555847Sbostic * all. So an unmatched ) is legal POSIX, at least until 28655847Sbostic * we can get it fixed. 28755847Sbostic */ 28855847Sbostic SETERROR(REG_EPAREN); 28955847Sbostic break; 29055847Sbostic #endif 29155847Sbostic case '^': 29255847Sbostic EMIT(OBOL, 0); 29355847Sbostic p->g->iflags |= USEBOL; 29455847Sbostic wascaret = 1; 29555847Sbostic break; 29655847Sbostic case '$': 29755847Sbostic EMIT(OEOL, 0); 29855847Sbostic p->g->iflags |= USEEOL; 29955847Sbostic break; 30055847Sbostic case '|': 30155847Sbostic SETERROR(REG_EMPTY); 30255847Sbostic break; 30355847Sbostic case '*': 30455847Sbostic case '+': 30555847Sbostic case '?': 30655847Sbostic SETERROR(REG_BADRPT); 30755847Sbostic break; 30855847Sbostic case '.': 30955847Sbostic if (p->g->cflags®_NEWLINE) 31055847Sbostic nonnewline(p); 31155847Sbostic else 31255847Sbostic EMIT(OANY, 0); 31355847Sbostic break; 31455847Sbostic case '[': 31555847Sbostic p_bracket(p); 31655847Sbostic break; 31755847Sbostic case '\\': 31855847Sbostic c = GETNEXT(); 31956355Sbostic #ifdef xxx 32055847Sbostic if (c == '^' || c == '.' || c == '[' || c == '$' || 32155847Sbostic c == '(' || c == ')' || c == '|' || 32255847Sbostic c == '*' || c == '+' || c == '?' || 32355847Sbostic c == '{' || c == '\\') 32456355Sbostic #else 32556355Sbostic if (c != '\0') 32656355Sbostic #endif 32755847Sbostic ordinary(p, c); 32855847Sbostic else 32955847Sbostic SETERROR(REG_EESCAPE); 33055847Sbostic break; 33155847Sbostic case '{': /* okay as ordinary except if digit follows */ 33255847Sbostic REQUIRE(!isdigit(PEEK()), REG_BADRPT); 33355847Sbostic /* FALLTHROUGH */ 33455847Sbostic default: 33555847Sbostic ordinary(p, c); 33655847Sbostic break; 33755847Sbostic } 33855847Sbostic 33955847Sbostic c = PEEK(); 34055847Sbostic if (!ISRPT(c, PEEK2())) 34155847Sbostic return; /* no repetition, we're done */ 34255847Sbostic NEXT(); 34355847Sbostic 34455847Sbostic REQUIRE(!wascaret, REG_BADRPT); 34555847Sbostic switch (c) { 34655847Sbostic case '*': /* implemented as +? */ 34755847Sbostic INSERT(OPLUS_, pos); 34855847Sbostic BACK(O_PLUS, pos); 34955847Sbostic INSERT(OQUEST_, pos); 35055847Sbostic BACK(O_QUEST, pos); 35155847Sbostic break; 35255847Sbostic case '+': 35355847Sbostic INSERT(OPLUS_, pos); 35455847Sbostic BACK(O_PLUS, pos); 35555847Sbostic break; 35655847Sbostic case '?': 35755847Sbostic INSERT(OQUEST_, pos); 35855847Sbostic BACK(O_QUEST, pos); 35955847Sbostic break; 36055847Sbostic case '{': 36155847Sbostic count = p_count(p); 36255847Sbostic if (EAT(',')) { 36355847Sbostic if (isdigit(PEEK())) { 36455847Sbostic count2 = p_count(p); 36555847Sbostic REQUIRE(count <= count2, REG_BADBR); 36655847Sbostic } else /* single number with comma */ 36755847Sbostic count2 = INFINITY; 36855847Sbostic } else /* just a single number */ 36955847Sbostic count2 = count; 37055847Sbostic repeat(p, pos, count, count2); 37155847Sbostic if (!EAT('}')) { /* error heuristics */ 37255847Sbostic while ((c = PEEK()) != '\0' && c != '}') 37355847Sbostic NEXT(); 37455847Sbostic if (c == '\0') 37555847Sbostic SETERROR(REG_EBRACE); 37655847Sbostic else 37755847Sbostic SETERROR(REG_BADBR); 37855847Sbostic } 37955847Sbostic break; 38055847Sbostic } 38155847Sbostic 38255847Sbostic c = PEEK(); 38355847Sbostic REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 38455847Sbostic } 38555847Sbostic 38655847Sbostic /* 38755847Sbostic - p_bre - BRE parser top level, anchoring and concatenation 38856355Sbostic * Giving end1 as '\0' essentially eliminates the end1/end2 check. 38955847Sbostic * 39056355Sbostic * This implementation is a bit of a kludge, in that a trailing $ is first 39156355Sbostic * taken as an ordinary character and then revised to be an anchor. The 39256355Sbostic * only undesirable side effect is that '$' gets included as a character 39356355Sbostic * category in such cases. This is fairly harmless; not worth fixing. 39456355Sbostic * The amount of lookahead needed to avoid this kludge is excessive, 39556355Sbostic * especially since things like "$*" appear to be legal. xxx 39655847Sbostic */ 39755847Sbostic static void 39855847Sbostic p_bre(p, end1, end2) 39955847Sbostic register struct parse *p; 400*56357Sbostic register u_int end1; /* first terminating character */ 401*56357Sbostic register u_int end2; /* second terminating character */ 40255847Sbostic { 40355847Sbostic register sopno start = HERE(); 40455847Sbostic register int first = 1; /* first subexpression? */ 40556355Sbostic register int wasdollar = 0; 40655847Sbostic 40755847Sbostic if (EAT('^')) { 40855847Sbostic EMIT(OBOL, 0); 40955847Sbostic p->g->iflags |= USEBOL; 41055847Sbostic } 41155847Sbostic while (!SEE('\0') && !SEETWO(end1, end2)) { 41255847Sbostic wasdollar = p_simp_re(p, first); 41355847Sbostic first = 0; 41455847Sbostic } 41555847Sbostic if (wasdollar) { /* oops, that was a trailing anchor */ 41655847Sbostic DROP(1); 41755847Sbostic EMIT(OEOL, 0); 41855847Sbostic p->g->iflags |= USEEOL; 41955847Sbostic } 42055847Sbostic 42155847Sbostic REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 42255847Sbostic } 42355847Sbostic 42455847Sbostic /* 42555847Sbostic - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 42655847Sbostic */ 42755847Sbostic static int /* was the simple RE an unbackslashed $? */ 42855847Sbostic p_simp_re(p, starordinary) 42955847Sbostic register struct parse *p; 43055847Sbostic int starordinary; /* is a leading * an ordinary character? */ 43155847Sbostic { 43255847Sbostic register int c; 43355847Sbostic register int count; 43455847Sbostic register int count2; 43555847Sbostic register sopno pos; 43655847Sbostic register int i; 43755847Sbostic register sopno subno; 43855847Sbostic # define BACKSL (1<<CHAR_BIT) 43955847Sbostic 44055847Sbostic pos = HERE(); /* repetion op, if any, covers from here */ 44155847Sbostic 44255847Sbostic c = GETNEXT(); 44355847Sbostic assert(c != '\0'); /* caller should have ensured this */ 44455847Sbostic if (c == '\\') 44555847Sbostic c = BACKSL | PEEK(); 44655847Sbostic switch (c) { 44755847Sbostic case '.': 44855847Sbostic if (p->g->cflags®_NEWLINE) 44955847Sbostic nonnewline(p); 45055847Sbostic else 45155847Sbostic EMIT(OANY, 0); 45255847Sbostic break; 45355847Sbostic case '[': 45455847Sbostic p_bracket(p); 45555847Sbostic break; 45655847Sbostic case BACKSL|'{': 45755847Sbostic SETERROR(REG_BADRPT); 45855847Sbostic break; 45955847Sbostic case BACKSL|'(': 46055847Sbostic NEXT(); 46155847Sbostic p->g->nsub++; 46255847Sbostic subno = p->g->nsub; 46355847Sbostic if (subno < NPAREN) 46455847Sbostic p->pbegin[subno] = HERE(); 46555847Sbostic EMIT(OLPAREN, subno); 46655847Sbostic /* the SEE here is an error heuristic */ 46755847Sbostic if (!SEE('\0') && !SEETWO('\\', ')')) 46855847Sbostic p_bre(p, '\\', ')'); 46955847Sbostic if (subno < NPAREN) { 47055847Sbostic p->pend[subno] = HERE(); 47155847Sbostic assert(p->pend[subno] != 0); 47255847Sbostic } 47355847Sbostic EMIT(ORPAREN, subno); 47455847Sbostic REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 47555847Sbostic break; 47655847Sbostic case BACKSL|')': /* should not get here -- must be user */ 47755847Sbostic case BACKSL|'}': 47855847Sbostic SETERROR(REG_EPAREN); 47955847Sbostic break; 48055847Sbostic case BACKSL|'1': 48155847Sbostic case BACKSL|'2': 48255847Sbostic case BACKSL|'3': 48355847Sbostic case BACKSL|'4': 48455847Sbostic case BACKSL|'5': 48555847Sbostic case BACKSL|'6': 48655847Sbostic case BACKSL|'7': 48755847Sbostic case BACKSL|'8': 48855847Sbostic case BACKSL|'9': 48955847Sbostic i = (c&~BACKSL) - '0'; 49055847Sbostic assert(i < NPAREN); 49155847Sbostic if (p->pend[i] != 0) { 49255847Sbostic assert(i <= p->g->nsub); 49355847Sbostic EMIT(OBACK_, i); 49455847Sbostic assert(p->pbegin[i] != 0); 49555847Sbostic assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 49655847Sbostic assert(OP(p->strip[p->pend[i]]) == ORPAREN); 49755847Sbostic (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 49855847Sbostic EMIT(O_BACK, i); 49955847Sbostic } else 50055847Sbostic SETERROR(REG_ESUBREG); 50155847Sbostic p->g->backrefs = 1; 50255847Sbostic NEXT(); 50355847Sbostic break; 50456355Sbostic case BACKSL|'\0': 50556355Sbostic SETERROR(REG_EESCAPE); 50656355Sbostic break; 50755847Sbostic case '*': 50855847Sbostic REQUIRE(starordinary, REG_BADRPT); 50955847Sbostic /* FALLTHROUGH */ 51055847Sbostic default: 51156355Sbostic if (c & BACKSL) 51256355Sbostic c = GETNEXT(); 51355847Sbostic ordinary(p, (uchar)c); 51455847Sbostic break; 51555847Sbostic } 51655847Sbostic 51755847Sbostic if (EAT('*')) { /* implemented as +? */ 51855847Sbostic INSERT(OPLUS_, pos); 51955847Sbostic BACK(O_PLUS, pos); 52055847Sbostic INSERT(OQUEST_, pos); 52155847Sbostic BACK(O_QUEST, pos); 52255847Sbostic } else if (EATTWO('\\', '{')) { 52355847Sbostic count = p_count(p); 52455847Sbostic if (EAT(',')) { 52555847Sbostic if (isdigit(PEEK())) { 52655847Sbostic count2 = p_count(p); 52755847Sbostic REQUIRE(count <= count2, REG_BADBR); 52855847Sbostic } else /* single number with comma */ 52955847Sbostic count2 = INFINITY; 53055847Sbostic } else /* just a single number */ 53155847Sbostic count2 = count; 53255847Sbostic repeat(p, pos, count, count2); 53355847Sbostic if (!EATTWO('\\', '}')) { /* error heuristics */ 53455847Sbostic while (!SEE('\0') && !SEETWO('\\', '}')) 53555847Sbostic NEXT(); 53655847Sbostic if (SEE('\0')) 53755847Sbostic SETERROR(REG_EBRACE); 53855847Sbostic else 53955847Sbostic SETERROR(REG_BADBR); 54055847Sbostic } 54155847Sbostic } else if (c == '$') /* unbackslashed $ not followed by reptn */ 54255847Sbostic return(1); 54355847Sbostic 54455847Sbostic return(0); 54555847Sbostic } 54655847Sbostic 54755847Sbostic /* 54855847Sbostic - p_count - parse a repetition count 54955847Sbostic */ 55055847Sbostic static int /* the value */ 55155847Sbostic p_count(p) 55255847Sbostic register struct parse *p; 55355847Sbostic { 55455847Sbostic register int count = 0; 55555847Sbostic register int ndigits = 0; 55655847Sbostic 55755847Sbostic while (isdigit(PEEK()) && count <= DUPMAX) { 55855847Sbostic count = count*10 + (GETNEXT() - '0'); 55955847Sbostic ndigits++; 56055847Sbostic } 56155847Sbostic 56255847Sbostic REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 56355847Sbostic return(count); 56455847Sbostic } 56555847Sbostic 56655847Sbostic /* 56755847Sbostic - p_bracket - parse a bracketed character list 56855847Sbostic * 56955847Sbostic * Note a significant property of this code: if the allocset() did SETERROR, 57055847Sbostic * no set operations are done. 57155847Sbostic */ 57255847Sbostic static void 57355847Sbostic p_bracket(p) 57455847Sbostic register struct parse *p; 57555847Sbostic { 57655847Sbostic register uchar c; 57755847Sbostic register cset *cs = allocset(p); 57855847Sbostic register int invert = 0; 57955847Sbostic 58055847Sbostic if (EAT('^')) 58155847Sbostic invert++; /* make note to invert set at end */ 58255847Sbostic if (EAT(']')) 58355847Sbostic CHadd(cs, ']'); 58455847Sbostic while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 58555847Sbostic p_b_term(p, cs); 58655847Sbostic if (EAT('-')) 58755847Sbostic CHadd(cs, '-'); 58855847Sbostic MUSTEAT(']', REG_EBRACK); 58955847Sbostic 59055847Sbostic if (invert && p->error == 0) { 59155847Sbostic register int i; 59255847Sbostic 59355847Sbostic for (i = p->g->csetsize - 1; i >= 0; i--) 59455847Sbostic if (CHIN(cs, i)) 59555847Sbostic CHsub(cs, i); 59655847Sbostic else 59755847Sbostic CHadd(cs, i); 59855847Sbostic if (p->g->cflags®_NEWLINE) 59955847Sbostic CHsub(cs, '\n'); 60055847Sbostic if (cs->multis != NULL) 60155847Sbostic mcinvert(p, cs); 60255847Sbostic } 60355847Sbostic assert(cs->multis == NULL); /* xxx */ 60455847Sbostic EMIT(OANYOF, freezeset(p, cs)); 60555847Sbostic } 60655847Sbostic 60755847Sbostic /* 60855847Sbostic - p_b_term - parse one term of a bracketed character list 60955847Sbostic */ 61055847Sbostic static void 61155847Sbostic p_b_term(p, cs) 61255847Sbostic register struct parse *p; 61355847Sbostic register cset *cs; 61455847Sbostic { 61555847Sbostic register uchar c; 61655847Sbostic register uchar start, finish; 61755847Sbostic register int i; 61855847Sbostic 61955847Sbostic /* classify what we've got */ 62055847Sbostic switch (PEEK()) { 62155847Sbostic case '[': 62255847Sbostic c = PEEK2(); 62355847Sbostic break; 62455847Sbostic case '-': 62555847Sbostic SETERROR(REG_ERANGE); 62655847Sbostic return; /* NOTE RETURN */ 62755847Sbostic break; 62855847Sbostic default: 62955847Sbostic c = '\0'; 63055847Sbostic break; 63155847Sbostic } 63255847Sbostic 63355847Sbostic switch (c) { 63455847Sbostic case ':': /* character class */ 63555847Sbostic NEXT2(); 63655847Sbostic c = PEEK(); 63755847Sbostic REQUIRE(c != '\0', REG_EBRACK); 63855847Sbostic REQUIRE(c != '-' && c != ']', REG_ECTYPE); 63955847Sbostic p_b_cclass(p, cs); 64055847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 64155847Sbostic REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 64255847Sbostic break; 64355847Sbostic case '=': /* equivalence class */ 64455847Sbostic NEXT2(); 64555847Sbostic c = PEEK(); 64655847Sbostic REQUIRE(c != '\0', REG_EBRACK); 64755847Sbostic REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 64855847Sbostic p_b_eclass(p, cs); 64955847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 65055847Sbostic REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 65155847Sbostic break; 65255847Sbostic default: /* symbol, ordinary character, or range */ 65355847Sbostic /* xxx revision needed for multichar stuff */ 65455847Sbostic start = p_b_symbol(p); 65555847Sbostic if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 65655847Sbostic /* range */ 65755847Sbostic NEXT(); 65855847Sbostic if (EAT('-')) 65955847Sbostic finish = '-'; 66055847Sbostic else 66155847Sbostic finish = p_b_symbol(p); 66255847Sbostic } else 66355847Sbostic finish = start; 66455847Sbostic REQUIRE(start <= finish, REG_ERANGE); 66555847Sbostic for (i = start; i <= finish; i++) { 66655847Sbostic CHadd(cs, i); 66755847Sbostic if ((p->g->cflags®_ICASE) && isalpha(i)) { 66855847Sbostic c = othercase((uchar)i); 66955847Sbostic CHadd(cs, c); 67055847Sbostic } 67155847Sbostic } 67255847Sbostic break; 67355847Sbostic } 67455847Sbostic } 67555847Sbostic 67655847Sbostic /* 67755847Sbostic - p_b_cclass - parse a character-class name and deal with it 67855847Sbostic */ 67955847Sbostic static void 68055847Sbostic p_b_cclass(p, cs) 68155847Sbostic register struct parse *p; 68255847Sbostic register cset *cs; 68355847Sbostic { 68455847Sbostic register uchar *sb = p->next; 68555847Sbostic register uchar *se = sb; 68655847Sbostic register struct cclass *cp; 68755847Sbostic register int len; 68855847Sbostic register uchar *u; 68955847Sbostic register uchar c; 69055847Sbostic 69155847Sbostic while (isalpha(*se)) 69255847Sbostic se++; 69355847Sbostic len = se - sb; 69455847Sbostic NEXTn(len); 69555847Sbostic for (cp = cclasses; cp->name != NULL; cp++) 69655847Sbostic if (strncmp(cp->name, (char *)sb, len) == 0 && 69756355Sbostic cp->name[len] == '\0') 69855847Sbostic break; 69955847Sbostic if (cp->name == NULL) { 70055847Sbostic /* oops, didn't find it */ 70155847Sbostic SETERROR(REG_ECTYPE); 70255847Sbostic return; 70355847Sbostic } 70455847Sbostic 70555847Sbostic u = (uchar *)cp->chars; 70655847Sbostic while ((c = *u++) != '\0') 70755847Sbostic CHadd(cs, c); 70855847Sbostic for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 70955847Sbostic MCadd(cs, u); 71055847Sbostic } 71155847Sbostic 71255847Sbostic /* 71355847Sbostic - p_b_eclass - parse an equivalence-class name and deal with it 71455847Sbostic * 71555847Sbostic * This implementation is incomplete. xxx 71655847Sbostic */ 71755847Sbostic static void 71855847Sbostic p_b_eclass(p, cs) 71955847Sbostic register struct parse *p; 72055847Sbostic register cset *cs; 72155847Sbostic { 72255847Sbostic register uchar c; 72355847Sbostic 72455847Sbostic c = p_b_coll_elem(p, '='); 72555847Sbostic CHadd(cs, c); 72655847Sbostic } 72755847Sbostic 72855847Sbostic /* 72955847Sbostic - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 73055847Sbostic */ 73155847Sbostic static uchar /* value of symbol */ 73255847Sbostic p_b_symbol(p) 73355847Sbostic register struct parse *p; 73455847Sbostic { 73555847Sbostic register uchar value; 73655847Sbostic 73755847Sbostic if (!EATTWO('[', '.')) { 73855847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 73955847Sbostic return(GETNEXT()); 74055847Sbostic } 74155847Sbostic 74255847Sbostic /* collating symbol */ 74355847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 74455847Sbostic value = p_b_coll_elem(p, '.'); 74555847Sbostic REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 74655847Sbostic return(value); 74755847Sbostic } 74855847Sbostic 74955847Sbostic /* 75055847Sbostic - p_b_coll_elem - parse a collating-element name and look it up 75155847Sbostic */ 75255847Sbostic static uchar /* value of collating element */ 75355847Sbostic p_b_coll_elem(p, endc) 75455847Sbostic register struct parse *p; 755*56357Sbostic u_int endc; /* name ended by endc,']' */ 75655847Sbostic { 75755847Sbostic register uchar *sp = p->next; 75855847Sbostic register struct cname *cp; 75955847Sbostic register int len; 76055847Sbostic register uchar c; 76155847Sbostic 76255847Sbostic while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 76355847Sbostic NEXT(); 76455847Sbostic if (c == '\0') { 76555847Sbostic SETERROR(REG_EBRACK); 76655847Sbostic return(0); 76755847Sbostic } 76855847Sbostic len = p->next - sp; 76955847Sbostic for (cp = cnames; cp->name != NULL; cp++) 77055847Sbostic if (strncmp(cp->name, (char *)sp, len) == 0 && 77156355Sbostic cp->name[len] == '\0') 77255847Sbostic return(cp->code); /* known name */ 77355847Sbostic if (len == 1) 77455847Sbostic return(*sp); /* single character */ 77555847Sbostic SETERROR(REG_ECOLLATE); /* neither */ 77655847Sbostic return(0); 77755847Sbostic } 77855847Sbostic 77955847Sbostic /* 78055847Sbostic - othercase - return the case counterpart of an alphabetic 78155847Sbostic */ 78255847Sbostic static uchar 78355847Sbostic othercase(ch) 784*56357Sbostic u_int ch; 78555847Sbostic { 78655847Sbostic assert(isalpha(ch)); 78755847Sbostic if (isupper(ch)) 78855847Sbostic return(tolower(ch)); 78955847Sbostic else if (islower(ch)) 79055847Sbostic return(toupper(ch)); 79155847Sbostic else /* peculiar, but could happen */ 79255847Sbostic return(ch); 79355847Sbostic } 79455847Sbostic 79555847Sbostic /* 79655847Sbostic - bothcases - emit a dualcase version of a character 79756355Sbostic * 79855847Sbostic * Boy, is this implementation ever a kludge... 79955847Sbostic */ 80055847Sbostic static void 80155847Sbostic bothcases(p, ch) 80255847Sbostic register struct parse *p; 803*56357Sbostic u_int ch; 80455847Sbostic { 80555847Sbostic register uchar *oldnext; 80655847Sbostic uchar bracket[3]; 80755847Sbostic 80855847Sbostic oldnext = p->next; 80955847Sbostic p->next = bracket; 81055847Sbostic bracket[0] = ch; 81155847Sbostic bracket[1] = ']'; 81255847Sbostic bracket[2] = '\0'; 81355847Sbostic p_bracket(p); 81455847Sbostic assert(p->next == bracket+2); 81555847Sbostic p->next = oldnext; 81655847Sbostic } 81755847Sbostic 81855847Sbostic /* 81955847Sbostic - ordinary - emit an ordinary character 82055847Sbostic */ 82155847Sbostic static void 82255847Sbostic ordinary(p, ch) 82355847Sbostic register struct parse *p; 824*56357Sbostic register u_int ch; 82555847Sbostic { 82655847Sbostic register uchar *cap = p->g->categories; 82755847Sbostic 82855847Sbostic if ((p->g->cflags®_ICASE) && isalpha(ch)) { 82955847Sbostic bothcases(p, ch); 83055847Sbostic return; 83155847Sbostic } 83255847Sbostic 83355847Sbostic EMIT(OCHAR, ch); 83455847Sbostic if (cap[ch] == 0) 83555847Sbostic cap[ch] = p->g->ncategories++; 83655847Sbostic } 83755847Sbostic 83855847Sbostic /* 83955847Sbostic - nonnewline - emit REG_NEWLINE version of OANY 84056355Sbostic * 84155847Sbostic * Boy, is this implementation ever a kludge... 84255847Sbostic */ 84355847Sbostic static void 84455847Sbostic nonnewline(p) 84555847Sbostic register struct parse *p; 84655847Sbostic { 84755847Sbostic register uchar *oldnext; 84855847Sbostic uchar bracket[4]; 84955847Sbostic 85055847Sbostic oldnext = p->next; 85155847Sbostic p->next = bracket; 85255847Sbostic bracket[0] = '^'; 85355847Sbostic bracket[1] = '\n'; 85455847Sbostic bracket[2] = ']'; 85555847Sbostic bracket[3] = '\0'; 85655847Sbostic p_bracket(p); 85755847Sbostic assert(p->next == bracket+3); 85855847Sbostic p->next = oldnext; 85955847Sbostic } 86055847Sbostic 86155847Sbostic /* 86255847Sbostic - repeat - generate code for a bounded repetition, recursively if needed 86355847Sbostic */ 86455847Sbostic static void 86555847Sbostic repeat(p, start, from, to) 86655847Sbostic register struct parse *p; 86755847Sbostic sopno start; /* operand from here to end of strip */ 86855847Sbostic int from; /* repeated from this number */ 86955847Sbostic int to; /* to this number of times (maybe INFINITY) */ 87055847Sbostic { 87155847Sbostic register sopno finish = HERE(); 87255847Sbostic # define N 2 87355847Sbostic # define INF 3 87455847Sbostic # define REP(f, t) ((f)*8 + (t)) 87555847Sbostic # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 87655847Sbostic register sopno copy; 87755847Sbostic 87855847Sbostic if (p->error != 0) /* head off possible runaway recursion */ 87955847Sbostic return; 88055847Sbostic 88155847Sbostic assert(from <= to); 88255847Sbostic 88355847Sbostic switch (REP(MAP(from), MAP(to))) { 88455847Sbostic case REP(0, 0): /* must be user doing this */ 88555847Sbostic DROP(finish-start); /* drop the operand */ 88655847Sbostic break; 88755847Sbostic case REP(0, 1): /* as x{1,1}? */ 88855847Sbostic case REP(0, N): /* as x{1,n}? */ 88955847Sbostic case REP(0, INF): /* as x{1,}? */ 89055847Sbostic INSERT(OQUEST_, start); /* offset is wrong... */ 89155847Sbostic repeat(p, start+1, 1, to); 89255847Sbostic FWD(start); /* ... fix it */ 89355847Sbostic BACK(O_QUEST, start); 89455847Sbostic break; 89555847Sbostic case REP(1, 1): /* trivial case */ 89655847Sbostic /* done */ 89755847Sbostic break; 89855847Sbostic case REP(1, N): /* as x?x{1,n-1} */ 89955847Sbostic INSERT(OQUEST_, start); 90055847Sbostic BACK(O_QUEST, start); 90155847Sbostic copy = dupl(p, start+1, finish+1); 90255847Sbostic assert(copy == finish+2); 90355847Sbostic repeat(p, copy, 1, to-1); 90455847Sbostic break; 90555847Sbostic case REP(1, INF): /* as x+ */ 90655847Sbostic INSERT(OPLUS_, start); 90755847Sbostic BACK(O_PLUS, start); 90855847Sbostic break; 90955847Sbostic case REP(N, N): /* as xx{m-1,n-1} */ 91055847Sbostic copy = dupl(p, start, finish); 91155847Sbostic repeat(p, copy, from-1, to-1); 91255847Sbostic break; 91355847Sbostic case REP(N, INF): /* as xx{n-1,INF} */ 91455847Sbostic copy = dupl(p, start, finish); 91555847Sbostic repeat(p, copy, from-1, to); 91655847Sbostic break; 91755847Sbostic default: /* "can't happen" */ 91855847Sbostic SETERROR(REG_ASSERT); /* just in case */ 91955847Sbostic break; 92055847Sbostic } 92155847Sbostic } 92255847Sbostic 92355847Sbostic /* 92455847Sbostic - seterr - set an error condition 92555847Sbostic */ 92655847Sbostic static int /* useless but makes type checking happy */ 92755847Sbostic seterr(p, e) 92855847Sbostic register struct parse *p; 92955847Sbostic int e; 93055847Sbostic { 93155847Sbostic if (p->error == 0) /* keep earliest error condition */ 93255847Sbostic p->error = e; 93355847Sbostic p->next = nuls; /* try to bring things to a halt */ 93455847Sbostic return(0); /* make the return value well-defined */ 93555847Sbostic } 93655847Sbostic 93755847Sbostic /* 93855847Sbostic - allocset - allocate a set of characters for [] 93955847Sbostic */ 94055847Sbostic static cset * 94155847Sbostic allocset(p) 94255847Sbostic register struct parse *p; 94355847Sbostic { 94455847Sbostic register int no = p->g->ncsets++; 94555847Sbostic register size_t nc; 94655847Sbostic register size_t nbytes; 94755847Sbostic register cset *cs; 94855847Sbostic register size_t css = (size_t)p->g->csetsize; 94955847Sbostic 95055847Sbostic if (no >= p->ncsalloc) { /* need another column of space */ 95155847Sbostic p->ncsalloc += CHAR_BIT; 95255847Sbostic nc = p->ncsalloc; 95355847Sbostic assert(nc % CHAR_BIT == 0); 95455847Sbostic nbytes = nc / CHAR_BIT * css; 95555847Sbostic if (p->g->sets == NULL) 95655847Sbostic p->g->sets = (cset *)malloc(nc * sizeof(cset)); 95755847Sbostic else 95855847Sbostic p->g->sets = (cset *)realloc((char *)p->g->sets, 95955847Sbostic nc * sizeof(cset)); 96055847Sbostic if (p->g->setbits == NULL) 96155847Sbostic p->g->setbits = (uchar *)malloc(nbytes); 96255847Sbostic else 96355847Sbostic p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 96455847Sbostic nbytes); 96555847Sbostic if (p->g->sets != NULL && p->g->setbits != NULL) 96655847Sbostic (void) memset((char *)p->g->setbits + (nbytes - css), 96755847Sbostic 0, css); 96855847Sbostic else { 96955847Sbostic no = 0; 97055847Sbostic SETERROR(REG_ESPACE); 97155847Sbostic /* caller's responsibility not to do set ops */ 97255847Sbostic } 97355847Sbostic } 97455847Sbostic 97555847Sbostic assert(p->g->sets != NULL); /* xxx */ 97655847Sbostic cs = &p->g->sets[no]; 97755847Sbostic cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 97855847Sbostic cs->mask = 1 << ((no) % CHAR_BIT); 97955847Sbostic cs->hash = 0; 98055847Sbostic cs->smultis = 0; 98155847Sbostic cs->multis = NULL; 98255847Sbostic 98355847Sbostic return(cs); 98455847Sbostic } 98555847Sbostic 98655847Sbostic /* 98755847Sbostic - freezeset - final processing on a set of characters 98855847Sbostic * 98955847Sbostic * The main task here is merging identical sets. This is usually a waste 99055847Sbostic * of time (although the hash code minimizes the overhead), but can win 99155847Sbostic * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 99255847Sbostic * is done using addition rather than xor -- all ASCII [aA] sets xor to 99355847Sbostic * the same value! 99455847Sbostic */ 99555847Sbostic static int /* set number */ 99655847Sbostic freezeset(p, cs) 99755847Sbostic register struct parse *p; 99855847Sbostic register cset *cs; 99955847Sbostic { 100055847Sbostic register uchar h = cs->hash; 100155847Sbostic register int i; 100255847Sbostic register cset *top = &p->g->sets[p->g->ncsets]; 100355847Sbostic register cset *cs2; 100455847Sbostic register size_t css = (size_t)p->g->csetsize; 100555847Sbostic 100655847Sbostic /* look for an earlier one which is the same */ 100755847Sbostic for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 100855847Sbostic if (cs2->hash == h && cs2 != cs) { 100955847Sbostic /* maybe */ 101055847Sbostic for (i = 0; i < css; i++) 101155847Sbostic if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 101255847Sbostic break; /* no */ 101355847Sbostic if (i == css) 101455847Sbostic break; /* yes */ 101555847Sbostic } 101655847Sbostic 101755847Sbostic if (cs2 < top) { /* found one */ 101855847Sbostic assert(cs == top-1); 101955847Sbostic p->g->ncsets--; 102055847Sbostic for (i = 0; i < css; i++) 102155847Sbostic CHsub(cs, i); 102255847Sbostic cs = cs2; 102355847Sbostic } 102455847Sbostic 102555847Sbostic return((int)(cs - p->g->sets)); 102655847Sbostic } 102755847Sbostic 102855847Sbostic /* 102955847Sbostic - mcadd - add a collating element to a cset 103055847Sbostic */ 103155847Sbostic static void 103255847Sbostic mcadd(p, cs, cp) 103355847Sbostic register struct parse *p; 103455847Sbostic register cset *cs; 103555847Sbostic register uchar *cp; 103655847Sbostic { 103755847Sbostic register size_t oldend = cs->smultis; 103855847Sbostic 103955847Sbostic cs->smultis += strlen((char *)cp) + 1; 104055847Sbostic if (cs->multis == NULL) 104155847Sbostic cs->multis = (uchar *)malloc(cs->smultis); 104255847Sbostic else 104355847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 104455847Sbostic if (cs->multis == NULL) { 104555847Sbostic SETERROR(REG_ESPACE); 104655847Sbostic return; 104755847Sbostic } 104855847Sbostic 104955847Sbostic (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 105055847Sbostic cs->multis[cs->smultis - 1] = '\0'; 105155847Sbostic } 105255847Sbostic 105355847Sbostic /* 105455847Sbostic - mcsub - subtract a collating element from a cset 105555847Sbostic */ 105655847Sbostic static void 105755847Sbostic mcsub(p, cs, cp) 105855847Sbostic register struct parse *p; 105955847Sbostic register cset *cs; 1060*56357Sbostic register u_int *cp; 106155847Sbostic { 106255847Sbostic register uchar *fp = mcfind(cs, cp); 106355847Sbostic register size_t len = strlen((char *)fp); 106455847Sbostic 106555847Sbostic assert(p != NULL); 106655847Sbostic (void) memmove((char *)fp, (char *)(fp + len + 1), 106755847Sbostic cs->smultis - (fp + len + 1 - cs->multis)); 106855847Sbostic cs->smultis -= len; 106955847Sbostic 107055847Sbostic if (cs->smultis == 0) { 107155847Sbostic free((char *)cs->multis); 107255847Sbostic cs->multis = NULL; 107355847Sbostic return; 107455847Sbostic } 107555847Sbostic 107655847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 107755847Sbostic assert(cs->multis != NULL); 107855847Sbostic } 107955847Sbostic 108055847Sbostic /* 108155847Sbostic - mcin - is a collating element in a cset? 108255847Sbostic */ 108355847Sbostic static int 108455847Sbostic mcin(p, cs, cp) 108555847Sbostic register struct parse *p; 108655847Sbostic register cset *cs; 1087*56357Sbostic register u_int *cp; 108855847Sbostic { 108955847Sbostic return(mcfind(cs, cp) != NULL); 109055847Sbostic } 109155847Sbostic 109255847Sbostic /* 109355847Sbostic - mcfind - find a collating element in a cset 109455847Sbostic */ 109555847Sbostic static uchar * 109655847Sbostic mcfind(cs, cp) 109755847Sbostic register cset *cs; 1098*56357Sbostic register u_int *cp; 109955847Sbostic { 110055847Sbostic register uchar *p; 110155847Sbostic 110255847Sbostic if (cs->multis == NULL) 110355847Sbostic return(NULL); 110455847Sbostic for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 110555847Sbostic if (strcmp((char *)cp, (char *)p) == 0) 110655847Sbostic return(p); 110755847Sbostic return(NULL); 110855847Sbostic } 110955847Sbostic 111055847Sbostic /* 111155847Sbostic - mcinvert - invert the list of collating elements in a cset 111255847Sbostic * 111355847Sbostic * This would have to know the set of possibilities. Implementation 111455847Sbostic * is deferred. 111555847Sbostic */ 111655847Sbostic static void 111755847Sbostic mcinvert(p, cs) 111855847Sbostic register struct parse *p; 111955847Sbostic register cset *cs; 112055847Sbostic { 112155847Sbostic assert(cs->multis == NULL); /* xxx */ 112255847Sbostic } 112355847Sbostic 112455847Sbostic /* 112555847Sbostic - isinsets - is this character in any sets? 112655847Sbostic */ 112755847Sbostic static int /* predicate */ 112855847Sbostic isinsets(g, c) 112955847Sbostic register struct re_guts *g; 1130*56357Sbostic u_int c; 113155847Sbostic { 113255847Sbostic register uchar *col; 113355847Sbostic register int i; 113455847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 113555847Sbostic 113655847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 113755847Sbostic if (col[c] != 0) 113855847Sbostic return(1); 113955847Sbostic return(0); 114055847Sbostic } 114155847Sbostic 114255847Sbostic /* 114355847Sbostic - samesets - are these two characters in exactly the same sets? 114455847Sbostic */ 114555847Sbostic static int /* predicate */ 114655847Sbostic samesets(g, c1, c2) 114755847Sbostic register struct re_guts *g; 1148*56357Sbostic register u_int c1; 1149*56357Sbostic register u_int c2; 115055847Sbostic { 115155847Sbostic register uchar *col; 115255847Sbostic register int i; 115355847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 115455847Sbostic 115555847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 115655847Sbostic if (col[c1] != col[c2]) 115755847Sbostic return(0); 115855847Sbostic return(1); 115955847Sbostic } 116055847Sbostic 116155847Sbostic /* 116255847Sbostic - categorize - sort out character categories 116355847Sbostic */ 116455847Sbostic static void 116555847Sbostic categorize(p, g) 116655847Sbostic struct parse *p; 116755847Sbostic register struct re_guts *g; 116855847Sbostic { 116955847Sbostic register uchar *cats = g->categories; 117055847Sbostic register unsigned c; 117155847Sbostic register unsigned c2; 117255847Sbostic register uchar cat; 117355847Sbostic 117455847Sbostic /* avoid making error situations worse */ 117555847Sbostic if (p->error != 0) 117655847Sbostic return; 117755847Sbostic 117855847Sbostic for (c = 0; c < g->csetsize; c++) 117955847Sbostic if (cats[c] == 0 && isinsets(g, c)) { 118055847Sbostic cat = g->ncategories++; 118155847Sbostic cats[c] = cat; 118255847Sbostic for (c2 = c+1; c2 < g->csetsize; c2++) 118355847Sbostic if (cats[c2] == 0 && samesets(g, c, c2)) 118455847Sbostic cats[c2] = cat; 118555847Sbostic } 118655847Sbostic } 118755847Sbostic 118855847Sbostic /* 118955847Sbostic - dupl - emit a duplicate of a bunch of sops 119055847Sbostic */ 119155847Sbostic static sopno /* start of duplicate */ 119255847Sbostic dupl(p, start, finish) 119355847Sbostic register struct parse *p; 119455847Sbostic sopno start; /* from here */ 119555847Sbostic sopno finish; /* to this less one */ 119655847Sbostic { 119755847Sbostic register sopno ret = HERE(); 119855847Sbostic register sopno len = finish - start; 119955847Sbostic 120055847Sbostic assert(finish >= start); 120155847Sbostic if (len == 0) 120255847Sbostic return(ret); 120355847Sbostic enlarge(p, p->ssize + len); /* this many unexpected additions */ 120455847Sbostic assert(p->ssize >= p->slen + len); 120555847Sbostic (void) memcpy((char *)(p->strip + p->slen), 120655847Sbostic (char *)(p->strip + start), (size_t)len*sizeof(sop)); 120755847Sbostic p->slen += len; 120855847Sbostic return(ret); 120955847Sbostic } 121055847Sbostic 121155847Sbostic /* 121255847Sbostic - doemit - emit a strip operator 121355847Sbostic * 121455847Sbostic * It might seem better to implement this as a macro with a function as 121555847Sbostic * hard-case backup, but it's just too big and messy unless there are 121655847Sbostic * some changes to the data structures. Maybe later. 121755847Sbostic */ 121855847Sbostic static void 121955847Sbostic doemit(p, op, opnd) 122055847Sbostic register struct parse *p; 122155847Sbostic sop op; 122255847Sbostic size_t opnd; 122355847Sbostic { 122455847Sbostic /* avoid making error situations worse */ 122555847Sbostic if (p->error != 0) 122655847Sbostic return; 122755847Sbostic 122855847Sbostic /* deal with oversize operands ("can't happen", more or less) */ 122955847Sbostic assert(opnd < 1<<OPSHIFT); 123055847Sbostic 123155847Sbostic /* deal with undersized strip */ 123255847Sbostic if (p->slen >= p->ssize) 123355847Sbostic enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 123455847Sbostic assert(p->slen < p->ssize); 123555847Sbostic 123655847Sbostic /* finally, it's all reduced to the easy case */ 123755847Sbostic p->strip[p->slen++] = SOP(op, opnd); 123855847Sbostic } 123955847Sbostic 124055847Sbostic /* 124155847Sbostic - doinsert - insert a sop into the strip 124255847Sbostic */ 124355847Sbostic static void 124455847Sbostic doinsert(p, op, opnd, pos) 124555847Sbostic register struct parse *p; 124655847Sbostic sop op; 124755847Sbostic size_t opnd; 124855847Sbostic sopno pos; 124955847Sbostic { 125055847Sbostic register sopno sn; 125155847Sbostic register sop s; 125255847Sbostic register int i; 125355847Sbostic 125455847Sbostic /* avoid making error situations worse */ 125555847Sbostic if (p->error != 0) 125655847Sbostic return; 125755847Sbostic 125855847Sbostic sn = HERE(); 125955847Sbostic EMIT(op, opnd); /* do checks, ensure space */ 126055847Sbostic assert(HERE() == sn+1); 126155847Sbostic s = p->strip[sn]; 126255847Sbostic 126355847Sbostic /* adjust paren pointers */ 126455847Sbostic assert(pos > 0); 126555847Sbostic for (i = 1; i < NPAREN; i++) { 126655847Sbostic if (p->pbegin[i] >= pos) { 126755847Sbostic p->pbegin[i]++; 126855847Sbostic } 126955847Sbostic if (p->pend[i] >= pos) { 127055847Sbostic p->pend[i]++; 127155847Sbostic } 127255847Sbostic } 127355847Sbostic 127455847Sbostic memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 127555847Sbostic (HERE()-pos-1)*sizeof(sop)); 127655847Sbostic p->strip[pos] = s; 127755847Sbostic } 127855847Sbostic 127955847Sbostic /* 128055847Sbostic - dofwd - complete a forward reference 128155847Sbostic */ 128255847Sbostic static void 128355847Sbostic dofwd(p, pos, value) 128455847Sbostic register struct parse *p; 128555847Sbostic register sopno pos; 128655847Sbostic sop value; 128755847Sbostic { 128855847Sbostic /* avoid making error situations worse */ 128955847Sbostic if (p->error != 0) 129055847Sbostic return; 129155847Sbostic 129255847Sbostic assert(value < 1<<OPSHIFT); 129355847Sbostic p->strip[pos] = OP(p->strip[pos]) | value; 129455847Sbostic } 129555847Sbostic 129655847Sbostic /* 129755847Sbostic - enlarge - enlarge the strip 129855847Sbostic */ 129955847Sbostic static void 130055847Sbostic enlarge(p, size) 130155847Sbostic register struct parse *p; 130255847Sbostic register sopno size; 130355847Sbostic { 130455847Sbostic register sop *sp; 130555847Sbostic 130655847Sbostic if (p->ssize >= size) 130755847Sbostic return; 130855847Sbostic 130955847Sbostic sp = (sop *)realloc(p->strip, size*sizeof(sop)); 131055847Sbostic if (sp == NULL) { 131155847Sbostic SETERROR(REG_ESPACE); 131255847Sbostic return; 131355847Sbostic } 131455847Sbostic p->strip = sp; 131555847Sbostic p->ssize = size; 131655847Sbostic } 131755847Sbostic 131855847Sbostic /* 131955847Sbostic - stripsnug - compact the strip 132055847Sbostic */ 132155847Sbostic static void 132255847Sbostic stripsnug(p, g) 132355847Sbostic register struct parse *p; 132455847Sbostic register struct re_guts *g; 132555847Sbostic { 132655847Sbostic g->nstates = p->slen; 132755847Sbostic g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 132855847Sbostic if (g->strip == NULL) { 132955847Sbostic SETERROR(REG_ESPACE); 133055847Sbostic g->strip = p->strip; 133155847Sbostic } 133255847Sbostic } 133355847Sbostic 133455847Sbostic /* 133555847Sbostic - findmust - fill in must and mlen with longest mandatory literal string 133655847Sbostic * 133755847Sbostic * This algorithm could do fancy things like analyzing the operands of | 133855847Sbostic * for common subsequences. Someday. This code is simple and finds most 133955847Sbostic * of the interesting cases. 134055847Sbostic * 134155847Sbostic * Note that must and mlen got initialized during setup. 134255847Sbostic */ 134356355Sbostic static void 134455847Sbostic findmust(p, g) 134555847Sbostic struct parse *p; 134655847Sbostic register struct re_guts *g; 134755847Sbostic { 134855847Sbostic register sop *scan; 134955847Sbostic sop *start; 135055847Sbostic register sop *newstart; 135155847Sbostic register sopno newlen; 135255847Sbostic register sop s; 135355847Sbostic register char *cp; 135455847Sbostic register sopno i; 135555847Sbostic 135655847Sbostic /* avoid making error situations worse */ 135755847Sbostic if (p->error != 0) 135855847Sbostic return; 135955847Sbostic 136055847Sbostic /* find the longest OCHAR sequence in strip */ 136155847Sbostic newlen = 0; 136255847Sbostic scan = g->strip + 1; 136355847Sbostic do { 136455847Sbostic s = *scan++; 136555847Sbostic switch (OP(s)) { 136655847Sbostic case OCHAR: /* sequence member */ 136755847Sbostic if (newlen == 0) /* new sequence */ 136855847Sbostic newstart = scan - 1; 136955847Sbostic newlen++; 137055847Sbostic break; 137155847Sbostic case OPLUS_: /* things that don't break one */ 137255847Sbostic case OLPAREN: 137355847Sbostic case ORPAREN: 137455847Sbostic break; 137555847Sbostic case OQUEST_: /* things that must be skipped */ 137655847Sbostic case OCH_: 137755847Sbostic scan--; 137855847Sbostic do { 137955847Sbostic scan += OPND(s); 138055847Sbostic s = *scan; 138155847Sbostic /* assert() interferes w debug printouts */ 138255847Sbostic if (OP(s) != O_QUEST && OP(s) != O_CH && 138355847Sbostic OP(s) != OOR2) { 138455847Sbostic g->iflags |= BAD; 138555847Sbostic return; 138655847Sbostic } 138755847Sbostic } while (OP(s) != O_QUEST && OP(s) != O_CH); 138855847Sbostic /* fallthrough */ 138955847Sbostic default: /* things that break a sequence */ 139055847Sbostic if (newlen > g->mlen) { /* ends one */ 139155847Sbostic start = newstart; 139255847Sbostic g->mlen = newlen; 139355847Sbostic } 139455847Sbostic newlen = 0; 139555847Sbostic break; 139655847Sbostic } 139755847Sbostic } while (OP(s) != OEND); 139855847Sbostic 139955847Sbostic if (g->mlen == 0) /* there isn't one */ 140055847Sbostic return; 140155847Sbostic 140255847Sbostic /* turn it into a character string */ 140355847Sbostic g->must = malloc((size_t)g->mlen + 1); 140455847Sbostic if (g->must == NULL) { /* argh; just forget it */ 140555847Sbostic g->mlen = 0; 140655847Sbostic return; 140755847Sbostic } 140855847Sbostic cp = g->must; 140955847Sbostic scan = start; 141055847Sbostic for (i = g->mlen; i > 0; i--) { 141155847Sbostic while (OP(s = *scan++) != OCHAR) 141255847Sbostic continue; 141355847Sbostic *cp++ = OPND(s); 141455847Sbostic } 141555847Sbostic *cp++ = '\0'; /* just on general principles */ 141655847Sbostic } 141755847Sbostic 141855847Sbostic /* 141955847Sbostic - pluscount - count + nesting 142055847Sbostic */ 142156355Sbostic static sopno /* nesting depth */ 142255847Sbostic pluscount(p, g) 142355847Sbostic struct parse *p; 142455847Sbostic register struct re_guts *g; 142555847Sbostic { 142655847Sbostic register sop *scan; 142755847Sbostic register sop s; 142855847Sbostic register sopno plusnest = 0; 142955847Sbostic register sopno maxnest = 0; 143055847Sbostic 143155847Sbostic if (p->error != 0) 143255847Sbostic return(0); /* there may not be an OEND */ 143355847Sbostic 143455847Sbostic scan = g->strip + 1; 143555847Sbostic do { 143655847Sbostic s = *scan++; 143755847Sbostic switch (OP(s)) { 143855847Sbostic case OPLUS_: 143955847Sbostic plusnest++; 144055847Sbostic break; 144155847Sbostic case O_PLUS: 144255847Sbostic if (plusnest > maxnest) 144355847Sbostic maxnest = plusnest; 144455847Sbostic plusnest--; 144555847Sbostic break; 144655847Sbostic } 144755847Sbostic } while (OP(s) != OEND); 144855847Sbostic if (plusnest != 0) 144955847Sbostic g->iflags |= BAD; 145055847Sbostic return(maxnest); 145155847Sbostic } 1452