155847Sbostic /*- 255847Sbostic * Copyright (c) 1992 Henry Spencer. 355847Sbostic * Copyright (c) 1992 The Regents of the University of California. 455847Sbostic * All rights reserved. 555847Sbostic * 655847Sbostic * This code is derived from software contributed to Berkeley by 755847Sbostic * Henry Spencer of the University of Toronto. 855847Sbostic * 955847Sbostic * %sccs.include.redist.c% 1055847Sbostic * 11*56618Sbostic * @(#)regcomp.c 5.5 (Berkeley) 10/23/92 1255847Sbostic */ 1355847Sbostic 1455847Sbostic #if defined(LIBC_SCCS) && !defined(lint) 15*56618Sbostic static char sccsid[] = "@(#)regcomp.c 5.5 (Berkeley) 10/23/92"; 1655847Sbostic #endif /* LIBC_SCCS and not lint */ 1755847Sbostic 1855847Sbostic #include <sys/types.h> 1955847Sbostic #include <stdio.h> 2055847Sbostic #include <string.h> 2155847Sbostic #include <ctype.h> 2255847Sbostic #include <limits.h> 2355847Sbostic #include <stdlib.h> 2455847Sbostic #include <regex.h> 2555847Sbostic 2655847Sbostic #include "utils.h" 2755847Sbostic #include "regex2.h" 2855847Sbostic 2955847Sbostic #include "cclass.h" 3055847Sbostic #include "cname.h" 3155847Sbostic 3255847Sbostic /* 3355847Sbostic * parse structure, passed up and down to avoid global variables and 3455847Sbostic * other clumsinesses 3555847Sbostic */ 3655847Sbostic struct parse { 3755847Sbostic uchar *next; /* next character in RE */ 3855847Sbostic int error; /* has an error been seen? */ 3955847Sbostic sop *strip; /* malloced strip */ 4055847Sbostic sopno ssize; /* malloced strip size (allocated) */ 4155847Sbostic sopno slen; /* malloced strip length (used) */ 4255847Sbostic int ncsalloc; /* number of csets allocated */ 4355847Sbostic struct re_guts *g; 4455847Sbostic # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 4555847Sbostic sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 4655847Sbostic sopno pend[NPAREN]; /* -> ) ([0] unused) */ 4755847Sbostic }; 4855847Sbostic 4956355Sbostic static uchar nuls[10]; /* place to point scanner in event of error */ 5056355Sbostic 5155847Sbostic /* 5255847Sbostic * macros for use with parse structure 5355847Sbostic * BEWARE: these know that the parse structure is named `p' !!! 5455847Sbostic */ 5555847Sbostic #define PEEK() ((uchar)*p->next) 5655847Sbostic #define PEEK2() ((uchar)*(p->next+1)) 5755847Sbostic #define SEE(c) (PEEK() == (c)) 5855847Sbostic #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 5955847Sbostic #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 6055847Sbostic #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 6155847Sbostic #define NEXT() (p->next++) 6255847Sbostic #define NEXT2() (p->next += 2) 6355847Sbostic #define NEXTn(n) (p->next += (n)) 6455847Sbostic #define GETNEXT() ((uchar)*p->next++) 6555847Sbostic #define SETERROR(e) seterr(p, (e)) 6655847Sbostic #define REQUIRE(co, e) ((co) || SETERROR(e)) 6755847Sbostic #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 6855847Sbostic #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 6955847Sbostic #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 7055847Sbostic #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 7155847Sbostic #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 7255847Sbostic #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 7355847Sbostic #define BACK(sop, pos) EMIT(sop, HERE()-pos) 7455847Sbostic #define HERE() (p->slen) 7555847Sbostic #define THERE() (p->slen - 1) 7655847Sbostic #define DROP(n) (p->slen -= (n)) 7755847Sbostic 7856357Sbostic static cset *allocset __P((struct parse *)); 7956357Sbostic static void bothcases __P((struct parse *, u_int)); 8056357Sbostic static void categorize __P((struct parse *, struct re_guts *)); 8156357Sbostic static void doemit __P((struct parse *, sop, size_t)); 8256357Sbostic static void dofwd __P((struct parse *, sopno, sop)); 8356357Sbostic static void doinsert __P((struct parse *, sop, size_t, sopno)); 8456357Sbostic static sopno dupl __P((struct parse *, sopno, sopno)); 8556357Sbostic static void enlarge __P((struct parse *, sopno)); 8656357Sbostic static void findmust __P((struct parse *, struct re_guts *)); 8756357Sbostic static int freezeset __P((struct parse *, cset *)); 8856357Sbostic static int isinsets __P((struct re_guts *, u_int)); 8956357Sbostic static void mcadd __P((struct parse *, cset *, uchar *)); 9056357Sbostic static uchar *mcfind __P((cset *, u_int *)); 9156357Sbostic static int mcin __P((struct parse *, cset *, u_int *)); 9256357Sbostic static void mcinvert __P((struct parse *, cset *)); 9356357Sbostic static void mcsub __P((struct parse *, cset *, u_int *)); 9456357Sbostic static void nonnewline __P((struct parse *)); 9556357Sbostic static void ordinary __P((struct parse *, u_int)); 9656357Sbostic static uchar othercase __P((u_int)); 9756357Sbostic static void p_b_cclass __P((struct parse *, cset *)); 9856357Sbostic static uchar p_b_coll_elem __P((struct parse *, u_int)); 9956357Sbostic static void p_b_eclass __P((struct parse *, cset *)); 10056357Sbostic static uchar p_b_symbol __P((struct parse *)); 10156357Sbostic static void p_b_term __P((struct parse *, cset *)); 10256357Sbostic static void p_bracket __P((struct parse *)); 10356357Sbostic static void p_bre __P((struct parse *, u_int, u_int)); 10456357Sbostic static int p_count __P((struct parse *)); 10556357Sbostic static void p_ere __P((struct parse *, u_int)); 10656357Sbostic static void p_ere_exp __P((struct parse *)); 10756357Sbostic static int p_simp_re __P((struct parse *, int)); 10856357Sbostic static sopno pluscount __P((struct parse *, struct re_guts *)); 10956357Sbostic static void repeat __P((struct parse *, sopno, int, int)); 11056357Sbostic static int samesets __P((struct re_guts *, u_int, u_int)); 11156357Sbostic static int seterr __P((struct parse *, int)); 11256357Sbostic static void stripsnug __P((struct parse *, struct re_guts *)); 11356357Sbostic 11455847Sbostic /* 11555847Sbostic - regcomp - interface for parser and compilation 11655847Sbostic */ 11755847Sbostic int /* 0 success, otherwise REG_something */ 11855847Sbostic regcomp(preg, pattern, cflags) 11955847Sbostic regex_t *preg; 12055847Sbostic const char *pattern; 12155847Sbostic int cflags; 12255847Sbostic { 12355847Sbostic struct parse pa; 12455847Sbostic register struct re_guts *g; 12555847Sbostic register struct parse *p = &pa; 12655847Sbostic register int i; 12755847Sbostic 12855847Sbostic /* do the mallocs early so failure handling is easy */ 12955847Sbostic /* the +NUC here is for the category table */ 13055847Sbostic g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 13155847Sbostic if (g == NULL) 13255847Sbostic return(REG_ESPACE); 13355847Sbostic p->ssize = strlen(pattern)/2*3 + 1; 13455847Sbostic p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 13555847Sbostic p->slen = 0; 13655847Sbostic if (p->strip == NULL) { 13755847Sbostic free((char *)g); 13855847Sbostic return(REG_ESPACE); 13955847Sbostic } 14055847Sbostic 14155847Sbostic /* set things up */ 14255847Sbostic p->g = g; 14355847Sbostic p->next = (uchar *)pattern; 14455847Sbostic p->error = 0; 14555847Sbostic p->ncsalloc = 0; 14655847Sbostic for (i = 0; i < NPAREN; i++) { 14755847Sbostic p->pbegin[i] = 0; 14855847Sbostic p->pend[i] = 0; 14955847Sbostic } 15055847Sbostic g->csetsize = NUC; 15155847Sbostic g->sets = NULL; 15255847Sbostic g->setbits = NULL; 15355847Sbostic g->ncsets = 0; 15455847Sbostic g->cflags = cflags; 15555847Sbostic g->iflags = 0; 15655847Sbostic g->must = NULL; 15755847Sbostic g->mlen = 0; 15855847Sbostic g->nsub = 0; 15955847Sbostic g->ncategories = 1; /* category 0 is "everything else" */ 16055847Sbostic g->categories = (uchar *)g + sizeof(struct re_guts); 16155847Sbostic (void) memset((char *)g->categories, 0, NUC); 16255847Sbostic g->backrefs = 0; 16355847Sbostic g->nplus = 0; 16455847Sbostic 16555847Sbostic /* do it */ 16655847Sbostic EMIT(OEND, 0); 16755847Sbostic g->firststate = THERE(); 16855847Sbostic if (cflags®_EXTENDED) 16955847Sbostic p_ere(p, '\0'); 17055847Sbostic else 17155847Sbostic p_bre(p, '\0', '\0'); 17255847Sbostic EMIT(OEND, 0); 17355847Sbostic g->laststate = THERE(); 17455847Sbostic 17555847Sbostic /* tidy up loose ends and fill things in */ 17655847Sbostic categorize(p, g); 17755847Sbostic stripsnug(p, g); 17855847Sbostic findmust(p, g); 17955847Sbostic g->nplus = pluscount(p, g); 18055847Sbostic g->magic = MAGIC2; 18155847Sbostic preg->re_nsub = g->nsub; 18255847Sbostic preg->re_g = g; 18355847Sbostic preg->re_magic = MAGIC1; 18456355Sbostic #ifndef REDEBUG 18555847Sbostic /* not debugging, so can't rely on the assert() in regexec() */ 18655847Sbostic if (g->iflags&BAD) 18755847Sbostic SETERROR(REG_ASSERT); 18855847Sbostic #endif 18955847Sbostic 19055847Sbostic /* win or lose, we're done */ 19155847Sbostic if (p->error != 0) /* lose */ 19255847Sbostic regfree(preg); 19355847Sbostic return(p->error); 19455847Sbostic } 19555847Sbostic 19655847Sbostic /* 19755847Sbostic - p_ere - ERE parser top level, concatenation and alternation 19855847Sbostic */ 19955847Sbostic static void 20055847Sbostic p_ere(p, stop) 20155847Sbostic register struct parse *p; 20256357Sbostic u_int stop; /* character this ERE should end at */ 20355847Sbostic { 20455847Sbostic register uchar c; 20555847Sbostic register sopno prevback; 20655847Sbostic register sopno prevfwd; 20755847Sbostic register sopno conc; 20855847Sbostic register int first = 1; /* is this the first alternative? */ 20955847Sbostic 21055847Sbostic for (;;) { 21155847Sbostic /* do a bunch of concatenated expressions */ 21255847Sbostic conc = HERE(); 21355847Sbostic while ((c = PEEK()) != '|' && c != stop && c != '\0') 21455847Sbostic p_ere_exp(p); 21555847Sbostic REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 21655847Sbostic 21755847Sbostic if (!EAT('|')) 21855847Sbostic break; /* NOTE BREAK OUT */ 21955847Sbostic 22055847Sbostic if (first) { 22155847Sbostic INSERT(OCH_, conc); /* offset is wrong */ 22255847Sbostic prevfwd = conc; 22355847Sbostic prevback = conc; 22455847Sbostic first = 0; 22555847Sbostic } 22655847Sbostic BACK(OOR1, prevback); 22755847Sbostic prevback = THERE(); 22855847Sbostic FWD(prevfwd); /* fix previous offset */ 22955847Sbostic prevfwd = HERE(); 23055847Sbostic EMIT(OOR2, 0); /* offset is very wrong */ 23155847Sbostic } 23255847Sbostic 23355847Sbostic if (!first) { /* tail-end fixups */ 23455847Sbostic FWD(prevfwd); 23555847Sbostic BACK(O_CH, prevback); 23655847Sbostic } 23755847Sbostic 23855847Sbostic assert(SEE(stop) || SEE('\0')); 23955847Sbostic } 24055847Sbostic 24155847Sbostic /* 24255847Sbostic - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 24355847Sbostic */ 24455847Sbostic static void 24555847Sbostic p_ere_exp(p) 24655847Sbostic register struct parse *p; 24755847Sbostic { 24855847Sbostic register uchar c; 24955847Sbostic register sopno pos; 25055847Sbostic register int count; 25155847Sbostic register int count2; 25255847Sbostic register sopno subno; 25355847Sbostic int wascaret = 0; 25455847Sbostic /* we call { a repetition if followed by a digit */ 25555847Sbostic # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 25655847Sbostic (c1 == '{' && isdigit(c2))) 25755847Sbostic 25855847Sbostic c = GETNEXT(); 25955847Sbostic assert(c != '\0'); /* caller should have ensured this */ 26055847Sbostic 26155847Sbostic pos = HERE(); 26255847Sbostic switch (c) { 26355847Sbostic case '(': 26455847Sbostic MUSTNOTSEE('\0', REG_EPAREN); 26555847Sbostic p->g->nsub++; 26655847Sbostic subno = p->g->nsub; 26755847Sbostic if (subno < NPAREN) 26855847Sbostic p->pbegin[subno] = HERE(); 26955847Sbostic EMIT(OLPAREN, subno); 27055847Sbostic if (!SEE(')')) 27155847Sbostic p_ere(p, ')'); 27255847Sbostic if (subno < NPAREN) { 27355847Sbostic p->pend[subno] = HERE(); 27455847Sbostic assert(p->pend[subno] != 0); 27555847Sbostic } 27655847Sbostic EMIT(ORPAREN, subno); 27755847Sbostic MUSTEAT(')', REG_EPAREN); 27855847Sbostic break; 27955847Sbostic #ifndef POSIX_MISTAKE 28055847Sbostic case ')': /* happens only if no current unmatched ( */ 28155847Sbostic /* 28255847Sbostic * You may ask, why the ifndef? Because I didn't notice 28355847Sbostic * this until slightly too late for 1003.2, and none of the 28455847Sbostic * other 1003.2 regular-expression reviewers noticed it at 28555847Sbostic * all. So an unmatched ) is legal POSIX, at least until 28655847Sbostic * we can get it fixed. 28755847Sbostic */ 28855847Sbostic SETERROR(REG_EPAREN); 28955847Sbostic break; 29055847Sbostic #endif 29155847Sbostic case '^': 29255847Sbostic EMIT(OBOL, 0); 29355847Sbostic p->g->iflags |= USEBOL; 29455847Sbostic wascaret = 1; 29555847Sbostic break; 29655847Sbostic case '$': 29755847Sbostic EMIT(OEOL, 0); 29855847Sbostic p->g->iflags |= USEEOL; 29955847Sbostic break; 30055847Sbostic case '|': 30155847Sbostic SETERROR(REG_EMPTY); 30255847Sbostic break; 30355847Sbostic case '*': 30455847Sbostic case '+': 30555847Sbostic case '?': 30655847Sbostic SETERROR(REG_BADRPT); 30755847Sbostic break; 30855847Sbostic case '.': 30955847Sbostic if (p->g->cflags®_NEWLINE) 31055847Sbostic nonnewline(p); 31155847Sbostic else 31255847Sbostic EMIT(OANY, 0); 31355847Sbostic break; 31455847Sbostic case '[': 31555847Sbostic p_bracket(p); 31655847Sbostic break; 31755847Sbostic case '\\': 31855847Sbostic c = GETNEXT(); 31956355Sbostic #ifdef xxx 32055847Sbostic if (c == '^' || c == '.' || c == '[' || c == '$' || 32155847Sbostic c == '(' || c == ')' || c == '|' || 32255847Sbostic c == '*' || c == '+' || c == '?' || 32355847Sbostic c == '{' || c == '\\') 32456355Sbostic #else 32556355Sbostic if (c != '\0') 32656355Sbostic #endif 32755847Sbostic ordinary(p, c); 32855847Sbostic else 32955847Sbostic SETERROR(REG_EESCAPE); 33055847Sbostic break; 33155847Sbostic case '{': /* okay as ordinary except if digit follows */ 33255847Sbostic REQUIRE(!isdigit(PEEK()), REG_BADRPT); 33355847Sbostic /* FALLTHROUGH */ 33455847Sbostic default: 33555847Sbostic ordinary(p, c); 33655847Sbostic break; 33755847Sbostic } 33855847Sbostic 33955847Sbostic c = PEEK(); 34055847Sbostic if (!ISRPT(c, PEEK2())) 34155847Sbostic return; /* no repetition, we're done */ 34255847Sbostic NEXT(); 34355847Sbostic 34455847Sbostic REQUIRE(!wascaret, REG_BADRPT); 34555847Sbostic switch (c) { 34655847Sbostic case '*': /* implemented as +? */ 34755847Sbostic INSERT(OPLUS_, pos); 34855847Sbostic BACK(O_PLUS, pos); 34955847Sbostic INSERT(OQUEST_, pos); 35055847Sbostic BACK(O_QUEST, pos); 35155847Sbostic break; 35255847Sbostic case '+': 35355847Sbostic INSERT(OPLUS_, pos); 35455847Sbostic BACK(O_PLUS, pos); 35555847Sbostic break; 35655847Sbostic case '?': 35755847Sbostic INSERT(OQUEST_, pos); 35855847Sbostic BACK(O_QUEST, pos); 35955847Sbostic break; 36055847Sbostic case '{': 36155847Sbostic count = p_count(p); 36255847Sbostic if (EAT(',')) { 36355847Sbostic if (isdigit(PEEK())) { 36455847Sbostic count2 = p_count(p); 36555847Sbostic REQUIRE(count <= count2, REG_BADBR); 36655847Sbostic } else /* single number with comma */ 36755847Sbostic count2 = INFINITY; 36855847Sbostic } else /* just a single number */ 36955847Sbostic count2 = count; 37055847Sbostic repeat(p, pos, count, count2); 37155847Sbostic if (!EAT('}')) { /* error heuristics */ 37255847Sbostic while ((c = PEEK()) != '\0' && c != '}') 37355847Sbostic NEXT(); 37455847Sbostic if (c == '\0') 37555847Sbostic SETERROR(REG_EBRACE); 37655847Sbostic else 37755847Sbostic SETERROR(REG_BADBR); 37855847Sbostic } 37955847Sbostic break; 38055847Sbostic } 38155847Sbostic 38255847Sbostic c = PEEK(); 38355847Sbostic REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 38455847Sbostic } 38555847Sbostic 38655847Sbostic /* 38755847Sbostic - p_bre - BRE parser top level, anchoring and concatenation 38856355Sbostic * Giving end1 as '\0' essentially eliminates the end1/end2 check. 38955847Sbostic * 39056355Sbostic * This implementation is a bit of a kludge, in that a trailing $ is first 39156355Sbostic * taken as an ordinary character and then revised to be an anchor. The 39256355Sbostic * only undesirable side effect is that '$' gets included as a character 39356355Sbostic * category in such cases. This is fairly harmless; not worth fixing. 39456355Sbostic * The amount of lookahead needed to avoid this kludge is excessive, 39556355Sbostic * especially since things like "$*" appear to be legal. xxx 39655847Sbostic */ 39755847Sbostic static void 39855847Sbostic p_bre(p, end1, end2) 39955847Sbostic register struct parse *p; 40056357Sbostic register u_int end1; /* first terminating character */ 40156357Sbostic register u_int end2; /* second terminating character */ 40255847Sbostic { 40355847Sbostic register sopno start = HERE(); 40455847Sbostic register int first = 1; /* first subexpression? */ 40556355Sbostic register int wasdollar = 0; 40655847Sbostic 40755847Sbostic if (EAT('^')) { 40855847Sbostic EMIT(OBOL, 0); 40955847Sbostic p->g->iflags |= USEBOL; 41055847Sbostic } 41155847Sbostic while (!SEE('\0') && !SEETWO(end1, end2)) { 41255847Sbostic wasdollar = p_simp_re(p, first); 41355847Sbostic first = 0; 41455847Sbostic } 41555847Sbostic if (wasdollar) { /* oops, that was a trailing anchor */ 41655847Sbostic DROP(1); 41755847Sbostic EMIT(OEOL, 0); 41855847Sbostic p->g->iflags |= USEEOL; 41955847Sbostic } 42055847Sbostic 42155847Sbostic REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 42255847Sbostic } 42355847Sbostic 42455847Sbostic /* 42555847Sbostic - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 42655847Sbostic */ 42755847Sbostic static int /* was the simple RE an unbackslashed $? */ 42855847Sbostic p_simp_re(p, starordinary) 42955847Sbostic register struct parse *p; 43055847Sbostic int starordinary; /* is a leading * an ordinary character? */ 43155847Sbostic { 43255847Sbostic register int c; 43355847Sbostic register int count; 43455847Sbostic register int count2; 43555847Sbostic register sopno pos; 43655847Sbostic register int i; 43755847Sbostic register sopno subno; 43855847Sbostic # define BACKSL (1<<CHAR_BIT) 43955847Sbostic 44055847Sbostic pos = HERE(); /* repetion op, if any, covers from here */ 44155847Sbostic 44255847Sbostic c = GETNEXT(); 44355847Sbostic assert(c != '\0'); /* caller should have ensured this */ 44455847Sbostic if (c == '\\') 44555847Sbostic c = BACKSL | PEEK(); 44655847Sbostic switch (c) { 44755847Sbostic case '.': 44855847Sbostic if (p->g->cflags®_NEWLINE) 44955847Sbostic nonnewline(p); 45055847Sbostic else 45155847Sbostic EMIT(OANY, 0); 45255847Sbostic break; 45355847Sbostic case '[': 45455847Sbostic p_bracket(p); 45555847Sbostic break; 45655847Sbostic case BACKSL|'{': 45755847Sbostic SETERROR(REG_BADRPT); 45855847Sbostic break; 45955847Sbostic case BACKSL|'(': 46055847Sbostic NEXT(); 46155847Sbostic p->g->nsub++; 46255847Sbostic subno = p->g->nsub; 46355847Sbostic if (subno < NPAREN) 46455847Sbostic p->pbegin[subno] = HERE(); 46555847Sbostic EMIT(OLPAREN, subno); 46655847Sbostic /* the SEE here is an error heuristic */ 46755847Sbostic if (!SEE('\0') && !SEETWO('\\', ')')) 46855847Sbostic p_bre(p, '\\', ')'); 46955847Sbostic if (subno < NPAREN) { 47055847Sbostic p->pend[subno] = HERE(); 47155847Sbostic assert(p->pend[subno] != 0); 47255847Sbostic } 47355847Sbostic EMIT(ORPAREN, subno); 47455847Sbostic REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 47555847Sbostic break; 47655847Sbostic case BACKSL|')': /* should not get here -- must be user */ 47755847Sbostic case BACKSL|'}': 47855847Sbostic SETERROR(REG_EPAREN); 47955847Sbostic break; 48055847Sbostic case BACKSL|'1': 48155847Sbostic case BACKSL|'2': 48255847Sbostic case BACKSL|'3': 48355847Sbostic case BACKSL|'4': 48455847Sbostic case BACKSL|'5': 48555847Sbostic case BACKSL|'6': 48655847Sbostic case BACKSL|'7': 48755847Sbostic case BACKSL|'8': 48855847Sbostic case BACKSL|'9': 48955847Sbostic i = (c&~BACKSL) - '0'; 49055847Sbostic assert(i < NPAREN); 49155847Sbostic if (p->pend[i] != 0) { 49255847Sbostic assert(i <= p->g->nsub); 49355847Sbostic EMIT(OBACK_, i); 49455847Sbostic assert(p->pbegin[i] != 0); 49555847Sbostic assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 49655847Sbostic assert(OP(p->strip[p->pend[i]]) == ORPAREN); 49755847Sbostic (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 49855847Sbostic EMIT(O_BACK, i); 49955847Sbostic } else 50055847Sbostic SETERROR(REG_ESUBREG); 50155847Sbostic p->g->backrefs = 1; 50255847Sbostic NEXT(); 50355847Sbostic break; 50456355Sbostic case BACKSL|'\0': 50556355Sbostic SETERROR(REG_EESCAPE); 50656355Sbostic break; 50755847Sbostic case '*': 50855847Sbostic REQUIRE(starordinary, REG_BADRPT); 50955847Sbostic /* FALLTHROUGH */ 51055847Sbostic default: 51156355Sbostic if (c & BACKSL) 51256372Sbostic NEXT(); 51356372Sbostic ordinary(p, (uchar)(c &~ BACKSL)); 51455847Sbostic break; 51555847Sbostic } 51655847Sbostic 51755847Sbostic if (EAT('*')) { /* implemented as +? */ 51855847Sbostic INSERT(OPLUS_, pos); 51955847Sbostic BACK(O_PLUS, pos); 52055847Sbostic INSERT(OQUEST_, pos); 52155847Sbostic BACK(O_QUEST, pos); 52255847Sbostic } else if (EATTWO('\\', '{')) { 52355847Sbostic count = p_count(p); 52455847Sbostic if (EAT(',')) { 52555847Sbostic if (isdigit(PEEK())) { 52655847Sbostic count2 = p_count(p); 52755847Sbostic REQUIRE(count <= count2, REG_BADBR); 52855847Sbostic } else /* single number with comma */ 52955847Sbostic count2 = INFINITY; 53055847Sbostic } else /* just a single number */ 53155847Sbostic count2 = count; 53255847Sbostic repeat(p, pos, count, count2); 53355847Sbostic if (!EATTWO('\\', '}')) { /* error heuristics */ 53455847Sbostic while (!SEE('\0') && !SEETWO('\\', '}')) 53555847Sbostic NEXT(); 53655847Sbostic if (SEE('\0')) 53755847Sbostic SETERROR(REG_EBRACE); 53855847Sbostic else 53955847Sbostic SETERROR(REG_BADBR); 54055847Sbostic } 54155847Sbostic } else if (c == '$') /* unbackslashed $ not followed by reptn */ 54255847Sbostic return(1); 54355847Sbostic 54455847Sbostic return(0); 54555847Sbostic } 54655847Sbostic 54755847Sbostic /* 54855847Sbostic - p_count - parse a repetition count 54955847Sbostic */ 55055847Sbostic static int /* the value */ 55155847Sbostic p_count(p) 55255847Sbostic register struct parse *p; 55355847Sbostic { 55455847Sbostic register int count = 0; 55555847Sbostic register int ndigits = 0; 55655847Sbostic 55755847Sbostic while (isdigit(PEEK()) && count <= DUPMAX) { 55855847Sbostic count = count*10 + (GETNEXT() - '0'); 55955847Sbostic ndigits++; 56055847Sbostic } 56155847Sbostic 56255847Sbostic REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 56355847Sbostic return(count); 56455847Sbostic } 56555847Sbostic 56655847Sbostic /* 56755847Sbostic - p_bracket - parse a bracketed character list 56855847Sbostic * 56955847Sbostic * Note a significant property of this code: if the allocset() did SETERROR, 57055847Sbostic * no set operations are done. 57155847Sbostic */ 57255847Sbostic static void 57355847Sbostic p_bracket(p) 57455847Sbostic register struct parse *p; 57555847Sbostic { 57655847Sbostic register uchar c; 57755847Sbostic register cset *cs = allocset(p); 57855847Sbostic register int invert = 0; 57955847Sbostic 58055847Sbostic if (EAT('^')) 58155847Sbostic invert++; /* make note to invert set at end */ 58255847Sbostic if (EAT(']')) 58355847Sbostic CHadd(cs, ']'); 584*56618Sbostic else if (EAT('-')) 585*56618Sbostic CHadd(cs, '-'); 58655847Sbostic while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 58755847Sbostic p_b_term(p, cs); 58855847Sbostic if (EAT('-')) 58955847Sbostic CHadd(cs, '-'); 59055847Sbostic MUSTEAT(']', REG_EBRACK); 59155847Sbostic 59255847Sbostic if (invert && p->error == 0) { 59355847Sbostic register int i; 59455847Sbostic 59555847Sbostic for (i = p->g->csetsize - 1; i >= 0; i--) 59655847Sbostic if (CHIN(cs, i)) 59755847Sbostic CHsub(cs, i); 59855847Sbostic else 59955847Sbostic CHadd(cs, i); 60055847Sbostic if (p->g->cflags®_NEWLINE) 60155847Sbostic CHsub(cs, '\n'); 60255847Sbostic if (cs->multis != NULL) 60355847Sbostic mcinvert(p, cs); 60455847Sbostic } 60555847Sbostic assert(cs->multis == NULL); /* xxx */ 60655847Sbostic EMIT(OANYOF, freezeset(p, cs)); 60755847Sbostic } 60855847Sbostic 60955847Sbostic /* 61055847Sbostic - p_b_term - parse one term of a bracketed character list 61155847Sbostic */ 61255847Sbostic static void 61355847Sbostic p_b_term(p, cs) 61455847Sbostic register struct parse *p; 61555847Sbostic register cset *cs; 61655847Sbostic { 61755847Sbostic register uchar c; 61855847Sbostic register uchar start, finish; 61955847Sbostic register int i; 62055847Sbostic 62155847Sbostic /* classify what we've got */ 62255847Sbostic switch (PEEK()) { 62355847Sbostic case '[': 62455847Sbostic c = PEEK2(); 62555847Sbostic break; 62655847Sbostic case '-': 62755847Sbostic SETERROR(REG_ERANGE); 62855847Sbostic return; /* NOTE RETURN */ 62955847Sbostic break; 63055847Sbostic default: 63155847Sbostic c = '\0'; 63255847Sbostic break; 63355847Sbostic } 63455847Sbostic 63555847Sbostic switch (c) { 63655847Sbostic case ':': /* character class */ 63755847Sbostic NEXT2(); 63855847Sbostic c = PEEK(); 63955847Sbostic REQUIRE(c != '\0', REG_EBRACK); 64055847Sbostic REQUIRE(c != '-' && c != ']', REG_ECTYPE); 64155847Sbostic p_b_cclass(p, cs); 64255847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 64355847Sbostic REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 64455847Sbostic break; 64555847Sbostic case '=': /* equivalence class */ 64655847Sbostic NEXT2(); 64755847Sbostic c = PEEK(); 64855847Sbostic REQUIRE(c != '\0', REG_EBRACK); 64955847Sbostic REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 65055847Sbostic p_b_eclass(p, cs); 65155847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 65255847Sbostic REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 65355847Sbostic break; 65455847Sbostic default: /* symbol, ordinary character, or range */ 65555847Sbostic /* xxx revision needed for multichar stuff */ 65655847Sbostic start = p_b_symbol(p); 65755847Sbostic if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 65855847Sbostic /* range */ 65955847Sbostic NEXT(); 66055847Sbostic if (EAT('-')) 66155847Sbostic finish = '-'; 66255847Sbostic else 66355847Sbostic finish = p_b_symbol(p); 66455847Sbostic } else 66555847Sbostic finish = start; 66655847Sbostic REQUIRE(start <= finish, REG_ERANGE); 66755847Sbostic for (i = start; i <= finish; i++) { 66855847Sbostic CHadd(cs, i); 66955847Sbostic if ((p->g->cflags®_ICASE) && isalpha(i)) { 67055847Sbostic c = othercase((uchar)i); 67155847Sbostic CHadd(cs, c); 67255847Sbostic } 67355847Sbostic } 67455847Sbostic break; 67555847Sbostic } 67655847Sbostic } 67755847Sbostic 67855847Sbostic /* 67955847Sbostic - p_b_cclass - parse a character-class name and deal with it 68055847Sbostic */ 68155847Sbostic static void 68255847Sbostic p_b_cclass(p, cs) 68355847Sbostic register struct parse *p; 68455847Sbostic register cset *cs; 68555847Sbostic { 68655847Sbostic register uchar *sb = p->next; 68755847Sbostic register uchar *se = sb; 68855847Sbostic register struct cclass *cp; 68955847Sbostic register int len; 69055847Sbostic register uchar *u; 69155847Sbostic register uchar c; 69255847Sbostic 69355847Sbostic while (isalpha(*se)) 69455847Sbostic se++; 69555847Sbostic len = se - sb; 69655847Sbostic NEXTn(len); 69755847Sbostic for (cp = cclasses; cp->name != NULL; cp++) 69855847Sbostic if (strncmp(cp->name, (char *)sb, len) == 0 && 69956355Sbostic cp->name[len] == '\0') 70055847Sbostic break; 70155847Sbostic if (cp->name == NULL) { 70255847Sbostic /* oops, didn't find it */ 70355847Sbostic SETERROR(REG_ECTYPE); 70455847Sbostic return; 70555847Sbostic } 70655847Sbostic 70755847Sbostic u = (uchar *)cp->chars; 70855847Sbostic while ((c = *u++) != '\0') 70955847Sbostic CHadd(cs, c); 71055847Sbostic for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 71155847Sbostic MCadd(cs, u); 71255847Sbostic } 71355847Sbostic 71455847Sbostic /* 71555847Sbostic - p_b_eclass - parse an equivalence-class name and deal with it 71655847Sbostic * 71755847Sbostic * This implementation is incomplete. xxx 71855847Sbostic */ 71955847Sbostic static void 72055847Sbostic p_b_eclass(p, cs) 72155847Sbostic register struct parse *p; 72255847Sbostic register cset *cs; 72355847Sbostic { 72455847Sbostic register uchar c; 72555847Sbostic 72655847Sbostic c = p_b_coll_elem(p, '='); 72755847Sbostic CHadd(cs, c); 72855847Sbostic } 72955847Sbostic 73055847Sbostic /* 73155847Sbostic - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 73255847Sbostic */ 73355847Sbostic static uchar /* value of symbol */ 73455847Sbostic p_b_symbol(p) 73555847Sbostic register struct parse *p; 73655847Sbostic { 73755847Sbostic register uchar value; 73855847Sbostic 73955847Sbostic if (!EATTWO('[', '.')) { 74055847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 74155847Sbostic return(GETNEXT()); 74255847Sbostic } 74355847Sbostic 74455847Sbostic /* collating symbol */ 74555847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 74655847Sbostic value = p_b_coll_elem(p, '.'); 74755847Sbostic REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 74855847Sbostic return(value); 74955847Sbostic } 75055847Sbostic 75155847Sbostic /* 75255847Sbostic - p_b_coll_elem - parse a collating-element name and look it up 75355847Sbostic */ 75455847Sbostic static uchar /* value of collating element */ 75555847Sbostic p_b_coll_elem(p, endc) 75655847Sbostic register struct parse *p; 75756357Sbostic u_int endc; /* name ended by endc,']' */ 75855847Sbostic { 75955847Sbostic register uchar *sp = p->next; 76055847Sbostic register struct cname *cp; 76155847Sbostic register int len; 76255847Sbostic register uchar c; 76355847Sbostic 76455847Sbostic while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 76555847Sbostic NEXT(); 76655847Sbostic if (c == '\0') { 76755847Sbostic SETERROR(REG_EBRACK); 76855847Sbostic return(0); 76955847Sbostic } 77055847Sbostic len = p->next - sp; 77155847Sbostic for (cp = cnames; cp->name != NULL; cp++) 77255847Sbostic if (strncmp(cp->name, (char *)sp, len) == 0 && 77356355Sbostic cp->name[len] == '\0') 77455847Sbostic return(cp->code); /* known name */ 77555847Sbostic if (len == 1) 77655847Sbostic return(*sp); /* single character */ 77755847Sbostic SETERROR(REG_ECOLLATE); /* neither */ 77855847Sbostic return(0); 77955847Sbostic } 78055847Sbostic 78155847Sbostic /* 78255847Sbostic - othercase - return the case counterpart of an alphabetic 78355847Sbostic */ 78455847Sbostic static uchar 78555847Sbostic othercase(ch) 78656357Sbostic u_int ch; 78755847Sbostic { 78855847Sbostic assert(isalpha(ch)); 78955847Sbostic if (isupper(ch)) 79055847Sbostic return(tolower(ch)); 79155847Sbostic else if (islower(ch)) 79255847Sbostic return(toupper(ch)); 79355847Sbostic else /* peculiar, but could happen */ 79455847Sbostic return(ch); 79555847Sbostic } 79655847Sbostic 79755847Sbostic /* 79855847Sbostic - bothcases - emit a dualcase version of a character 79956355Sbostic * 80055847Sbostic * Boy, is this implementation ever a kludge... 80155847Sbostic */ 80255847Sbostic static void 80355847Sbostic bothcases(p, ch) 80455847Sbostic register struct parse *p; 80556357Sbostic u_int ch; 80655847Sbostic { 80755847Sbostic register uchar *oldnext; 80855847Sbostic uchar bracket[3]; 80955847Sbostic 81055847Sbostic oldnext = p->next; 81155847Sbostic p->next = bracket; 81255847Sbostic bracket[0] = ch; 81355847Sbostic bracket[1] = ']'; 81455847Sbostic bracket[2] = '\0'; 81555847Sbostic p_bracket(p); 81655847Sbostic assert(p->next == bracket+2); 81755847Sbostic p->next = oldnext; 81855847Sbostic } 81955847Sbostic 82055847Sbostic /* 82155847Sbostic - ordinary - emit an ordinary character 82255847Sbostic */ 82355847Sbostic static void 82455847Sbostic ordinary(p, ch) 82555847Sbostic register struct parse *p; 82656357Sbostic register u_int ch; 82755847Sbostic { 82855847Sbostic register uchar *cap = p->g->categories; 82955847Sbostic 83055847Sbostic if ((p->g->cflags®_ICASE) && isalpha(ch)) { 83155847Sbostic bothcases(p, ch); 83255847Sbostic return; 83355847Sbostic } 83455847Sbostic 83555847Sbostic EMIT(OCHAR, ch); 83655847Sbostic if (cap[ch] == 0) 83755847Sbostic cap[ch] = p->g->ncategories++; 83855847Sbostic } 83955847Sbostic 84055847Sbostic /* 84155847Sbostic - nonnewline - emit REG_NEWLINE version of OANY 84256355Sbostic * 84355847Sbostic * Boy, is this implementation ever a kludge... 84455847Sbostic */ 84555847Sbostic static void 84655847Sbostic nonnewline(p) 84755847Sbostic register struct parse *p; 84855847Sbostic { 84955847Sbostic register uchar *oldnext; 85055847Sbostic uchar bracket[4]; 85155847Sbostic 85255847Sbostic oldnext = p->next; 85355847Sbostic p->next = bracket; 85455847Sbostic bracket[0] = '^'; 85555847Sbostic bracket[1] = '\n'; 85655847Sbostic bracket[2] = ']'; 85755847Sbostic bracket[3] = '\0'; 85855847Sbostic p_bracket(p); 85955847Sbostic assert(p->next == bracket+3); 86055847Sbostic p->next = oldnext; 86155847Sbostic } 86255847Sbostic 86355847Sbostic /* 86455847Sbostic - repeat - generate code for a bounded repetition, recursively if needed 86555847Sbostic */ 86655847Sbostic static void 86755847Sbostic repeat(p, start, from, to) 86855847Sbostic register struct parse *p; 86955847Sbostic sopno start; /* operand from here to end of strip */ 87055847Sbostic int from; /* repeated from this number */ 87155847Sbostic int to; /* to this number of times (maybe INFINITY) */ 87255847Sbostic { 87355847Sbostic register sopno finish = HERE(); 87455847Sbostic # define N 2 87555847Sbostic # define INF 3 87655847Sbostic # define REP(f, t) ((f)*8 + (t)) 87755847Sbostic # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 87855847Sbostic register sopno copy; 87955847Sbostic 88055847Sbostic if (p->error != 0) /* head off possible runaway recursion */ 88155847Sbostic return; 88255847Sbostic 88355847Sbostic assert(from <= to); 88455847Sbostic 88555847Sbostic switch (REP(MAP(from), MAP(to))) { 88655847Sbostic case REP(0, 0): /* must be user doing this */ 88755847Sbostic DROP(finish-start); /* drop the operand */ 88855847Sbostic break; 88955847Sbostic case REP(0, 1): /* as x{1,1}? */ 89055847Sbostic case REP(0, N): /* as x{1,n}? */ 89155847Sbostic case REP(0, INF): /* as x{1,}? */ 89255847Sbostic INSERT(OQUEST_, start); /* offset is wrong... */ 89355847Sbostic repeat(p, start+1, 1, to); 89455847Sbostic FWD(start); /* ... fix it */ 89555847Sbostic BACK(O_QUEST, start); 89655847Sbostic break; 89755847Sbostic case REP(1, 1): /* trivial case */ 89855847Sbostic /* done */ 89955847Sbostic break; 90055847Sbostic case REP(1, N): /* as x?x{1,n-1} */ 90155847Sbostic INSERT(OQUEST_, start); 90255847Sbostic BACK(O_QUEST, start); 90355847Sbostic copy = dupl(p, start+1, finish+1); 90455847Sbostic assert(copy == finish+2); 90555847Sbostic repeat(p, copy, 1, to-1); 90655847Sbostic break; 90755847Sbostic case REP(1, INF): /* as x+ */ 90855847Sbostic INSERT(OPLUS_, start); 90955847Sbostic BACK(O_PLUS, start); 91055847Sbostic break; 91155847Sbostic case REP(N, N): /* as xx{m-1,n-1} */ 91255847Sbostic copy = dupl(p, start, finish); 91355847Sbostic repeat(p, copy, from-1, to-1); 91455847Sbostic break; 91555847Sbostic case REP(N, INF): /* as xx{n-1,INF} */ 91655847Sbostic copy = dupl(p, start, finish); 91755847Sbostic repeat(p, copy, from-1, to); 91855847Sbostic break; 91955847Sbostic default: /* "can't happen" */ 92055847Sbostic SETERROR(REG_ASSERT); /* just in case */ 92155847Sbostic break; 92255847Sbostic } 92355847Sbostic } 92455847Sbostic 92555847Sbostic /* 92655847Sbostic - seterr - set an error condition 92755847Sbostic */ 92855847Sbostic static int /* useless but makes type checking happy */ 92955847Sbostic seterr(p, e) 93055847Sbostic register struct parse *p; 93155847Sbostic int e; 93255847Sbostic { 93355847Sbostic if (p->error == 0) /* keep earliest error condition */ 93455847Sbostic p->error = e; 93555847Sbostic p->next = nuls; /* try to bring things to a halt */ 93655847Sbostic return(0); /* make the return value well-defined */ 93755847Sbostic } 93855847Sbostic 93955847Sbostic /* 94055847Sbostic - allocset - allocate a set of characters for [] 94155847Sbostic */ 94255847Sbostic static cset * 94355847Sbostic allocset(p) 94455847Sbostic register struct parse *p; 94555847Sbostic { 94655847Sbostic register int no = p->g->ncsets++; 94755847Sbostic register size_t nc; 94855847Sbostic register size_t nbytes; 94955847Sbostic register cset *cs; 95055847Sbostic register size_t css = (size_t)p->g->csetsize; 95155847Sbostic 95255847Sbostic if (no >= p->ncsalloc) { /* need another column of space */ 95355847Sbostic p->ncsalloc += CHAR_BIT; 95455847Sbostic nc = p->ncsalloc; 95555847Sbostic assert(nc % CHAR_BIT == 0); 95655847Sbostic nbytes = nc / CHAR_BIT * css; 95755847Sbostic if (p->g->sets == NULL) 95855847Sbostic p->g->sets = (cset *)malloc(nc * sizeof(cset)); 95955847Sbostic else 96055847Sbostic p->g->sets = (cset *)realloc((char *)p->g->sets, 96155847Sbostic nc * sizeof(cset)); 96255847Sbostic if (p->g->setbits == NULL) 96355847Sbostic p->g->setbits = (uchar *)malloc(nbytes); 96455847Sbostic else 96555847Sbostic p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 96655847Sbostic nbytes); 96755847Sbostic if (p->g->sets != NULL && p->g->setbits != NULL) 96855847Sbostic (void) memset((char *)p->g->setbits + (nbytes - css), 96955847Sbostic 0, css); 97055847Sbostic else { 97155847Sbostic no = 0; 97255847Sbostic SETERROR(REG_ESPACE); 97355847Sbostic /* caller's responsibility not to do set ops */ 97455847Sbostic } 97555847Sbostic } 97655847Sbostic 97755847Sbostic assert(p->g->sets != NULL); /* xxx */ 97855847Sbostic cs = &p->g->sets[no]; 97955847Sbostic cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 98055847Sbostic cs->mask = 1 << ((no) % CHAR_BIT); 98155847Sbostic cs->hash = 0; 98255847Sbostic cs->smultis = 0; 98355847Sbostic cs->multis = NULL; 98455847Sbostic 98555847Sbostic return(cs); 98655847Sbostic } 98755847Sbostic 98855847Sbostic /* 98955847Sbostic - freezeset - final processing on a set of characters 99055847Sbostic * 99155847Sbostic * The main task here is merging identical sets. This is usually a waste 99255847Sbostic * of time (although the hash code minimizes the overhead), but can win 99355847Sbostic * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 99455847Sbostic * is done using addition rather than xor -- all ASCII [aA] sets xor to 99555847Sbostic * the same value! 99655847Sbostic */ 99755847Sbostic static int /* set number */ 99855847Sbostic freezeset(p, cs) 99955847Sbostic register struct parse *p; 100055847Sbostic register cset *cs; 100155847Sbostic { 100255847Sbostic register uchar h = cs->hash; 100355847Sbostic register int i; 100455847Sbostic register cset *top = &p->g->sets[p->g->ncsets]; 100555847Sbostic register cset *cs2; 100655847Sbostic register size_t css = (size_t)p->g->csetsize; 100755847Sbostic 100855847Sbostic /* look for an earlier one which is the same */ 100955847Sbostic for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 101055847Sbostic if (cs2->hash == h && cs2 != cs) { 101155847Sbostic /* maybe */ 101255847Sbostic for (i = 0; i < css; i++) 101355847Sbostic if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 101455847Sbostic break; /* no */ 101555847Sbostic if (i == css) 101655847Sbostic break; /* yes */ 101755847Sbostic } 101855847Sbostic 101955847Sbostic if (cs2 < top) { /* found one */ 102055847Sbostic assert(cs == top-1); 102155847Sbostic p->g->ncsets--; 102255847Sbostic for (i = 0; i < css; i++) 102355847Sbostic CHsub(cs, i); 102455847Sbostic cs = cs2; 102555847Sbostic } 102655847Sbostic 102755847Sbostic return((int)(cs - p->g->sets)); 102855847Sbostic } 102955847Sbostic 103055847Sbostic /* 103155847Sbostic - mcadd - add a collating element to a cset 103255847Sbostic */ 103355847Sbostic static void 103455847Sbostic mcadd(p, cs, cp) 103555847Sbostic register struct parse *p; 103655847Sbostic register cset *cs; 103755847Sbostic register uchar *cp; 103855847Sbostic { 103955847Sbostic register size_t oldend = cs->smultis; 104055847Sbostic 104155847Sbostic cs->smultis += strlen((char *)cp) + 1; 104255847Sbostic if (cs->multis == NULL) 104355847Sbostic cs->multis = (uchar *)malloc(cs->smultis); 104455847Sbostic else 104555847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 104655847Sbostic if (cs->multis == NULL) { 104755847Sbostic SETERROR(REG_ESPACE); 104855847Sbostic return; 104955847Sbostic } 105055847Sbostic 105155847Sbostic (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 105255847Sbostic cs->multis[cs->smultis - 1] = '\0'; 105355847Sbostic } 105455847Sbostic 105555847Sbostic /* 105655847Sbostic - mcsub - subtract a collating element from a cset 105755847Sbostic */ 105855847Sbostic static void 105955847Sbostic mcsub(p, cs, cp) 106055847Sbostic register struct parse *p; 106155847Sbostic register cset *cs; 106256357Sbostic register u_int *cp; 106355847Sbostic { 106455847Sbostic register uchar *fp = mcfind(cs, cp); 106555847Sbostic register size_t len = strlen((char *)fp); 106655847Sbostic 106755847Sbostic assert(p != NULL); 106855847Sbostic (void) memmove((char *)fp, (char *)(fp + len + 1), 106955847Sbostic cs->smultis - (fp + len + 1 - cs->multis)); 107055847Sbostic cs->smultis -= len; 107155847Sbostic 107255847Sbostic if (cs->smultis == 0) { 107355847Sbostic free((char *)cs->multis); 107455847Sbostic cs->multis = NULL; 107555847Sbostic return; 107655847Sbostic } 107755847Sbostic 107855847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 107955847Sbostic assert(cs->multis != NULL); 108055847Sbostic } 108155847Sbostic 108255847Sbostic /* 108355847Sbostic - mcin - is a collating element in a cset? 108455847Sbostic */ 108555847Sbostic static int 108655847Sbostic mcin(p, cs, cp) 108755847Sbostic register struct parse *p; 108855847Sbostic register cset *cs; 108956357Sbostic register u_int *cp; 109055847Sbostic { 109155847Sbostic return(mcfind(cs, cp) != NULL); 109255847Sbostic } 109355847Sbostic 109455847Sbostic /* 109555847Sbostic - mcfind - find a collating element in a cset 109655847Sbostic */ 109755847Sbostic static uchar * 109855847Sbostic mcfind(cs, cp) 109955847Sbostic register cset *cs; 110056357Sbostic register u_int *cp; 110155847Sbostic { 110255847Sbostic register uchar *p; 110355847Sbostic 110455847Sbostic if (cs->multis == NULL) 110555847Sbostic return(NULL); 110655847Sbostic for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 110755847Sbostic if (strcmp((char *)cp, (char *)p) == 0) 110855847Sbostic return(p); 110955847Sbostic return(NULL); 111055847Sbostic } 111155847Sbostic 111255847Sbostic /* 111355847Sbostic - mcinvert - invert the list of collating elements in a cset 111455847Sbostic * 111555847Sbostic * This would have to know the set of possibilities. Implementation 111655847Sbostic * is deferred. 111755847Sbostic */ 111855847Sbostic static void 111955847Sbostic mcinvert(p, cs) 112055847Sbostic register struct parse *p; 112155847Sbostic register cset *cs; 112255847Sbostic { 112355847Sbostic assert(cs->multis == NULL); /* xxx */ 112455847Sbostic } 112555847Sbostic 112655847Sbostic /* 112755847Sbostic - isinsets - is this character in any sets? 112855847Sbostic */ 112955847Sbostic static int /* predicate */ 113055847Sbostic isinsets(g, c) 113155847Sbostic register struct re_guts *g; 113256357Sbostic u_int c; 113355847Sbostic { 113455847Sbostic register uchar *col; 113555847Sbostic register int i; 113655847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 113755847Sbostic 113855847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 113955847Sbostic if (col[c] != 0) 114055847Sbostic return(1); 114155847Sbostic return(0); 114255847Sbostic } 114355847Sbostic 114455847Sbostic /* 114555847Sbostic - samesets - are these two characters in exactly the same sets? 114655847Sbostic */ 114755847Sbostic static int /* predicate */ 114855847Sbostic samesets(g, c1, c2) 114955847Sbostic register struct re_guts *g; 115056357Sbostic register u_int c1; 115156357Sbostic register u_int c2; 115255847Sbostic { 115355847Sbostic register uchar *col; 115455847Sbostic register int i; 115555847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 115655847Sbostic 115755847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 115855847Sbostic if (col[c1] != col[c2]) 115955847Sbostic return(0); 116055847Sbostic return(1); 116155847Sbostic } 116255847Sbostic 116355847Sbostic /* 116455847Sbostic - categorize - sort out character categories 116555847Sbostic */ 116655847Sbostic static void 116755847Sbostic categorize(p, g) 116855847Sbostic struct parse *p; 116955847Sbostic register struct re_guts *g; 117055847Sbostic { 117155847Sbostic register uchar *cats = g->categories; 117255847Sbostic register unsigned c; 117355847Sbostic register unsigned c2; 117455847Sbostic register uchar cat; 117555847Sbostic 117655847Sbostic /* avoid making error situations worse */ 117755847Sbostic if (p->error != 0) 117855847Sbostic return; 117955847Sbostic 118055847Sbostic for (c = 0; c < g->csetsize; c++) 118155847Sbostic if (cats[c] == 0 && isinsets(g, c)) { 118255847Sbostic cat = g->ncategories++; 118355847Sbostic cats[c] = cat; 118455847Sbostic for (c2 = c+1; c2 < g->csetsize; c2++) 118555847Sbostic if (cats[c2] == 0 && samesets(g, c, c2)) 118655847Sbostic cats[c2] = cat; 118755847Sbostic } 118855847Sbostic } 118955847Sbostic 119055847Sbostic /* 119155847Sbostic - dupl - emit a duplicate of a bunch of sops 119255847Sbostic */ 119355847Sbostic static sopno /* start of duplicate */ 119455847Sbostic dupl(p, start, finish) 119555847Sbostic register struct parse *p; 119655847Sbostic sopno start; /* from here */ 119755847Sbostic sopno finish; /* to this less one */ 119855847Sbostic { 119955847Sbostic register sopno ret = HERE(); 120055847Sbostic register sopno len = finish - start; 120155847Sbostic 120255847Sbostic assert(finish >= start); 120355847Sbostic if (len == 0) 120455847Sbostic return(ret); 120555847Sbostic enlarge(p, p->ssize + len); /* this many unexpected additions */ 120655847Sbostic assert(p->ssize >= p->slen + len); 120755847Sbostic (void) memcpy((char *)(p->strip + p->slen), 120855847Sbostic (char *)(p->strip + start), (size_t)len*sizeof(sop)); 120955847Sbostic p->slen += len; 121055847Sbostic return(ret); 121155847Sbostic } 121255847Sbostic 121355847Sbostic /* 121455847Sbostic - doemit - emit a strip operator 121555847Sbostic * 121655847Sbostic * It might seem better to implement this as a macro with a function as 121755847Sbostic * hard-case backup, but it's just too big and messy unless there are 121855847Sbostic * some changes to the data structures. Maybe later. 121955847Sbostic */ 122055847Sbostic static void 122155847Sbostic doemit(p, op, opnd) 122255847Sbostic register struct parse *p; 122355847Sbostic sop op; 122455847Sbostic size_t opnd; 122555847Sbostic { 122655847Sbostic /* avoid making error situations worse */ 122755847Sbostic if (p->error != 0) 122855847Sbostic return; 122955847Sbostic 123055847Sbostic /* deal with oversize operands ("can't happen", more or less) */ 123155847Sbostic assert(opnd < 1<<OPSHIFT); 123255847Sbostic 123355847Sbostic /* deal with undersized strip */ 123455847Sbostic if (p->slen >= p->ssize) 123555847Sbostic enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 123655847Sbostic assert(p->slen < p->ssize); 123755847Sbostic 123855847Sbostic /* finally, it's all reduced to the easy case */ 123955847Sbostic p->strip[p->slen++] = SOP(op, opnd); 124055847Sbostic } 124155847Sbostic 124255847Sbostic /* 124355847Sbostic - doinsert - insert a sop into the strip 124455847Sbostic */ 124555847Sbostic static void 124655847Sbostic doinsert(p, op, opnd, pos) 124755847Sbostic register struct parse *p; 124855847Sbostic sop op; 124955847Sbostic size_t opnd; 125055847Sbostic sopno pos; 125155847Sbostic { 125255847Sbostic register sopno sn; 125355847Sbostic register sop s; 125455847Sbostic register int i; 125555847Sbostic 125655847Sbostic /* avoid making error situations worse */ 125755847Sbostic if (p->error != 0) 125855847Sbostic return; 125955847Sbostic 126055847Sbostic sn = HERE(); 126155847Sbostic EMIT(op, opnd); /* do checks, ensure space */ 126255847Sbostic assert(HERE() == sn+1); 126355847Sbostic s = p->strip[sn]; 126455847Sbostic 126555847Sbostic /* adjust paren pointers */ 126655847Sbostic assert(pos > 0); 126755847Sbostic for (i = 1; i < NPAREN; i++) { 126855847Sbostic if (p->pbegin[i] >= pos) { 126955847Sbostic p->pbegin[i]++; 127055847Sbostic } 127155847Sbostic if (p->pend[i] >= pos) { 127255847Sbostic p->pend[i]++; 127355847Sbostic } 127455847Sbostic } 127555847Sbostic 127655847Sbostic memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 127755847Sbostic (HERE()-pos-1)*sizeof(sop)); 127855847Sbostic p->strip[pos] = s; 127955847Sbostic } 128055847Sbostic 128155847Sbostic /* 128255847Sbostic - dofwd - complete a forward reference 128355847Sbostic */ 128455847Sbostic static void 128555847Sbostic dofwd(p, pos, value) 128655847Sbostic register struct parse *p; 128755847Sbostic register sopno pos; 128855847Sbostic sop value; 128955847Sbostic { 129055847Sbostic /* avoid making error situations worse */ 129155847Sbostic if (p->error != 0) 129255847Sbostic return; 129355847Sbostic 129455847Sbostic assert(value < 1<<OPSHIFT); 129555847Sbostic p->strip[pos] = OP(p->strip[pos]) | value; 129655847Sbostic } 129755847Sbostic 129855847Sbostic /* 129955847Sbostic - enlarge - enlarge the strip 130055847Sbostic */ 130155847Sbostic static void 130255847Sbostic enlarge(p, size) 130355847Sbostic register struct parse *p; 130455847Sbostic register sopno size; 130555847Sbostic { 130655847Sbostic register sop *sp; 130755847Sbostic 130855847Sbostic if (p->ssize >= size) 130955847Sbostic return; 131055847Sbostic 131155847Sbostic sp = (sop *)realloc(p->strip, size*sizeof(sop)); 131255847Sbostic if (sp == NULL) { 131355847Sbostic SETERROR(REG_ESPACE); 131455847Sbostic return; 131555847Sbostic } 131655847Sbostic p->strip = sp; 131755847Sbostic p->ssize = size; 131855847Sbostic } 131955847Sbostic 132055847Sbostic /* 132155847Sbostic - stripsnug - compact the strip 132255847Sbostic */ 132355847Sbostic static void 132455847Sbostic stripsnug(p, g) 132555847Sbostic register struct parse *p; 132655847Sbostic register struct re_guts *g; 132755847Sbostic { 132855847Sbostic g->nstates = p->slen; 132955847Sbostic g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 133055847Sbostic if (g->strip == NULL) { 133155847Sbostic SETERROR(REG_ESPACE); 133255847Sbostic g->strip = p->strip; 133355847Sbostic } 133455847Sbostic } 133555847Sbostic 133655847Sbostic /* 133755847Sbostic - findmust - fill in must and mlen with longest mandatory literal string 133855847Sbostic * 133955847Sbostic * This algorithm could do fancy things like analyzing the operands of | 134055847Sbostic * for common subsequences. Someday. This code is simple and finds most 134155847Sbostic * of the interesting cases. 134255847Sbostic * 134355847Sbostic * Note that must and mlen got initialized during setup. 134455847Sbostic */ 134556355Sbostic static void 134655847Sbostic findmust(p, g) 134755847Sbostic struct parse *p; 134855847Sbostic register struct re_guts *g; 134955847Sbostic { 135055847Sbostic register sop *scan; 135155847Sbostic sop *start; 135255847Sbostic register sop *newstart; 135355847Sbostic register sopno newlen; 135455847Sbostic register sop s; 135555847Sbostic register char *cp; 135655847Sbostic register sopno i; 135755847Sbostic 135855847Sbostic /* avoid making error situations worse */ 135955847Sbostic if (p->error != 0) 136055847Sbostic return; 136155847Sbostic 136255847Sbostic /* find the longest OCHAR sequence in strip */ 136355847Sbostic newlen = 0; 136455847Sbostic scan = g->strip + 1; 136555847Sbostic do { 136655847Sbostic s = *scan++; 136755847Sbostic switch (OP(s)) { 136855847Sbostic case OCHAR: /* sequence member */ 136955847Sbostic if (newlen == 0) /* new sequence */ 137055847Sbostic newstart = scan - 1; 137155847Sbostic newlen++; 137255847Sbostic break; 137355847Sbostic case OPLUS_: /* things that don't break one */ 137455847Sbostic case OLPAREN: 137555847Sbostic case ORPAREN: 137655847Sbostic break; 137755847Sbostic case OQUEST_: /* things that must be skipped */ 137855847Sbostic case OCH_: 137955847Sbostic scan--; 138055847Sbostic do { 138155847Sbostic scan += OPND(s); 138255847Sbostic s = *scan; 138355847Sbostic /* assert() interferes w debug printouts */ 138455847Sbostic if (OP(s) != O_QUEST && OP(s) != O_CH && 138555847Sbostic OP(s) != OOR2) { 138655847Sbostic g->iflags |= BAD; 138755847Sbostic return; 138855847Sbostic } 138955847Sbostic } while (OP(s) != O_QUEST && OP(s) != O_CH); 139055847Sbostic /* fallthrough */ 139155847Sbostic default: /* things that break a sequence */ 139255847Sbostic if (newlen > g->mlen) { /* ends one */ 139355847Sbostic start = newstart; 139455847Sbostic g->mlen = newlen; 139555847Sbostic } 139655847Sbostic newlen = 0; 139755847Sbostic break; 139855847Sbostic } 139955847Sbostic } while (OP(s) != OEND); 140055847Sbostic 140155847Sbostic if (g->mlen == 0) /* there isn't one */ 140255847Sbostic return; 140355847Sbostic 140455847Sbostic /* turn it into a character string */ 140555847Sbostic g->must = malloc((size_t)g->mlen + 1); 140655847Sbostic if (g->must == NULL) { /* argh; just forget it */ 140755847Sbostic g->mlen = 0; 140855847Sbostic return; 140955847Sbostic } 141055847Sbostic cp = g->must; 141155847Sbostic scan = start; 141255847Sbostic for (i = g->mlen; i > 0; i--) { 141355847Sbostic while (OP(s = *scan++) != OCHAR) 141455847Sbostic continue; 141555847Sbostic *cp++ = OPND(s); 141655847Sbostic } 141755847Sbostic *cp++ = '\0'; /* just on general principles */ 141855847Sbostic } 141955847Sbostic 142055847Sbostic /* 142155847Sbostic - pluscount - count + nesting 142255847Sbostic */ 142356355Sbostic static sopno /* nesting depth */ 142455847Sbostic pluscount(p, g) 142555847Sbostic struct parse *p; 142655847Sbostic register struct re_guts *g; 142755847Sbostic { 142855847Sbostic register sop *scan; 142955847Sbostic register sop s; 143055847Sbostic register sopno plusnest = 0; 143155847Sbostic register sopno maxnest = 0; 143255847Sbostic 143355847Sbostic if (p->error != 0) 143455847Sbostic return(0); /* there may not be an OEND */ 143555847Sbostic 143655847Sbostic scan = g->strip + 1; 143755847Sbostic do { 143855847Sbostic s = *scan++; 143955847Sbostic switch (OP(s)) { 144055847Sbostic case OPLUS_: 144155847Sbostic plusnest++; 144255847Sbostic break; 144355847Sbostic case O_PLUS: 144455847Sbostic if (plusnest > maxnest) 144555847Sbostic maxnest = plusnest; 144655847Sbostic plusnest--; 144755847Sbostic break; 144855847Sbostic } 144955847Sbostic } while (OP(s) != OEND); 145055847Sbostic if (plusnest != 0) 145155847Sbostic g->iflags |= BAD; 145255847Sbostic return(maxnest); 145355847Sbostic } 1454