155847Sbostic /*- 255847Sbostic * Copyright (c) 1992 Henry Spencer. 355847Sbostic * Copyright (c) 1992 The Regents of the University of California. 455847Sbostic * All rights reserved. 555847Sbostic * 655847Sbostic * This code is derived from software contributed to Berkeley by 755847Sbostic * Henry Spencer of the University of Toronto. 855847Sbostic * 955847Sbostic * %sccs.include.redist.c% 1055847Sbostic * 11*56355Sbostic * @(#)regcomp.c 5.2 (Berkeley) 09/30/92 1255847Sbostic */ 1355847Sbostic 1455847Sbostic #if defined(LIBC_SCCS) && !defined(lint) 15*56355Sbostic static char sccsid[] = "@(#)regcomp.c 5.2 (Berkeley) 09/30/92"; 1655847Sbostic #endif /* LIBC_SCCS and not lint */ 1755847Sbostic 1855847Sbostic #include <sys/types.h> 1955847Sbostic #include <stdio.h> 2055847Sbostic #include <string.h> 2155847Sbostic #include <ctype.h> 2255847Sbostic #include <limits.h> 2355847Sbostic #include <stdlib.h> 2455847Sbostic #include <regex.h> 2555847Sbostic 2655847Sbostic #include "utils.h" 2755847Sbostic #include "regex2.h" 2855847Sbostic 2955847Sbostic #include "cclass.h" 3055847Sbostic #include "cname.h" 3155847Sbostic 3255847Sbostic /* 3355847Sbostic * parse structure, passed up and down to avoid global variables and 3455847Sbostic * other clumsinesses 3555847Sbostic */ 3655847Sbostic struct parse { 3755847Sbostic uchar *next; /* next character in RE */ 3855847Sbostic int error; /* has an error been seen? */ 3955847Sbostic sop *strip; /* malloced strip */ 4055847Sbostic sopno ssize; /* malloced strip size (allocated) */ 4155847Sbostic sopno slen; /* malloced strip length (used) */ 4255847Sbostic int ncsalloc; /* number of csets allocated */ 4355847Sbostic struct re_guts *g; 4455847Sbostic # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 4555847Sbostic sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 4655847Sbostic sopno pend[NPAREN]; /* -> ) ([0] unused) */ 4755847Sbostic }; 4855847Sbostic 49*56355Sbostic #include "regcomp.ih" 5055847Sbostic 51*56355Sbostic static uchar nuls[10]; /* place to point scanner in event of error */ 52*56355Sbostic 5355847Sbostic /* 5455847Sbostic * macros for use with parse structure 5555847Sbostic * BEWARE: these know that the parse structure is named `p' !!! 5655847Sbostic */ 5755847Sbostic #define PEEK() ((uchar)*p->next) 5855847Sbostic #define PEEK2() ((uchar)*(p->next+1)) 5955847Sbostic #define SEE(c) (PEEK() == (c)) 6055847Sbostic #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 6155847Sbostic #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 6255847Sbostic #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 6355847Sbostic #define NEXT() (p->next++) 6455847Sbostic #define NEXT2() (p->next += 2) 6555847Sbostic #define NEXTn(n) (p->next += (n)) 6655847Sbostic #define GETNEXT() ((uchar)*p->next++) 6755847Sbostic #define SETERROR(e) seterr(p, (e)) 6855847Sbostic #define REQUIRE(co, e) ((co) || SETERROR(e)) 6955847Sbostic #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 7055847Sbostic #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 7155847Sbostic #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 7255847Sbostic #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 7355847Sbostic #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 7455847Sbostic #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 7555847Sbostic #define BACK(sop, pos) EMIT(sop, HERE()-pos) 7655847Sbostic #define HERE() (p->slen) 7755847Sbostic #define THERE() (p->slen - 1) 7855847Sbostic #define DROP(n) (p->slen -= (n)) 7955847Sbostic 8055847Sbostic /* 8155847Sbostic - regcomp - interface for parser and compilation 82*56355Sbostic = extern int regcomp(regex_t *preg, const char *pattern, int cflags); 83*56355Sbostic = #define REG_EXTENDED 001 84*56355Sbostic = #define REG_ICASE 002 85*56355Sbostic = #define REG_NOSUB 004 86*56355Sbostic = #define REG_NEWLINE 010 8755847Sbostic */ 8855847Sbostic int /* 0 success, otherwise REG_something */ 8955847Sbostic regcomp(preg, pattern, cflags) 9055847Sbostic regex_t *preg; 9155847Sbostic const char *pattern; 9255847Sbostic int cflags; 9355847Sbostic { 9455847Sbostic struct parse pa; 9555847Sbostic register struct re_guts *g; 9655847Sbostic register struct parse *p = &pa; 9755847Sbostic register int i; 9855847Sbostic 9955847Sbostic /* do the mallocs early so failure handling is easy */ 10055847Sbostic /* the +NUC here is for the category table */ 10155847Sbostic g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 10255847Sbostic if (g == NULL) 10355847Sbostic return(REG_ESPACE); 10455847Sbostic p->ssize = strlen(pattern)/2*3 + 1; 10555847Sbostic p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 10655847Sbostic p->slen = 0; 10755847Sbostic if (p->strip == NULL) { 10855847Sbostic free((char *)g); 10955847Sbostic return(REG_ESPACE); 11055847Sbostic } 11155847Sbostic 11255847Sbostic /* set things up */ 11355847Sbostic p->g = g; 11455847Sbostic p->next = (uchar *)pattern; 11555847Sbostic p->error = 0; 11655847Sbostic p->ncsalloc = 0; 11755847Sbostic for (i = 0; i < NPAREN; i++) { 11855847Sbostic p->pbegin[i] = 0; 11955847Sbostic p->pend[i] = 0; 12055847Sbostic } 12155847Sbostic g->csetsize = NUC; 12255847Sbostic g->sets = NULL; 12355847Sbostic g->setbits = NULL; 12455847Sbostic g->ncsets = 0; 12555847Sbostic g->cflags = cflags; 12655847Sbostic g->iflags = 0; 12755847Sbostic g->must = NULL; 12855847Sbostic g->mlen = 0; 12955847Sbostic g->nsub = 0; 13055847Sbostic g->ncategories = 1; /* category 0 is "everything else" */ 13155847Sbostic g->categories = (uchar *)g + sizeof(struct re_guts); 13255847Sbostic (void) memset((char *)g->categories, 0, NUC); 13355847Sbostic g->backrefs = 0; 13455847Sbostic g->nplus = 0; 13555847Sbostic 13655847Sbostic /* do it */ 13755847Sbostic EMIT(OEND, 0); 13855847Sbostic g->firststate = THERE(); 13955847Sbostic if (cflags®_EXTENDED) 14055847Sbostic p_ere(p, '\0'); 14155847Sbostic else 14255847Sbostic p_bre(p, '\0', '\0'); 14355847Sbostic EMIT(OEND, 0); 14455847Sbostic g->laststate = THERE(); 14555847Sbostic 14655847Sbostic /* tidy up loose ends and fill things in */ 14755847Sbostic categorize(p, g); 14855847Sbostic stripsnug(p, g); 14955847Sbostic findmust(p, g); 15055847Sbostic g->nplus = pluscount(p, g); 15155847Sbostic g->magic = MAGIC2; 15255847Sbostic preg->re_nsub = g->nsub; 15355847Sbostic preg->re_g = g; 15455847Sbostic preg->re_magic = MAGIC1; 155*56355Sbostic #ifndef REDEBUG 15655847Sbostic /* not debugging, so can't rely on the assert() in regexec() */ 15755847Sbostic if (g->iflags&BAD) 15855847Sbostic SETERROR(REG_ASSERT); 15955847Sbostic #endif 16055847Sbostic 16155847Sbostic /* win or lose, we're done */ 16255847Sbostic if (p->error != 0) /* lose */ 16355847Sbostic regfree(preg); 16455847Sbostic return(p->error); 16555847Sbostic } 16655847Sbostic 16755847Sbostic /* 16855847Sbostic - p_ere - ERE parser top level, concatenation and alternation 169*56355Sbostic == static void p_ere(register struct parse *p, uchar stop); 17055847Sbostic */ 17155847Sbostic static void 17255847Sbostic p_ere(p, stop) 17355847Sbostic register struct parse *p; 17455847Sbostic uchar stop; /* character this ERE should end at */ 17555847Sbostic { 17655847Sbostic register uchar c; 17755847Sbostic register sopno prevback; 17855847Sbostic register sopno prevfwd; 17955847Sbostic register sopno conc; 18055847Sbostic register int first = 1; /* is this the first alternative? */ 18155847Sbostic 18255847Sbostic for (;;) { 18355847Sbostic /* do a bunch of concatenated expressions */ 18455847Sbostic conc = HERE(); 18555847Sbostic while ((c = PEEK()) != '|' && c != stop && c != '\0') 18655847Sbostic p_ere_exp(p); 18755847Sbostic REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 18855847Sbostic 18955847Sbostic if (!EAT('|')) 19055847Sbostic break; /* NOTE BREAK OUT */ 19155847Sbostic 19255847Sbostic if (first) { 19355847Sbostic INSERT(OCH_, conc); /* offset is wrong */ 19455847Sbostic prevfwd = conc; 19555847Sbostic prevback = conc; 19655847Sbostic first = 0; 19755847Sbostic } 19855847Sbostic BACK(OOR1, prevback); 19955847Sbostic prevback = THERE(); 20055847Sbostic FWD(prevfwd); /* fix previous offset */ 20155847Sbostic prevfwd = HERE(); 20255847Sbostic EMIT(OOR2, 0); /* offset is very wrong */ 20355847Sbostic } 20455847Sbostic 20555847Sbostic if (!first) { /* tail-end fixups */ 20655847Sbostic FWD(prevfwd); 20755847Sbostic BACK(O_CH, prevback); 20855847Sbostic } 20955847Sbostic 21055847Sbostic assert(SEE(stop) || SEE('\0')); 21155847Sbostic } 21255847Sbostic 21355847Sbostic /* 21455847Sbostic - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 215*56355Sbostic == static void p_ere_exp(register struct parse *p); 21655847Sbostic */ 21755847Sbostic static void 21855847Sbostic p_ere_exp(p) 21955847Sbostic register struct parse *p; 22055847Sbostic { 22155847Sbostic register uchar c; 22255847Sbostic register sopno pos; 22355847Sbostic register int count; 22455847Sbostic register int count2; 22555847Sbostic register sopno subno; 22655847Sbostic int wascaret = 0; 22755847Sbostic /* we call { a repetition if followed by a digit */ 22855847Sbostic # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 22955847Sbostic (c1 == '{' && isdigit(c2))) 23055847Sbostic 23155847Sbostic c = GETNEXT(); 23255847Sbostic assert(c != '\0'); /* caller should have ensured this */ 23355847Sbostic 23455847Sbostic pos = HERE(); 23555847Sbostic switch (c) { 23655847Sbostic case '(': 23755847Sbostic MUSTNOTSEE('\0', REG_EPAREN); 23855847Sbostic p->g->nsub++; 23955847Sbostic subno = p->g->nsub; 24055847Sbostic if (subno < NPAREN) 24155847Sbostic p->pbegin[subno] = HERE(); 24255847Sbostic EMIT(OLPAREN, subno); 24355847Sbostic if (!SEE(')')) 24455847Sbostic p_ere(p, ')'); 24555847Sbostic if (subno < NPAREN) { 24655847Sbostic p->pend[subno] = HERE(); 24755847Sbostic assert(p->pend[subno] != 0); 24855847Sbostic } 24955847Sbostic EMIT(ORPAREN, subno); 25055847Sbostic MUSTEAT(')', REG_EPAREN); 25155847Sbostic break; 25255847Sbostic #ifndef POSIX_MISTAKE 25355847Sbostic case ')': /* happens only if no current unmatched ( */ 25455847Sbostic /* 25555847Sbostic * You may ask, why the ifndef? Because I didn't notice 25655847Sbostic * this until slightly too late for 1003.2, and none of the 25755847Sbostic * other 1003.2 regular-expression reviewers noticed it at 25855847Sbostic * all. So an unmatched ) is legal POSIX, at least until 25955847Sbostic * we can get it fixed. 26055847Sbostic */ 26155847Sbostic SETERROR(REG_EPAREN); 26255847Sbostic break; 26355847Sbostic #endif 26455847Sbostic case '^': 26555847Sbostic EMIT(OBOL, 0); 26655847Sbostic p->g->iflags |= USEBOL; 26755847Sbostic wascaret = 1; 26855847Sbostic break; 26955847Sbostic case '$': 27055847Sbostic EMIT(OEOL, 0); 27155847Sbostic p->g->iflags |= USEEOL; 27255847Sbostic break; 27355847Sbostic case '|': 27455847Sbostic SETERROR(REG_EMPTY); 27555847Sbostic break; 27655847Sbostic case '*': 27755847Sbostic case '+': 27855847Sbostic case '?': 27955847Sbostic SETERROR(REG_BADRPT); 28055847Sbostic break; 28155847Sbostic case '.': 28255847Sbostic if (p->g->cflags®_NEWLINE) 28355847Sbostic nonnewline(p); 28455847Sbostic else 28555847Sbostic EMIT(OANY, 0); 28655847Sbostic break; 28755847Sbostic case '[': 28855847Sbostic p_bracket(p); 28955847Sbostic break; 29055847Sbostic case '\\': 29155847Sbostic c = GETNEXT(); 292*56355Sbostic #ifdef xxx 29355847Sbostic if (c == '^' || c == '.' || c == '[' || c == '$' || 29455847Sbostic c == '(' || c == ')' || c == '|' || 29555847Sbostic c == '*' || c == '+' || c == '?' || 29655847Sbostic c == '{' || c == '\\') 297*56355Sbostic #else 298*56355Sbostic if (c != '\0') 299*56355Sbostic #endif 30055847Sbostic ordinary(p, c); 30155847Sbostic else 30255847Sbostic SETERROR(REG_EESCAPE); 30355847Sbostic break; 30455847Sbostic case '{': /* okay as ordinary except if digit follows */ 30555847Sbostic REQUIRE(!isdigit(PEEK()), REG_BADRPT); 30655847Sbostic /* FALLTHROUGH */ 30755847Sbostic default: 30855847Sbostic ordinary(p, c); 30955847Sbostic break; 31055847Sbostic } 31155847Sbostic 31255847Sbostic c = PEEK(); 31355847Sbostic if (!ISRPT(c, PEEK2())) 31455847Sbostic return; /* no repetition, we're done */ 31555847Sbostic NEXT(); 31655847Sbostic 31755847Sbostic REQUIRE(!wascaret, REG_BADRPT); 31855847Sbostic switch (c) { 31955847Sbostic case '*': /* implemented as +? */ 32055847Sbostic INSERT(OPLUS_, pos); 32155847Sbostic BACK(O_PLUS, pos); 32255847Sbostic INSERT(OQUEST_, pos); 32355847Sbostic BACK(O_QUEST, pos); 32455847Sbostic break; 32555847Sbostic case '+': 32655847Sbostic INSERT(OPLUS_, pos); 32755847Sbostic BACK(O_PLUS, pos); 32855847Sbostic break; 32955847Sbostic case '?': 33055847Sbostic INSERT(OQUEST_, pos); 33155847Sbostic BACK(O_QUEST, pos); 33255847Sbostic break; 33355847Sbostic case '{': 33455847Sbostic count = p_count(p); 33555847Sbostic if (EAT(',')) { 33655847Sbostic if (isdigit(PEEK())) { 33755847Sbostic count2 = p_count(p); 33855847Sbostic REQUIRE(count <= count2, REG_BADBR); 33955847Sbostic } else /* single number with comma */ 34055847Sbostic count2 = INFINITY; 34155847Sbostic } else /* just a single number */ 34255847Sbostic count2 = count; 34355847Sbostic repeat(p, pos, count, count2); 34455847Sbostic if (!EAT('}')) { /* error heuristics */ 34555847Sbostic while ((c = PEEK()) != '\0' && c != '}') 34655847Sbostic NEXT(); 34755847Sbostic if (c == '\0') 34855847Sbostic SETERROR(REG_EBRACE); 34955847Sbostic else 35055847Sbostic SETERROR(REG_BADBR); 35155847Sbostic } 35255847Sbostic break; 35355847Sbostic } 35455847Sbostic 35555847Sbostic c = PEEK(); 35655847Sbostic REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 35755847Sbostic } 35855847Sbostic 35955847Sbostic /* 36055847Sbostic - p_bre - BRE parser top level, anchoring and concatenation 361*56355Sbostic == static void p_bre(register struct parse *p, register uchar end1, \ 362*56355Sbostic == register uchar end2); 363*56355Sbostic * Giving end1 as '\0' essentially eliminates the end1/end2 check. 36455847Sbostic * 365*56355Sbostic * This implementation is a bit of a kludge, in that a trailing $ is first 366*56355Sbostic * taken as an ordinary character and then revised to be an anchor. The 367*56355Sbostic * only undesirable side effect is that '$' gets included as a character 368*56355Sbostic * category in such cases. This is fairly harmless; not worth fixing. 369*56355Sbostic * The amount of lookahead needed to avoid this kludge is excessive, 370*56355Sbostic * especially since things like "$*" appear to be legal. xxx 37155847Sbostic */ 37255847Sbostic static void 37355847Sbostic p_bre(p, end1, end2) 37455847Sbostic register struct parse *p; 37555847Sbostic register uchar end1; /* first terminating character */ 37655847Sbostic register uchar end2; /* second terminating character */ 37755847Sbostic { 37855847Sbostic register sopno start = HERE(); 37955847Sbostic register int first = 1; /* first subexpression? */ 380*56355Sbostic register int wasdollar = 0; 38155847Sbostic 38255847Sbostic if (EAT('^')) { 38355847Sbostic EMIT(OBOL, 0); 38455847Sbostic p->g->iflags |= USEBOL; 38555847Sbostic } 38655847Sbostic while (!SEE('\0') && !SEETWO(end1, end2)) { 38755847Sbostic wasdollar = p_simp_re(p, first); 38855847Sbostic first = 0; 38955847Sbostic } 39055847Sbostic if (wasdollar) { /* oops, that was a trailing anchor */ 39155847Sbostic DROP(1); 39255847Sbostic EMIT(OEOL, 0); 39355847Sbostic p->g->iflags |= USEEOL; 39455847Sbostic } 39555847Sbostic 39655847Sbostic REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 39755847Sbostic } 39855847Sbostic 39955847Sbostic /* 40055847Sbostic - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 401*56355Sbostic == static int p_simp_re(register struct parse *p, int starordinary); 40255847Sbostic */ 40355847Sbostic static int /* was the simple RE an unbackslashed $? */ 40455847Sbostic p_simp_re(p, starordinary) 40555847Sbostic register struct parse *p; 40655847Sbostic int starordinary; /* is a leading * an ordinary character? */ 40755847Sbostic { 40855847Sbostic register int c; 40955847Sbostic register int count; 41055847Sbostic register int count2; 41155847Sbostic register sopno pos; 41255847Sbostic register int i; 41355847Sbostic register sopno subno; 41455847Sbostic # define BACKSL (1<<CHAR_BIT) 41555847Sbostic 41655847Sbostic pos = HERE(); /* repetion op, if any, covers from here */ 41755847Sbostic 41855847Sbostic c = GETNEXT(); 41955847Sbostic assert(c != '\0'); /* caller should have ensured this */ 42055847Sbostic if (c == '\\') 42155847Sbostic c = BACKSL | PEEK(); 42255847Sbostic switch (c) { 42355847Sbostic case '.': 42455847Sbostic if (p->g->cflags®_NEWLINE) 42555847Sbostic nonnewline(p); 42655847Sbostic else 42755847Sbostic EMIT(OANY, 0); 42855847Sbostic break; 42955847Sbostic case '[': 43055847Sbostic p_bracket(p); 43155847Sbostic break; 43255847Sbostic case BACKSL|'{': 43355847Sbostic SETERROR(REG_BADRPT); 43455847Sbostic break; 43555847Sbostic case BACKSL|'(': 43655847Sbostic NEXT(); 43755847Sbostic p->g->nsub++; 43855847Sbostic subno = p->g->nsub; 43955847Sbostic if (subno < NPAREN) 44055847Sbostic p->pbegin[subno] = HERE(); 44155847Sbostic EMIT(OLPAREN, subno); 44255847Sbostic /* the SEE here is an error heuristic */ 44355847Sbostic if (!SEE('\0') && !SEETWO('\\', ')')) 44455847Sbostic p_bre(p, '\\', ')'); 44555847Sbostic if (subno < NPAREN) { 44655847Sbostic p->pend[subno] = HERE(); 44755847Sbostic assert(p->pend[subno] != 0); 44855847Sbostic } 44955847Sbostic EMIT(ORPAREN, subno); 45055847Sbostic REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 45155847Sbostic break; 45255847Sbostic case BACKSL|')': /* should not get here -- must be user */ 45355847Sbostic case BACKSL|'}': 45455847Sbostic SETERROR(REG_EPAREN); 45555847Sbostic break; 45655847Sbostic case BACKSL|'1': 45755847Sbostic case BACKSL|'2': 45855847Sbostic case BACKSL|'3': 45955847Sbostic case BACKSL|'4': 46055847Sbostic case BACKSL|'5': 46155847Sbostic case BACKSL|'6': 46255847Sbostic case BACKSL|'7': 46355847Sbostic case BACKSL|'8': 46455847Sbostic case BACKSL|'9': 46555847Sbostic i = (c&~BACKSL) - '0'; 46655847Sbostic assert(i < NPAREN); 46755847Sbostic if (p->pend[i] != 0) { 46855847Sbostic assert(i <= p->g->nsub); 46955847Sbostic EMIT(OBACK_, i); 47055847Sbostic assert(p->pbegin[i] != 0); 47155847Sbostic assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 47255847Sbostic assert(OP(p->strip[p->pend[i]]) == ORPAREN); 47355847Sbostic (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 47455847Sbostic EMIT(O_BACK, i); 47555847Sbostic } else 47655847Sbostic SETERROR(REG_ESUBREG); 47755847Sbostic p->g->backrefs = 1; 47855847Sbostic NEXT(); 47955847Sbostic break; 480*56355Sbostic case BACKSL|'\0': 481*56355Sbostic SETERROR(REG_EESCAPE); 482*56355Sbostic break; 48355847Sbostic case '*': 48455847Sbostic REQUIRE(starordinary, REG_BADRPT); 48555847Sbostic /* FALLTHROUGH */ 48655847Sbostic default: 487*56355Sbostic if (c & BACKSL) 488*56355Sbostic c = GETNEXT(); 48955847Sbostic ordinary(p, (uchar)c); 49055847Sbostic break; 49155847Sbostic } 49255847Sbostic 49355847Sbostic if (EAT('*')) { /* implemented as +? */ 49455847Sbostic INSERT(OPLUS_, pos); 49555847Sbostic BACK(O_PLUS, pos); 49655847Sbostic INSERT(OQUEST_, pos); 49755847Sbostic BACK(O_QUEST, pos); 49855847Sbostic } else if (EATTWO('\\', '{')) { 49955847Sbostic count = p_count(p); 50055847Sbostic if (EAT(',')) { 50155847Sbostic if (isdigit(PEEK())) { 50255847Sbostic count2 = p_count(p); 50355847Sbostic REQUIRE(count <= count2, REG_BADBR); 50455847Sbostic } else /* single number with comma */ 50555847Sbostic count2 = INFINITY; 50655847Sbostic } else /* just a single number */ 50755847Sbostic count2 = count; 50855847Sbostic repeat(p, pos, count, count2); 50955847Sbostic if (!EATTWO('\\', '}')) { /* error heuristics */ 51055847Sbostic while (!SEE('\0') && !SEETWO('\\', '}')) 51155847Sbostic NEXT(); 51255847Sbostic if (SEE('\0')) 51355847Sbostic SETERROR(REG_EBRACE); 51455847Sbostic else 51555847Sbostic SETERROR(REG_BADBR); 51655847Sbostic } 51755847Sbostic } else if (c == '$') /* unbackslashed $ not followed by reptn */ 51855847Sbostic return(1); 51955847Sbostic 52055847Sbostic return(0); 52155847Sbostic } 52255847Sbostic 52355847Sbostic /* 52455847Sbostic - p_count - parse a repetition count 525*56355Sbostic == static int p_count(register struct parse *p); 52655847Sbostic */ 52755847Sbostic static int /* the value */ 52855847Sbostic p_count(p) 52955847Sbostic register struct parse *p; 53055847Sbostic { 53155847Sbostic register int count = 0; 53255847Sbostic register int ndigits = 0; 53355847Sbostic 53455847Sbostic while (isdigit(PEEK()) && count <= DUPMAX) { 53555847Sbostic count = count*10 + (GETNEXT() - '0'); 53655847Sbostic ndigits++; 53755847Sbostic } 53855847Sbostic 53955847Sbostic REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 54055847Sbostic return(count); 54155847Sbostic } 54255847Sbostic 54355847Sbostic /* 54455847Sbostic - p_bracket - parse a bracketed character list 545*56355Sbostic == static void p_bracket(register struct parse *p); 54655847Sbostic * 54755847Sbostic * Note a significant property of this code: if the allocset() did SETERROR, 54855847Sbostic * no set operations are done. 54955847Sbostic */ 55055847Sbostic static void 55155847Sbostic p_bracket(p) 55255847Sbostic register struct parse *p; 55355847Sbostic { 55455847Sbostic register uchar c; 55555847Sbostic register cset *cs = allocset(p); 55655847Sbostic register int invert = 0; 55755847Sbostic 55855847Sbostic if (EAT('^')) 55955847Sbostic invert++; /* make note to invert set at end */ 56055847Sbostic if (EAT(']')) 56155847Sbostic CHadd(cs, ']'); 56255847Sbostic while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 56355847Sbostic p_b_term(p, cs); 56455847Sbostic if (EAT('-')) 56555847Sbostic CHadd(cs, '-'); 56655847Sbostic MUSTEAT(']', REG_EBRACK); 56755847Sbostic 56855847Sbostic if (invert && p->error == 0) { 56955847Sbostic register int i; 57055847Sbostic 57155847Sbostic for (i = p->g->csetsize - 1; i >= 0; i--) 57255847Sbostic if (CHIN(cs, i)) 57355847Sbostic CHsub(cs, i); 57455847Sbostic else 57555847Sbostic CHadd(cs, i); 57655847Sbostic if (p->g->cflags®_NEWLINE) 57755847Sbostic CHsub(cs, '\n'); 57855847Sbostic if (cs->multis != NULL) 57955847Sbostic mcinvert(p, cs); 58055847Sbostic } 58155847Sbostic assert(cs->multis == NULL); /* xxx */ 58255847Sbostic EMIT(OANYOF, freezeset(p, cs)); 58355847Sbostic } 58455847Sbostic 58555847Sbostic /* 58655847Sbostic - p_b_term - parse one term of a bracketed character list 587*56355Sbostic == static void p_b_term(register struct parse *p, register cset *cs); 58855847Sbostic */ 58955847Sbostic static void 59055847Sbostic p_b_term(p, cs) 59155847Sbostic register struct parse *p; 59255847Sbostic register cset *cs; 59355847Sbostic { 59455847Sbostic register uchar c; 59555847Sbostic register uchar start, finish; 59655847Sbostic register int i; 59755847Sbostic 59855847Sbostic /* classify what we've got */ 59955847Sbostic switch (PEEK()) { 60055847Sbostic case '[': 60155847Sbostic c = PEEK2(); 60255847Sbostic break; 60355847Sbostic case '-': 60455847Sbostic SETERROR(REG_ERANGE); 60555847Sbostic return; /* NOTE RETURN */ 60655847Sbostic break; 60755847Sbostic default: 60855847Sbostic c = '\0'; 60955847Sbostic break; 61055847Sbostic } 61155847Sbostic 61255847Sbostic switch (c) { 61355847Sbostic case ':': /* character class */ 61455847Sbostic NEXT2(); 61555847Sbostic c = PEEK(); 61655847Sbostic REQUIRE(c != '\0', REG_EBRACK); 61755847Sbostic REQUIRE(c != '-' && c != ']', REG_ECTYPE); 61855847Sbostic p_b_cclass(p, cs); 61955847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 62055847Sbostic REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 62155847Sbostic break; 62255847Sbostic case '=': /* equivalence class */ 62355847Sbostic NEXT2(); 62455847Sbostic c = PEEK(); 62555847Sbostic REQUIRE(c != '\0', REG_EBRACK); 62655847Sbostic REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 62755847Sbostic p_b_eclass(p, cs); 62855847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 62955847Sbostic REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 63055847Sbostic break; 63155847Sbostic default: /* symbol, ordinary character, or range */ 63255847Sbostic /* xxx revision needed for multichar stuff */ 63355847Sbostic start = p_b_symbol(p); 63455847Sbostic if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 63555847Sbostic /* range */ 63655847Sbostic NEXT(); 63755847Sbostic if (EAT('-')) 63855847Sbostic finish = '-'; 63955847Sbostic else 64055847Sbostic finish = p_b_symbol(p); 64155847Sbostic } else 64255847Sbostic finish = start; 64355847Sbostic REQUIRE(start <= finish, REG_ERANGE); 64455847Sbostic for (i = start; i <= finish; i++) { 64555847Sbostic CHadd(cs, i); 64655847Sbostic if ((p->g->cflags®_ICASE) && isalpha(i)) { 64755847Sbostic c = othercase((uchar)i); 64855847Sbostic CHadd(cs, c); 64955847Sbostic } 65055847Sbostic } 65155847Sbostic break; 65255847Sbostic } 65355847Sbostic } 65455847Sbostic 65555847Sbostic /* 65655847Sbostic - p_b_cclass - parse a character-class name and deal with it 657*56355Sbostic == static void p_b_cclass(register struct parse *p, register cset *cs); 65855847Sbostic */ 65955847Sbostic static void 66055847Sbostic p_b_cclass(p, cs) 66155847Sbostic register struct parse *p; 66255847Sbostic register cset *cs; 66355847Sbostic { 66455847Sbostic register uchar *sb = p->next; 66555847Sbostic register uchar *se = sb; 66655847Sbostic register struct cclass *cp; 66755847Sbostic register int len; 66855847Sbostic register uchar *u; 66955847Sbostic register uchar c; 67055847Sbostic 67155847Sbostic while (isalpha(*se)) 67255847Sbostic se++; 67355847Sbostic len = se - sb; 67455847Sbostic NEXTn(len); 67555847Sbostic for (cp = cclasses; cp->name != NULL; cp++) 67655847Sbostic if (strncmp(cp->name, (char *)sb, len) == 0 && 677*56355Sbostic cp->name[len] == '\0') 67855847Sbostic break; 67955847Sbostic if (cp->name == NULL) { 68055847Sbostic /* oops, didn't find it */ 68155847Sbostic SETERROR(REG_ECTYPE); 68255847Sbostic return; 68355847Sbostic } 68455847Sbostic 68555847Sbostic u = (uchar *)cp->chars; 68655847Sbostic while ((c = *u++) != '\0') 68755847Sbostic CHadd(cs, c); 68855847Sbostic for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 68955847Sbostic MCadd(cs, u); 69055847Sbostic } 69155847Sbostic 69255847Sbostic /* 69355847Sbostic - p_b_eclass - parse an equivalence-class name and deal with it 694*56355Sbostic == static void p_b_eclass(register struct parse *p, register cset *cs); 69555847Sbostic * 69655847Sbostic * This implementation is incomplete. xxx 69755847Sbostic */ 69855847Sbostic static void 69955847Sbostic p_b_eclass(p, cs) 70055847Sbostic register struct parse *p; 70155847Sbostic register cset *cs; 70255847Sbostic { 70355847Sbostic register uchar c; 70455847Sbostic 70555847Sbostic c = p_b_coll_elem(p, '='); 70655847Sbostic CHadd(cs, c); 70755847Sbostic } 70855847Sbostic 70955847Sbostic /* 71055847Sbostic - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 711*56355Sbostic == static uchar p_b_symbol(register struct parse *p); 71255847Sbostic */ 71355847Sbostic static uchar /* value of symbol */ 71455847Sbostic p_b_symbol(p) 71555847Sbostic register struct parse *p; 71655847Sbostic { 71755847Sbostic register uchar value; 71855847Sbostic 71955847Sbostic if (!EATTWO('[', '.')) { 72055847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 72155847Sbostic return(GETNEXT()); 72255847Sbostic } 72355847Sbostic 72455847Sbostic /* collating symbol */ 72555847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 72655847Sbostic value = p_b_coll_elem(p, '.'); 72755847Sbostic REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 72855847Sbostic return(value); 72955847Sbostic } 73055847Sbostic 73155847Sbostic /* 73255847Sbostic - p_b_coll_elem - parse a collating-element name and look it up 733*56355Sbostic == static uchar p_b_coll_elem(register struct parse *p, uchar endc); 73455847Sbostic */ 73555847Sbostic static uchar /* value of collating element */ 73655847Sbostic p_b_coll_elem(p, endc) 73755847Sbostic register struct parse *p; 73855847Sbostic uchar endc; /* name ended by endc,']' */ 73955847Sbostic { 74055847Sbostic register uchar *sp = p->next; 74155847Sbostic register struct cname *cp; 74255847Sbostic register int len; 74355847Sbostic register uchar c; 74455847Sbostic 74555847Sbostic while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 74655847Sbostic NEXT(); 74755847Sbostic if (c == '\0') { 74855847Sbostic SETERROR(REG_EBRACK); 74955847Sbostic return(0); 75055847Sbostic } 75155847Sbostic len = p->next - sp; 75255847Sbostic for (cp = cnames; cp->name != NULL; cp++) 75355847Sbostic if (strncmp(cp->name, (char *)sp, len) == 0 && 754*56355Sbostic cp->name[len] == '\0') 75555847Sbostic return(cp->code); /* known name */ 75655847Sbostic if (len == 1) 75755847Sbostic return(*sp); /* single character */ 75855847Sbostic SETERROR(REG_ECOLLATE); /* neither */ 75955847Sbostic return(0); 76055847Sbostic } 76155847Sbostic 76255847Sbostic /* 76355847Sbostic - othercase - return the case counterpart of an alphabetic 764*56355Sbostic == static uchar othercase(uchar ch); 76555847Sbostic */ 76655847Sbostic static uchar 76755847Sbostic othercase(ch) 76855847Sbostic uchar ch; 76955847Sbostic { 77055847Sbostic assert(isalpha(ch)); 77155847Sbostic if (isupper(ch)) 77255847Sbostic return(tolower(ch)); 77355847Sbostic else if (islower(ch)) 77455847Sbostic return(toupper(ch)); 77555847Sbostic else /* peculiar, but could happen */ 77655847Sbostic return(ch); 77755847Sbostic } 77855847Sbostic 77955847Sbostic /* 78055847Sbostic - bothcases - emit a dualcase version of a character 781*56355Sbostic == static void bothcases(register struct parse *p, uchar ch); 782*56355Sbostic * 78355847Sbostic * Boy, is this implementation ever a kludge... 78455847Sbostic */ 78555847Sbostic static void 78655847Sbostic bothcases(p, ch) 78755847Sbostic register struct parse *p; 78855847Sbostic uchar ch; 78955847Sbostic { 79055847Sbostic register uchar *oldnext; 79155847Sbostic uchar bracket[3]; 79255847Sbostic 79355847Sbostic oldnext = p->next; 79455847Sbostic p->next = bracket; 79555847Sbostic bracket[0] = ch; 79655847Sbostic bracket[1] = ']'; 79755847Sbostic bracket[2] = '\0'; 79855847Sbostic p_bracket(p); 79955847Sbostic assert(p->next == bracket+2); 80055847Sbostic p->next = oldnext; 80155847Sbostic } 80255847Sbostic 80355847Sbostic /* 80455847Sbostic - ordinary - emit an ordinary character 805*56355Sbostic == static void ordinary(register struct parse *p, register uchar ch); 80655847Sbostic */ 80755847Sbostic static void 80855847Sbostic ordinary(p, ch) 80955847Sbostic register struct parse *p; 81055847Sbostic register uchar ch; 81155847Sbostic { 81255847Sbostic register uchar *cap = p->g->categories; 81355847Sbostic 81455847Sbostic if ((p->g->cflags®_ICASE) && isalpha(ch)) { 81555847Sbostic bothcases(p, ch); 81655847Sbostic return; 81755847Sbostic } 81855847Sbostic 81955847Sbostic EMIT(OCHAR, ch); 82055847Sbostic if (cap[ch] == 0) 82155847Sbostic cap[ch] = p->g->ncategories++; 82255847Sbostic } 82355847Sbostic 82455847Sbostic /* 82555847Sbostic - nonnewline - emit REG_NEWLINE version of OANY 826*56355Sbostic == static void nonnewline(register struct parse *p); 827*56355Sbostic * 82855847Sbostic * Boy, is this implementation ever a kludge... 82955847Sbostic */ 83055847Sbostic static void 83155847Sbostic nonnewline(p) 83255847Sbostic register struct parse *p; 83355847Sbostic { 83455847Sbostic register uchar *oldnext; 83555847Sbostic uchar bracket[4]; 83655847Sbostic 83755847Sbostic oldnext = p->next; 83855847Sbostic p->next = bracket; 83955847Sbostic bracket[0] = '^'; 84055847Sbostic bracket[1] = '\n'; 84155847Sbostic bracket[2] = ']'; 84255847Sbostic bracket[3] = '\0'; 84355847Sbostic p_bracket(p); 84455847Sbostic assert(p->next == bracket+3); 84555847Sbostic p->next = oldnext; 84655847Sbostic } 84755847Sbostic 84855847Sbostic /* 84955847Sbostic - repeat - generate code for a bounded repetition, recursively if needed 850*56355Sbostic == static void repeat(register struct parse *p, sopno start, int from, int to); 85155847Sbostic */ 85255847Sbostic static void 85355847Sbostic repeat(p, start, from, to) 85455847Sbostic register struct parse *p; 85555847Sbostic sopno start; /* operand from here to end of strip */ 85655847Sbostic int from; /* repeated from this number */ 85755847Sbostic int to; /* to this number of times (maybe INFINITY) */ 85855847Sbostic { 85955847Sbostic register sopno finish = HERE(); 86055847Sbostic # define N 2 86155847Sbostic # define INF 3 86255847Sbostic # define REP(f, t) ((f)*8 + (t)) 86355847Sbostic # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 86455847Sbostic register sopno copy; 86555847Sbostic 86655847Sbostic if (p->error != 0) /* head off possible runaway recursion */ 86755847Sbostic return; 86855847Sbostic 86955847Sbostic assert(from <= to); 87055847Sbostic 87155847Sbostic switch (REP(MAP(from), MAP(to))) { 87255847Sbostic case REP(0, 0): /* must be user doing this */ 87355847Sbostic DROP(finish-start); /* drop the operand */ 87455847Sbostic break; 87555847Sbostic case REP(0, 1): /* as x{1,1}? */ 87655847Sbostic case REP(0, N): /* as x{1,n}? */ 87755847Sbostic case REP(0, INF): /* as x{1,}? */ 87855847Sbostic INSERT(OQUEST_, start); /* offset is wrong... */ 87955847Sbostic repeat(p, start+1, 1, to); 88055847Sbostic FWD(start); /* ... fix it */ 88155847Sbostic BACK(O_QUEST, start); 88255847Sbostic break; 88355847Sbostic case REP(1, 1): /* trivial case */ 88455847Sbostic /* done */ 88555847Sbostic break; 88655847Sbostic case REP(1, N): /* as x?x{1,n-1} */ 88755847Sbostic INSERT(OQUEST_, start); 88855847Sbostic BACK(O_QUEST, start); 88955847Sbostic copy = dupl(p, start+1, finish+1); 89055847Sbostic assert(copy == finish+2); 89155847Sbostic repeat(p, copy, 1, to-1); 89255847Sbostic break; 89355847Sbostic case REP(1, INF): /* as x+ */ 89455847Sbostic INSERT(OPLUS_, start); 89555847Sbostic BACK(O_PLUS, start); 89655847Sbostic break; 89755847Sbostic case REP(N, N): /* as xx{m-1,n-1} */ 89855847Sbostic copy = dupl(p, start, finish); 89955847Sbostic repeat(p, copy, from-1, to-1); 90055847Sbostic break; 90155847Sbostic case REP(N, INF): /* as xx{n-1,INF} */ 90255847Sbostic copy = dupl(p, start, finish); 90355847Sbostic repeat(p, copy, from-1, to); 90455847Sbostic break; 90555847Sbostic default: /* "can't happen" */ 90655847Sbostic SETERROR(REG_ASSERT); /* just in case */ 90755847Sbostic break; 90855847Sbostic } 90955847Sbostic } 91055847Sbostic 91155847Sbostic /* 91255847Sbostic - seterr - set an error condition 913*56355Sbostic == static int seterr(register struct parse *p, int e); 91455847Sbostic */ 91555847Sbostic static int /* useless but makes type checking happy */ 91655847Sbostic seterr(p, e) 91755847Sbostic register struct parse *p; 91855847Sbostic int e; 91955847Sbostic { 92055847Sbostic if (p->error == 0) /* keep earliest error condition */ 92155847Sbostic p->error = e; 92255847Sbostic p->next = nuls; /* try to bring things to a halt */ 92355847Sbostic return(0); /* make the return value well-defined */ 92455847Sbostic } 92555847Sbostic 92655847Sbostic /* 92755847Sbostic - allocset - allocate a set of characters for [] 928*56355Sbostic == static cset *allocset(register struct parse *p); 92955847Sbostic */ 93055847Sbostic static cset * 93155847Sbostic allocset(p) 93255847Sbostic register struct parse *p; 93355847Sbostic { 93455847Sbostic register int no = p->g->ncsets++; 93555847Sbostic register size_t nc; 93655847Sbostic register size_t nbytes; 93755847Sbostic register cset *cs; 93855847Sbostic register size_t css = (size_t)p->g->csetsize; 93955847Sbostic 94055847Sbostic if (no >= p->ncsalloc) { /* need another column of space */ 94155847Sbostic p->ncsalloc += CHAR_BIT; 94255847Sbostic nc = p->ncsalloc; 94355847Sbostic assert(nc % CHAR_BIT == 0); 94455847Sbostic nbytes = nc / CHAR_BIT * css; 94555847Sbostic if (p->g->sets == NULL) 94655847Sbostic p->g->sets = (cset *)malloc(nc * sizeof(cset)); 94755847Sbostic else 94855847Sbostic p->g->sets = (cset *)realloc((char *)p->g->sets, 94955847Sbostic nc * sizeof(cset)); 95055847Sbostic if (p->g->setbits == NULL) 95155847Sbostic p->g->setbits = (uchar *)malloc(nbytes); 95255847Sbostic else 95355847Sbostic p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 95455847Sbostic nbytes); 95555847Sbostic if (p->g->sets != NULL && p->g->setbits != NULL) 95655847Sbostic (void) memset((char *)p->g->setbits + (nbytes - css), 95755847Sbostic 0, css); 95855847Sbostic else { 95955847Sbostic no = 0; 96055847Sbostic SETERROR(REG_ESPACE); 96155847Sbostic /* caller's responsibility not to do set ops */ 96255847Sbostic } 96355847Sbostic } 96455847Sbostic 96555847Sbostic assert(p->g->sets != NULL); /* xxx */ 96655847Sbostic cs = &p->g->sets[no]; 96755847Sbostic cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 96855847Sbostic cs->mask = 1 << ((no) % CHAR_BIT); 96955847Sbostic cs->hash = 0; 97055847Sbostic cs->smultis = 0; 97155847Sbostic cs->multis = NULL; 97255847Sbostic 97355847Sbostic return(cs); 97455847Sbostic } 97555847Sbostic 97655847Sbostic /* 97755847Sbostic - freezeset - final processing on a set of characters 978*56355Sbostic == static int freezeset(register struct parse *p, register cset *cs); 97955847Sbostic * 98055847Sbostic * The main task here is merging identical sets. This is usually a waste 98155847Sbostic * of time (although the hash code minimizes the overhead), but can win 98255847Sbostic * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 98355847Sbostic * is done using addition rather than xor -- all ASCII [aA] sets xor to 98455847Sbostic * the same value! 98555847Sbostic */ 98655847Sbostic static int /* set number */ 98755847Sbostic freezeset(p, cs) 98855847Sbostic register struct parse *p; 98955847Sbostic register cset *cs; 99055847Sbostic { 99155847Sbostic register uchar h = cs->hash; 99255847Sbostic register int i; 99355847Sbostic register cset *top = &p->g->sets[p->g->ncsets]; 99455847Sbostic register cset *cs2; 99555847Sbostic register size_t css = (size_t)p->g->csetsize; 99655847Sbostic 99755847Sbostic /* look for an earlier one which is the same */ 99855847Sbostic for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 99955847Sbostic if (cs2->hash == h && cs2 != cs) { 100055847Sbostic /* maybe */ 100155847Sbostic for (i = 0; i < css; i++) 100255847Sbostic if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 100355847Sbostic break; /* no */ 100455847Sbostic if (i == css) 100555847Sbostic break; /* yes */ 100655847Sbostic } 100755847Sbostic 100855847Sbostic if (cs2 < top) { /* found one */ 100955847Sbostic assert(cs == top-1); 101055847Sbostic p->g->ncsets--; 101155847Sbostic for (i = 0; i < css; i++) 101255847Sbostic CHsub(cs, i); 101355847Sbostic cs = cs2; 101455847Sbostic } 101555847Sbostic 101655847Sbostic return((int)(cs - p->g->sets)); 101755847Sbostic } 101855847Sbostic 101955847Sbostic /* 102055847Sbostic - mcadd - add a collating element to a cset 1021*56355Sbostic == static void mcadd(register struct parse *p, register cset *cs, \ 1022*56355Sbostic == register uchar *cp); 102355847Sbostic */ 102455847Sbostic static void 102555847Sbostic mcadd(p, cs, cp) 102655847Sbostic register struct parse *p; 102755847Sbostic register cset *cs; 102855847Sbostic register uchar *cp; 102955847Sbostic { 103055847Sbostic register size_t oldend = cs->smultis; 103155847Sbostic 103255847Sbostic cs->smultis += strlen((char *)cp) + 1; 103355847Sbostic if (cs->multis == NULL) 103455847Sbostic cs->multis = (uchar *)malloc(cs->smultis); 103555847Sbostic else 103655847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 103755847Sbostic if (cs->multis == NULL) { 103855847Sbostic SETERROR(REG_ESPACE); 103955847Sbostic return; 104055847Sbostic } 104155847Sbostic 104255847Sbostic (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 104355847Sbostic cs->multis[cs->smultis - 1] = '\0'; 104455847Sbostic } 104555847Sbostic 104655847Sbostic /* 104755847Sbostic - mcsub - subtract a collating element from a cset 1048*56355Sbostic == static void mcsub(register struct parse *p, register cset *cs, \ 1049*56355Sbostic == register uchar *cp); 105055847Sbostic */ 105155847Sbostic static void 105255847Sbostic mcsub(p, cs, cp) 105355847Sbostic register struct parse *p; 105455847Sbostic register cset *cs; 105555847Sbostic register uchar *cp; 105655847Sbostic { 105755847Sbostic register uchar *fp = mcfind(cs, cp); 105855847Sbostic register size_t len = strlen((char *)fp); 105955847Sbostic 106055847Sbostic assert(p != NULL); 106155847Sbostic (void) memmove((char *)fp, (char *)(fp + len + 1), 106255847Sbostic cs->smultis - (fp + len + 1 - cs->multis)); 106355847Sbostic cs->smultis -= len; 106455847Sbostic 106555847Sbostic if (cs->smultis == 0) { 106655847Sbostic free((char *)cs->multis); 106755847Sbostic cs->multis = NULL; 106855847Sbostic return; 106955847Sbostic } 107055847Sbostic 107155847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 107255847Sbostic assert(cs->multis != NULL); 107355847Sbostic } 107455847Sbostic 107555847Sbostic /* 107655847Sbostic - mcin - is a collating element in a cset? 1077*56355Sbostic == static int mcin(register struct parse *p, register cset *cs, \ 1078*56355Sbostic == register uchar *cp); 107955847Sbostic */ 108055847Sbostic static int 108155847Sbostic mcin(p, cs, cp) 108255847Sbostic register struct parse *p; 108355847Sbostic register cset *cs; 108455847Sbostic register uchar *cp; 108555847Sbostic { 108655847Sbostic return(mcfind(cs, cp) != NULL); 108755847Sbostic } 108855847Sbostic 108955847Sbostic /* 109055847Sbostic - mcfind - find a collating element in a cset 1091*56355Sbostic == static uchar *mcfind(register cset *cs, register uchar *cp); 109255847Sbostic */ 109355847Sbostic static uchar * 109455847Sbostic mcfind(cs, cp) 109555847Sbostic register cset *cs; 109655847Sbostic register uchar *cp; 109755847Sbostic { 109855847Sbostic register uchar *p; 109955847Sbostic 110055847Sbostic if (cs->multis == NULL) 110155847Sbostic return(NULL); 110255847Sbostic for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 110355847Sbostic if (strcmp((char *)cp, (char *)p) == 0) 110455847Sbostic return(p); 110555847Sbostic return(NULL); 110655847Sbostic } 110755847Sbostic 110855847Sbostic /* 110955847Sbostic - mcinvert - invert the list of collating elements in a cset 1110*56355Sbostic == static void mcinvert(register struct parse *p, register cset *cs); 111155847Sbostic * 111255847Sbostic * This would have to know the set of possibilities. Implementation 111355847Sbostic * is deferred. 111455847Sbostic */ 111555847Sbostic static void 111655847Sbostic mcinvert(p, cs) 111755847Sbostic register struct parse *p; 111855847Sbostic register cset *cs; 111955847Sbostic { 112055847Sbostic assert(cs->multis == NULL); /* xxx */ 112155847Sbostic } 112255847Sbostic 112355847Sbostic /* 112455847Sbostic - isinsets - is this character in any sets? 1125*56355Sbostic == static int isinsets(register struct re_guts *g, uchar c); 112655847Sbostic */ 112755847Sbostic static int /* predicate */ 112855847Sbostic isinsets(g, c) 112955847Sbostic register struct re_guts *g; 113055847Sbostic uchar c; 113155847Sbostic { 113255847Sbostic register uchar *col; 113355847Sbostic register int i; 113455847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 113555847Sbostic 113655847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 113755847Sbostic if (col[c] != 0) 113855847Sbostic return(1); 113955847Sbostic return(0); 114055847Sbostic } 114155847Sbostic 114255847Sbostic /* 114355847Sbostic - samesets - are these two characters in exactly the same sets? 1144*56355Sbostic == static int samesets(register struct re_guts *g, uchar c1, uchar c2); 114555847Sbostic */ 114655847Sbostic static int /* predicate */ 114755847Sbostic samesets(g, c1, c2) 114855847Sbostic register struct re_guts *g; 114955847Sbostic register uchar c1; 115055847Sbostic register uchar c2; 115155847Sbostic { 115255847Sbostic register uchar *col; 115355847Sbostic register int i; 115455847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 115555847Sbostic 115655847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 115755847Sbostic if (col[c1] != col[c2]) 115855847Sbostic return(0); 115955847Sbostic return(1); 116055847Sbostic } 116155847Sbostic 116255847Sbostic /* 116355847Sbostic - categorize - sort out character categories 1164*56355Sbostic == static void categorize(struct parse *p, register struct re_guts *g); 116555847Sbostic */ 116655847Sbostic static void 116755847Sbostic categorize(p, g) 116855847Sbostic struct parse *p; 116955847Sbostic register struct re_guts *g; 117055847Sbostic { 117155847Sbostic register uchar *cats = g->categories; 117255847Sbostic register unsigned c; 117355847Sbostic register unsigned c2; 117455847Sbostic register uchar cat; 117555847Sbostic 117655847Sbostic /* avoid making error situations worse */ 117755847Sbostic if (p->error != 0) 117855847Sbostic return; 117955847Sbostic 118055847Sbostic for (c = 0; c < g->csetsize; c++) 118155847Sbostic if (cats[c] == 0 && isinsets(g, c)) { 118255847Sbostic cat = g->ncategories++; 118355847Sbostic cats[c] = cat; 118455847Sbostic for (c2 = c+1; c2 < g->csetsize; c2++) 118555847Sbostic if (cats[c2] == 0 && samesets(g, c, c2)) 118655847Sbostic cats[c2] = cat; 118755847Sbostic } 118855847Sbostic } 118955847Sbostic 119055847Sbostic /* 119155847Sbostic - dupl - emit a duplicate of a bunch of sops 1192*56355Sbostic == static sopno dupl(register struct parse *p, sopno start, sopno finish); 119355847Sbostic */ 119455847Sbostic static sopno /* start of duplicate */ 119555847Sbostic dupl(p, start, finish) 119655847Sbostic register struct parse *p; 119755847Sbostic sopno start; /* from here */ 119855847Sbostic sopno finish; /* to this less one */ 119955847Sbostic { 120055847Sbostic register sopno ret = HERE(); 120155847Sbostic register sopno len = finish - start; 120255847Sbostic 120355847Sbostic assert(finish >= start); 120455847Sbostic if (len == 0) 120555847Sbostic return(ret); 120655847Sbostic enlarge(p, p->ssize + len); /* this many unexpected additions */ 120755847Sbostic assert(p->ssize >= p->slen + len); 120855847Sbostic (void) memcpy((char *)(p->strip + p->slen), 120955847Sbostic (char *)(p->strip + start), (size_t)len*sizeof(sop)); 121055847Sbostic p->slen += len; 121155847Sbostic return(ret); 121255847Sbostic } 121355847Sbostic 121455847Sbostic /* 121555847Sbostic - doemit - emit a strip operator 1216*56355Sbostic == static void doemit(register struct parse *p, sop op, size_t opnd); 121755847Sbostic * 121855847Sbostic * It might seem better to implement this as a macro with a function as 121955847Sbostic * hard-case backup, but it's just too big and messy unless there are 122055847Sbostic * some changes to the data structures. Maybe later. 122155847Sbostic */ 122255847Sbostic static void 122355847Sbostic doemit(p, op, opnd) 122455847Sbostic register struct parse *p; 122555847Sbostic sop op; 122655847Sbostic size_t opnd; 122755847Sbostic { 122855847Sbostic /* avoid making error situations worse */ 122955847Sbostic if (p->error != 0) 123055847Sbostic return; 123155847Sbostic 123255847Sbostic /* deal with oversize operands ("can't happen", more or less) */ 123355847Sbostic assert(opnd < 1<<OPSHIFT); 123455847Sbostic 123555847Sbostic /* deal with undersized strip */ 123655847Sbostic if (p->slen >= p->ssize) 123755847Sbostic enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 123855847Sbostic assert(p->slen < p->ssize); 123955847Sbostic 124055847Sbostic /* finally, it's all reduced to the easy case */ 124155847Sbostic p->strip[p->slen++] = SOP(op, opnd); 124255847Sbostic } 124355847Sbostic 124455847Sbostic /* 124555847Sbostic - doinsert - insert a sop into the strip 1246*56355Sbostic == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 124755847Sbostic */ 124855847Sbostic static void 124955847Sbostic doinsert(p, op, opnd, pos) 125055847Sbostic register struct parse *p; 125155847Sbostic sop op; 125255847Sbostic size_t opnd; 125355847Sbostic sopno pos; 125455847Sbostic { 125555847Sbostic register sopno sn; 125655847Sbostic register sop s; 125755847Sbostic register int i; 125855847Sbostic 125955847Sbostic /* avoid making error situations worse */ 126055847Sbostic if (p->error != 0) 126155847Sbostic return; 126255847Sbostic 126355847Sbostic sn = HERE(); 126455847Sbostic EMIT(op, opnd); /* do checks, ensure space */ 126555847Sbostic assert(HERE() == sn+1); 126655847Sbostic s = p->strip[sn]; 126755847Sbostic 126855847Sbostic /* adjust paren pointers */ 126955847Sbostic assert(pos > 0); 127055847Sbostic for (i = 1; i < NPAREN; i++) { 127155847Sbostic if (p->pbegin[i] >= pos) { 127255847Sbostic p->pbegin[i]++; 127355847Sbostic } 127455847Sbostic if (p->pend[i] >= pos) { 127555847Sbostic p->pend[i]++; 127655847Sbostic } 127755847Sbostic } 127855847Sbostic 127955847Sbostic memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 128055847Sbostic (HERE()-pos-1)*sizeof(sop)); 128155847Sbostic p->strip[pos] = s; 128255847Sbostic } 128355847Sbostic 128455847Sbostic /* 128555847Sbostic - dofwd - complete a forward reference 1286*56355Sbostic == static void dofwd(register struct parse *p, sopno pos, sop value); 128755847Sbostic */ 128855847Sbostic static void 128955847Sbostic dofwd(p, pos, value) 129055847Sbostic register struct parse *p; 129155847Sbostic register sopno pos; 129255847Sbostic sop value; 129355847Sbostic { 129455847Sbostic /* avoid making error situations worse */ 129555847Sbostic if (p->error != 0) 129655847Sbostic return; 129755847Sbostic 129855847Sbostic assert(value < 1<<OPSHIFT); 129955847Sbostic p->strip[pos] = OP(p->strip[pos]) | value; 130055847Sbostic } 130155847Sbostic 130255847Sbostic /* 130355847Sbostic - enlarge - enlarge the strip 1304*56355Sbostic == static void enlarge(register struct parse *p, sopno size); 130555847Sbostic */ 130655847Sbostic static void 130755847Sbostic enlarge(p, size) 130855847Sbostic register struct parse *p; 130955847Sbostic register sopno size; 131055847Sbostic { 131155847Sbostic register sop *sp; 131255847Sbostic 131355847Sbostic if (p->ssize >= size) 131455847Sbostic return; 131555847Sbostic 131655847Sbostic sp = (sop *)realloc(p->strip, size*sizeof(sop)); 131755847Sbostic if (sp == NULL) { 131855847Sbostic SETERROR(REG_ESPACE); 131955847Sbostic return; 132055847Sbostic } 132155847Sbostic p->strip = sp; 132255847Sbostic p->ssize = size; 132355847Sbostic } 132455847Sbostic 132555847Sbostic /* 132655847Sbostic - stripsnug - compact the strip 1327*56355Sbostic == static void stripsnug(register struct parse *p, register struct re_guts *g); 132855847Sbostic */ 132955847Sbostic static void 133055847Sbostic stripsnug(p, g) 133155847Sbostic register struct parse *p; 133255847Sbostic register struct re_guts *g; 133355847Sbostic { 133455847Sbostic g->nstates = p->slen; 133555847Sbostic g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 133655847Sbostic if (g->strip == NULL) { 133755847Sbostic SETERROR(REG_ESPACE); 133855847Sbostic g->strip = p->strip; 133955847Sbostic } 134055847Sbostic } 134155847Sbostic 134255847Sbostic /* 134355847Sbostic - findmust - fill in must and mlen with longest mandatory literal string 1344*56355Sbostic == static void findmust(register struct parse *p, register struct re_guts *g); 134555847Sbostic * 134655847Sbostic * This algorithm could do fancy things like analyzing the operands of | 134755847Sbostic * for common subsequences. Someday. This code is simple and finds most 134855847Sbostic * of the interesting cases. 134955847Sbostic * 135055847Sbostic * Note that must and mlen got initialized during setup. 135155847Sbostic */ 1352*56355Sbostic static void 135355847Sbostic findmust(p, g) 135455847Sbostic struct parse *p; 135555847Sbostic register struct re_guts *g; 135655847Sbostic { 135755847Sbostic register sop *scan; 135855847Sbostic sop *start; 135955847Sbostic register sop *newstart; 136055847Sbostic register sopno newlen; 136155847Sbostic register sop s; 136255847Sbostic register char *cp; 136355847Sbostic register sopno i; 136455847Sbostic 136555847Sbostic /* avoid making error situations worse */ 136655847Sbostic if (p->error != 0) 136755847Sbostic return; 136855847Sbostic 136955847Sbostic /* find the longest OCHAR sequence in strip */ 137055847Sbostic newlen = 0; 137155847Sbostic scan = g->strip + 1; 137255847Sbostic do { 137355847Sbostic s = *scan++; 137455847Sbostic switch (OP(s)) { 137555847Sbostic case OCHAR: /* sequence member */ 137655847Sbostic if (newlen == 0) /* new sequence */ 137755847Sbostic newstart = scan - 1; 137855847Sbostic newlen++; 137955847Sbostic break; 138055847Sbostic case OPLUS_: /* things that don't break one */ 138155847Sbostic case OLPAREN: 138255847Sbostic case ORPAREN: 138355847Sbostic break; 138455847Sbostic case OQUEST_: /* things that must be skipped */ 138555847Sbostic case OCH_: 138655847Sbostic scan--; 138755847Sbostic do { 138855847Sbostic scan += OPND(s); 138955847Sbostic s = *scan; 139055847Sbostic /* assert() interferes w debug printouts */ 139155847Sbostic if (OP(s) != O_QUEST && OP(s) != O_CH && 139255847Sbostic OP(s) != OOR2) { 139355847Sbostic g->iflags |= BAD; 139455847Sbostic return; 139555847Sbostic } 139655847Sbostic } while (OP(s) != O_QUEST && OP(s) != O_CH); 139755847Sbostic /* fallthrough */ 139855847Sbostic default: /* things that break a sequence */ 139955847Sbostic if (newlen > g->mlen) { /* ends one */ 140055847Sbostic start = newstart; 140155847Sbostic g->mlen = newlen; 140255847Sbostic } 140355847Sbostic newlen = 0; 140455847Sbostic break; 140555847Sbostic } 140655847Sbostic } while (OP(s) != OEND); 140755847Sbostic 140855847Sbostic if (g->mlen == 0) /* there isn't one */ 140955847Sbostic return; 141055847Sbostic 141155847Sbostic /* turn it into a character string */ 141255847Sbostic g->must = malloc((size_t)g->mlen + 1); 141355847Sbostic if (g->must == NULL) { /* argh; just forget it */ 141455847Sbostic g->mlen = 0; 141555847Sbostic return; 141655847Sbostic } 141755847Sbostic cp = g->must; 141855847Sbostic scan = start; 141955847Sbostic for (i = g->mlen; i > 0; i--) { 142055847Sbostic while (OP(s = *scan++) != OCHAR) 142155847Sbostic continue; 142255847Sbostic *cp++ = OPND(s); 142355847Sbostic } 142455847Sbostic *cp++ = '\0'; /* just on general principles */ 142555847Sbostic } 142655847Sbostic 142755847Sbostic /* 142855847Sbostic - pluscount - count + nesting 1429*56355Sbostic == static sopno pluscount(register struct parse *p, register struct re_guts *g); 143055847Sbostic */ 1431*56355Sbostic static sopno /* nesting depth */ 143255847Sbostic pluscount(p, g) 143355847Sbostic struct parse *p; 143455847Sbostic register struct re_guts *g; 143555847Sbostic { 143655847Sbostic register sop *scan; 143755847Sbostic register sop s; 143855847Sbostic register sopno plusnest = 0; 143955847Sbostic register sopno maxnest = 0; 144055847Sbostic 144155847Sbostic if (p->error != 0) 144255847Sbostic return(0); /* there may not be an OEND */ 144355847Sbostic 144455847Sbostic scan = g->strip + 1; 144555847Sbostic do { 144655847Sbostic s = *scan++; 144755847Sbostic switch (OP(s)) { 144855847Sbostic case OPLUS_: 144955847Sbostic plusnest++; 145055847Sbostic break; 145155847Sbostic case O_PLUS: 145255847Sbostic if (plusnest > maxnest) 145355847Sbostic maxnest = plusnest; 145455847Sbostic plusnest--; 145555847Sbostic break; 145655847Sbostic } 145755847Sbostic } while (OP(s) != OEND); 145855847Sbostic if (plusnest != 0) 145955847Sbostic g->iflags |= BAD; 146055847Sbostic return(maxnest); 146155847Sbostic } 1462