155847Sbostic /*- 266362Sbostic * Copyright (c) 1992, 1993, 1994 Henry Spencer. 366362Sbostic * Copyright (c) 1992, 1993, 1994 461162Sbostic * The Regents of the University of California. All rights reserved. 555847Sbostic * 655847Sbostic * This code is derived from software contributed to Berkeley by 755847Sbostic * Henry Spencer of the University of Toronto. 855847Sbostic * 955847Sbostic * %sccs.include.redist.c% 1055847Sbostic * 11*66385Sbostic * @(#)regcomp.c 8.4 (Berkeley) 03/19/94 1255847Sbostic */ 1355847Sbostic 1455847Sbostic #if defined(LIBC_SCCS) && !defined(lint) 15*66385Sbostic static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 03/19/94"; 1655847Sbostic #endif /* LIBC_SCCS and not lint */ 1755847Sbostic 1855847Sbostic #include <sys/types.h> 1955847Sbostic #include <stdio.h> 2055847Sbostic #include <string.h> 2155847Sbostic #include <ctype.h> 2255847Sbostic #include <limits.h> 2355847Sbostic #include <stdlib.h> 2455847Sbostic #include <regex.h> 2555847Sbostic 2655847Sbostic #include "utils.h" 2755847Sbostic #include "regex2.h" 2855847Sbostic 2955847Sbostic #include "cclass.h" 3055847Sbostic #include "cname.h" 3155847Sbostic 3255847Sbostic /* 3355847Sbostic * parse structure, passed up and down to avoid global variables and 3455847Sbostic * other clumsinesses 3555847Sbostic */ 3655847Sbostic struct parse { 3760201Sbostic char *next; /* next character in RE */ 3860201Sbostic char *end; /* end of string (-> NUL normally) */ 3955847Sbostic int error; /* has an error been seen? */ 4055847Sbostic sop *strip; /* malloced strip */ 4155847Sbostic sopno ssize; /* malloced strip size (allocated) */ 4255847Sbostic sopno slen; /* malloced strip length (used) */ 4355847Sbostic int ncsalloc; /* number of csets allocated */ 4455847Sbostic struct re_guts *g; 4555847Sbostic # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 4655847Sbostic sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 4755847Sbostic sopno pend[NPAREN]; /* -> ) ([0] unused) */ 4855847Sbostic }; 4955847Sbostic 5066362Sbostic /* ========= begin header generated by ./mkh ========= */ 5166362Sbostic #ifdef __cplusplus 5266362Sbostic extern "C" { 5366362Sbostic #endif 5456355Sbostic 5566362Sbostic /* === regcomp.c === */ 56*66385Sbostic static void p_ere __P((struct parse *p, int stop)); 57*66385Sbostic static void p_ere_exp __P((struct parse *p)); 58*66385Sbostic static void p_str __P((struct parse *p)); 59*66385Sbostic static void p_bre __P((struct parse *p, int end1, int end2)); 60*66385Sbostic static int p_simp_re __P((struct parse *p, int starordinary)); 61*66385Sbostic static int p_count __P((struct parse *p)); 62*66385Sbostic static void p_bracket __P((struct parse *p)); 63*66385Sbostic static void p_b_term __P((struct parse *p, cset *cs)); 64*66385Sbostic static void p_b_cclass __P((struct parse *p, cset *cs)); 65*66385Sbostic static void p_b_eclass __P((struct parse *p, cset *cs)); 66*66385Sbostic static char p_b_symbol __P((struct parse *p)); 67*66385Sbostic static char p_b_coll_elem __P((struct parse *p, int endc)); 68*66385Sbostic static char othercase __P((int ch)); 69*66385Sbostic static void bothcases __P((struct parse *p, int ch)); 70*66385Sbostic static void ordinary __P((struct parse *p, int ch)); 71*66385Sbostic static void nonnewline __P((struct parse *p)); 72*66385Sbostic static void repeat __P((struct parse *p, sopno start, int from, int to)); 73*66385Sbostic static int seterr __P((struct parse *p, int e)); 74*66385Sbostic static cset *allocset __P((struct parse *p)); 75*66385Sbostic static void freeset __P((struct parse *p, cset *cs)); 76*66385Sbostic static int freezeset __P((struct parse *p, cset *cs)); 77*66385Sbostic static int firstch __P((struct parse *p, cset *cs)); 78*66385Sbostic static int nch __P((struct parse *p, cset *cs)); 79*66385Sbostic static void mcadd __P((struct parse *p, cset *cs, char *cp)); 80*66385Sbostic static void mcsub __P((cset *cs, char *cp)); 81*66385Sbostic static int mcin __P((cset *cs, char *cp)); 82*66385Sbostic static char *mcfind __P((cset *cs, char *cp)); 83*66385Sbostic static void mcinvert __P((struct parse *p, cset *cs)); 84*66385Sbostic static void mccase __P((struct parse *p, cset *cs)); 85*66385Sbostic static int isinsets __P((struct re_guts *g, int c)); 86*66385Sbostic static int samesets __P((struct re_guts *g, int c1, int c2)); 87*66385Sbostic static void categorize __P((struct parse *p, struct re_guts *g)); 88*66385Sbostic static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 89*66385Sbostic static void doemit __P((struct parse *p, sop op, size_t opnd)); 90*66385Sbostic static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 91*66385Sbostic static void dofwd __P((struct parse *p, sopno pos, sop value)); 92*66385Sbostic static void enlarge __P((struct parse *p, sopno size)); 93*66385Sbostic static void stripsnug __P((struct parse *p, struct re_guts *g)); 94*66385Sbostic static void findmust __P((struct parse *p, struct re_guts *g)); 95*66385Sbostic static sopno pluscount __P((struct parse *p, struct re_guts *g)); 9660201Sbostic 9766362Sbostic #ifdef __cplusplus 9866362Sbostic } 9966362Sbostic #endif 10066362Sbostic /* ========= end header generated by ./mkh ========= */ 10166362Sbostic 10260201Sbostic static char nuls[10]; /* place to point scanner in event of error */ 10360201Sbostic 10455847Sbostic /* 10555847Sbostic * macros for use with parse structure 10655847Sbostic * BEWARE: these know that the parse structure is named `p' !!! 10755847Sbostic */ 10860201Sbostic #define PEEK() (*p->next) 10960201Sbostic #define PEEK2() (*(p->next+1)) 11060201Sbostic #define MORE() (p->next < p->end) 11160201Sbostic #define MORE2() (p->next+1 < p->end) 11260201Sbostic #define SEE(c) (MORE() && PEEK() == (c)) 11360201Sbostic #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 11455847Sbostic #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 11555847Sbostic #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 11655847Sbostic #define NEXT() (p->next++) 11755847Sbostic #define NEXT2() (p->next += 2) 11855847Sbostic #define NEXTn(n) (p->next += (n)) 11960201Sbostic #define GETNEXT() (*p->next++) 12055847Sbostic #define SETERROR(e) seterr(p, (e)) 12155847Sbostic #define REQUIRE(co, e) ((co) || SETERROR(e)) 12260201Sbostic #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 12360201Sbostic #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 12460201Sbostic #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 12566362Sbostic #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 12666362Sbostic #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 12760201Sbostic #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 12860201Sbostic #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 12955847Sbostic #define HERE() (p->slen) 13055847Sbostic #define THERE() (p->slen - 1) 13166381Sbostic #define THERETHERE() (p->slen - 2) 13255847Sbostic #define DROP(n) (p->slen -= (n)) 13355847Sbostic 13460201Sbostic #ifndef NDEBUG 13560201Sbostic static int never = 0; /* for use in asserts; shuts lint up */ 13666362Sbostic #else 13766362Sbostic #define never 0 /* some <assert.h>s have bugs too */ 13860201Sbostic #endif 13956357Sbostic 14055847Sbostic /* 14155847Sbostic - regcomp - interface for parser and compilation 14266362Sbostic = extern int regcomp(regex_t *, const char *, int); 14360201Sbostic = #define REG_BASIC 0000 14460201Sbostic = #define REG_EXTENDED 0001 14560201Sbostic = #define REG_ICASE 0002 14660201Sbostic = #define REG_NOSUB 0004 14760201Sbostic = #define REG_NEWLINE 0010 14860201Sbostic = #define REG_NOSPEC 0020 14960201Sbostic = #define REG_PEND 0040 15060201Sbostic = #define REG_DUMP 0200 15155847Sbostic */ 15255847Sbostic int /* 0 success, otherwise REG_something */ 15355847Sbostic regcomp(preg, pattern, cflags) 15455847Sbostic regex_t *preg; 15555847Sbostic const char *pattern; 15655847Sbostic int cflags; 15755847Sbostic { 15855847Sbostic struct parse pa; 15955847Sbostic register struct re_guts *g; 16055847Sbostic register struct parse *p = &pa; 16155847Sbostic register int i; 16260201Sbostic register size_t len; 16366362Sbostic #ifdef REDEBUG 16466362Sbostic # define GOODFLAGS(f) (f) 16566362Sbostic #else 16666362Sbostic # define GOODFLAGS(f) ((f)&~REG_DUMP) 16766362Sbostic #endif 16855847Sbostic 16966362Sbostic cflags = GOODFLAGS(cflags); 17060201Sbostic if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 17160201Sbostic return(REG_INVARG); 17260201Sbostic 17360201Sbostic if (cflags®_PEND) { 17460201Sbostic if (preg->re_endp < pattern) 17560201Sbostic return(REG_INVARG); 17660201Sbostic len = preg->re_endp - pattern; 17760201Sbostic } else 17860201Sbostic len = strlen((char *)pattern); 17960201Sbostic 18055847Sbostic /* do the mallocs early so failure handling is easy */ 18160201Sbostic g = (struct re_guts *)malloc(sizeof(struct re_guts) + 18260201Sbostic (NC-1)*sizeof(cat_t)); 18355847Sbostic if (g == NULL) 18455847Sbostic return(REG_ESPACE); 18560201Sbostic p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 18655847Sbostic p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 18755847Sbostic p->slen = 0; 18855847Sbostic if (p->strip == NULL) { 18955847Sbostic free((char *)g); 19055847Sbostic return(REG_ESPACE); 19155847Sbostic } 19255847Sbostic 19355847Sbostic /* set things up */ 19455847Sbostic p->g = g; 19560201Sbostic p->next = (char *)pattern; /* convenience; we do not modify it */ 19660201Sbostic p->end = p->next + len; 19755847Sbostic p->error = 0; 19855847Sbostic p->ncsalloc = 0; 19955847Sbostic for (i = 0; i < NPAREN; i++) { 20055847Sbostic p->pbegin[i] = 0; 20155847Sbostic p->pend[i] = 0; 20255847Sbostic } 20360201Sbostic g->csetsize = NC; 20455847Sbostic g->sets = NULL; 20555847Sbostic g->setbits = NULL; 20655847Sbostic g->ncsets = 0; 20755847Sbostic g->cflags = cflags; 20855847Sbostic g->iflags = 0; 20960201Sbostic g->nbol = 0; 21060201Sbostic g->neol = 0; 21155847Sbostic g->must = NULL; 21255847Sbostic g->mlen = 0; 21355847Sbostic g->nsub = 0; 21455847Sbostic g->ncategories = 1; /* category 0 is "everything else" */ 21560610Sbostic g->categories = &g->catspace[-(CHAR_MIN)]; 21660201Sbostic (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 21755847Sbostic g->backrefs = 0; 21855847Sbostic 21955847Sbostic /* do it */ 22055847Sbostic EMIT(OEND, 0); 22155847Sbostic g->firststate = THERE(); 22255847Sbostic if (cflags®_EXTENDED) 22360201Sbostic p_ere(p, OUT); 22460201Sbostic else if (cflags®_NOSPEC) 22560201Sbostic p_str(p); 22655847Sbostic else 22760201Sbostic p_bre(p, OUT, OUT); 22855847Sbostic EMIT(OEND, 0); 22955847Sbostic g->laststate = THERE(); 23055847Sbostic 23155847Sbostic /* tidy up loose ends and fill things in */ 23255847Sbostic categorize(p, g); 23355847Sbostic stripsnug(p, g); 23455847Sbostic findmust(p, g); 23555847Sbostic g->nplus = pluscount(p, g); 23655847Sbostic g->magic = MAGIC2; 23755847Sbostic preg->re_nsub = g->nsub; 23855847Sbostic preg->re_g = g; 23955847Sbostic preg->re_magic = MAGIC1; 24056355Sbostic #ifndef REDEBUG 24155847Sbostic /* not debugging, so can't rely on the assert() in regexec() */ 24255847Sbostic if (g->iflags&BAD) 24355847Sbostic SETERROR(REG_ASSERT); 24455847Sbostic #endif 24555847Sbostic 24655847Sbostic /* win or lose, we're done */ 24755847Sbostic if (p->error != 0) /* lose */ 24855847Sbostic regfree(preg); 24955847Sbostic return(p->error); 25055847Sbostic } 25155847Sbostic 25255847Sbostic /* 25355847Sbostic - p_ere - ERE parser top level, concatenation and alternation 25460201Sbostic == static void p_ere(register struct parse *p, int stop); 25555847Sbostic */ 25655847Sbostic static void 25755847Sbostic p_ere(p, stop) 25855847Sbostic register struct parse *p; 25960201Sbostic int stop; /* character this ERE should end at */ 26055847Sbostic { 26160201Sbostic register char c; 26255847Sbostic register sopno prevback; 26355847Sbostic register sopno prevfwd; 26455847Sbostic register sopno conc; 26555847Sbostic register int first = 1; /* is this the first alternative? */ 26655847Sbostic 26755847Sbostic for (;;) { 26855847Sbostic /* do a bunch of concatenated expressions */ 26955847Sbostic conc = HERE(); 27060201Sbostic while (MORE() && (c = PEEK()) != '|' && c != stop) 27155847Sbostic p_ere_exp(p); 27255847Sbostic REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 27355847Sbostic 27455847Sbostic if (!EAT('|')) 27555847Sbostic break; /* NOTE BREAK OUT */ 27655847Sbostic 27755847Sbostic if (first) { 27855847Sbostic INSERT(OCH_, conc); /* offset is wrong */ 27955847Sbostic prevfwd = conc; 28055847Sbostic prevback = conc; 28155847Sbostic first = 0; 28255847Sbostic } 28360201Sbostic ASTERN(OOR1, prevback); 28455847Sbostic prevback = THERE(); 28560201Sbostic AHEAD(prevfwd); /* fix previous offset */ 28655847Sbostic prevfwd = HERE(); 28755847Sbostic EMIT(OOR2, 0); /* offset is very wrong */ 28855847Sbostic } 28955847Sbostic 29055847Sbostic if (!first) { /* tail-end fixups */ 29160201Sbostic AHEAD(prevfwd); 29260201Sbostic ASTERN(O_CH, prevback); 29355847Sbostic } 29455847Sbostic 29560201Sbostic assert(!MORE() || SEE(stop)); 29655847Sbostic } 29755847Sbostic 29855847Sbostic /* 29955847Sbostic - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 30060201Sbostic == static void p_ere_exp(register struct parse *p); 30155847Sbostic */ 30255847Sbostic static void 30355847Sbostic p_ere_exp(p) 30455847Sbostic register struct parse *p; 30555847Sbostic { 30660201Sbostic register char c; 30755847Sbostic register sopno pos; 30855847Sbostic register int count; 30955847Sbostic register int count2; 31055847Sbostic register sopno subno; 31155847Sbostic int wascaret = 0; 31255847Sbostic 31360201Sbostic assert(MORE()); /* caller should have ensured this */ 31455847Sbostic c = GETNEXT(); 31555847Sbostic 31655847Sbostic pos = HERE(); 31755847Sbostic switch (c) { 31855847Sbostic case '(': 31960201Sbostic REQUIRE(MORE(), REG_EPAREN); 32055847Sbostic p->g->nsub++; 32155847Sbostic subno = p->g->nsub; 32255847Sbostic if (subno < NPAREN) 32355847Sbostic p->pbegin[subno] = HERE(); 32455847Sbostic EMIT(OLPAREN, subno); 32555847Sbostic if (!SEE(')')) 32655847Sbostic p_ere(p, ')'); 32755847Sbostic if (subno < NPAREN) { 32855847Sbostic p->pend[subno] = HERE(); 32955847Sbostic assert(p->pend[subno] != 0); 33055847Sbostic } 33155847Sbostic EMIT(ORPAREN, subno); 33255847Sbostic MUSTEAT(')', REG_EPAREN); 33355847Sbostic break; 33455847Sbostic #ifndef POSIX_MISTAKE 33555847Sbostic case ')': /* happens only if no current unmatched ( */ 33655847Sbostic /* 33755847Sbostic * You may ask, why the ifndef? Because I didn't notice 33855847Sbostic * this until slightly too late for 1003.2, and none of the 33955847Sbostic * other 1003.2 regular-expression reviewers noticed it at 34055847Sbostic * all. So an unmatched ) is legal POSIX, at least until 34155847Sbostic * we can get it fixed. 34255847Sbostic */ 34355847Sbostic SETERROR(REG_EPAREN); 34455847Sbostic break; 34555847Sbostic #endif 34655847Sbostic case '^': 34755847Sbostic EMIT(OBOL, 0); 34855847Sbostic p->g->iflags |= USEBOL; 34960201Sbostic p->g->nbol++; 35055847Sbostic wascaret = 1; 35155847Sbostic break; 35255847Sbostic case '$': 35355847Sbostic EMIT(OEOL, 0); 35455847Sbostic p->g->iflags |= USEEOL; 35560201Sbostic p->g->neol++; 35655847Sbostic break; 35755847Sbostic case '|': 35855847Sbostic SETERROR(REG_EMPTY); 35955847Sbostic break; 36055847Sbostic case '*': 36155847Sbostic case '+': 36255847Sbostic case '?': 36355847Sbostic SETERROR(REG_BADRPT); 36455847Sbostic break; 36555847Sbostic case '.': 36655847Sbostic if (p->g->cflags®_NEWLINE) 36755847Sbostic nonnewline(p); 36855847Sbostic else 36955847Sbostic EMIT(OANY, 0); 37055847Sbostic break; 37155847Sbostic case '[': 37255847Sbostic p_bracket(p); 37355847Sbostic break; 37455847Sbostic case '\\': 37560201Sbostic REQUIRE(MORE(), REG_EESCAPE); 37655847Sbostic c = GETNEXT(); 37760201Sbostic ordinary(p, c); 37855847Sbostic break; 37955847Sbostic case '{': /* okay as ordinary except if digit follows */ 38060201Sbostic REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 38155847Sbostic /* FALLTHROUGH */ 38255847Sbostic default: 38355847Sbostic ordinary(p, c); 38455847Sbostic break; 38555847Sbostic } 38655847Sbostic 38760201Sbostic if (!MORE()) 38860201Sbostic return; 38955847Sbostic c = PEEK(); 39060201Sbostic /* we call { a repetition if followed by a digit */ 39160201Sbostic if (!( c == '*' || c == '+' || c == '?' || 39260201Sbostic (c == '{' && MORE2() && isdigit(PEEK2())) )) 39355847Sbostic return; /* no repetition, we're done */ 39455847Sbostic NEXT(); 39555847Sbostic 39655847Sbostic REQUIRE(!wascaret, REG_BADRPT); 39755847Sbostic switch (c) { 39855847Sbostic case '*': /* implemented as +? */ 39966381Sbostic /* this case does not require the (y|) trick, noKLUDGE */ 40055847Sbostic INSERT(OPLUS_, pos); 40160201Sbostic ASTERN(O_PLUS, pos); 40255847Sbostic INSERT(OQUEST_, pos); 40360201Sbostic ASTERN(O_QUEST, pos); 40455847Sbostic break; 40555847Sbostic case '+': 40655847Sbostic INSERT(OPLUS_, pos); 40760201Sbostic ASTERN(O_PLUS, pos); 40855847Sbostic break; 40955847Sbostic case '?': 41066381Sbostic /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 41166381Sbostic INSERT(OCH_, pos); /* offset slightly wrong */ 41266381Sbostic ASTERN(OOR1, pos); /* this one's right */ 41366381Sbostic AHEAD(pos); /* fix the OCH_ */ 41466381Sbostic EMIT(OOR2, 0); /* offset very wrong... */ 41566381Sbostic AHEAD(THERE()); /* ...so fix it */ 41666381Sbostic ASTERN(O_CH, THERETHERE()); 41755847Sbostic break; 41855847Sbostic case '{': 41955847Sbostic count = p_count(p); 42055847Sbostic if (EAT(',')) { 42155847Sbostic if (isdigit(PEEK())) { 42255847Sbostic count2 = p_count(p); 42355847Sbostic REQUIRE(count <= count2, REG_BADBR); 42455847Sbostic } else /* single number with comma */ 42555847Sbostic count2 = INFINITY; 42655847Sbostic } else /* just a single number */ 42755847Sbostic count2 = count; 42855847Sbostic repeat(p, pos, count, count2); 42955847Sbostic if (!EAT('}')) { /* error heuristics */ 43060201Sbostic while (MORE() && PEEK() != '}') 43155847Sbostic NEXT(); 43260201Sbostic REQUIRE(MORE(), REG_EBRACE); 43360201Sbostic SETERROR(REG_BADBR); 43455847Sbostic } 43555847Sbostic break; 43655847Sbostic } 43755847Sbostic 43860201Sbostic if (!MORE()) 43960201Sbostic return; 44055847Sbostic c = PEEK(); 44160201Sbostic if (!( c == '*' || c == '+' || c == '?' || 44260201Sbostic (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 44360201Sbostic return; 44460201Sbostic SETERROR(REG_BADRPT); 44555847Sbostic } 44655847Sbostic 44755847Sbostic /* 44860201Sbostic - p_str - string (no metacharacters) "parser" 44960201Sbostic == static void p_str(register struct parse *p); 45060201Sbostic */ 45160201Sbostic static void 45260201Sbostic p_str(p) 45360201Sbostic register struct parse *p; 45460201Sbostic { 45560201Sbostic REQUIRE(MORE(), REG_EMPTY); 45660201Sbostic while (MORE()) 45760201Sbostic ordinary(p, GETNEXT()); 45860201Sbostic } 45960201Sbostic 46060201Sbostic /* 46155847Sbostic - p_bre - BRE parser top level, anchoring and concatenation 46260201Sbostic == static void p_bre(register struct parse *p, register int end1, \ 46360201Sbostic == register int end2); 46460201Sbostic * Giving end1 as OUT essentially eliminates the end1/end2 check. 46555847Sbostic * 46656355Sbostic * This implementation is a bit of a kludge, in that a trailing $ is first 46756355Sbostic * taken as an ordinary character and then revised to be an anchor. The 46856355Sbostic * only undesirable side effect is that '$' gets included as a character 46956355Sbostic * category in such cases. This is fairly harmless; not worth fixing. 47060201Sbostic * The amount of lookahead needed to avoid this kludge is excessive. 47155847Sbostic */ 47255847Sbostic static void 47355847Sbostic p_bre(p, end1, end2) 47455847Sbostic register struct parse *p; 47560201Sbostic register int end1; /* first terminating character */ 47660201Sbostic register int end2; /* second terminating character */ 47755847Sbostic { 47855847Sbostic register sopno start = HERE(); 47955847Sbostic register int first = 1; /* first subexpression? */ 48056355Sbostic register int wasdollar = 0; 48155847Sbostic 48255847Sbostic if (EAT('^')) { 48355847Sbostic EMIT(OBOL, 0); 48455847Sbostic p->g->iflags |= USEBOL; 48560201Sbostic p->g->nbol++; 48655847Sbostic } 48760201Sbostic while (MORE() && !SEETWO(end1, end2)) { 48855847Sbostic wasdollar = p_simp_re(p, first); 48955847Sbostic first = 0; 49055847Sbostic } 49155847Sbostic if (wasdollar) { /* oops, that was a trailing anchor */ 49255847Sbostic DROP(1); 49355847Sbostic EMIT(OEOL, 0); 49455847Sbostic p->g->iflags |= USEEOL; 49560201Sbostic p->g->neol++; 49655847Sbostic } 49755847Sbostic 49855847Sbostic REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 49955847Sbostic } 50055847Sbostic 50155847Sbostic /* 50255847Sbostic - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 50360201Sbostic == static int p_simp_re(register struct parse *p, int starordinary); 50455847Sbostic */ 50555847Sbostic static int /* was the simple RE an unbackslashed $? */ 50655847Sbostic p_simp_re(p, starordinary) 50755847Sbostic register struct parse *p; 50855847Sbostic int starordinary; /* is a leading * an ordinary character? */ 50955847Sbostic { 51055847Sbostic register int c; 51155847Sbostic register int count; 51255847Sbostic register int count2; 51355847Sbostic register sopno pos; 51455847Sbostic register int i; 51555847Sbostic register sopno subno; 51655847Sbostic # define BACKSL (1<<CHAR_BIT) 51755847Sbostic 51855847Sbostic pos = HERE(); /* repetion op, if any, covers from here */ 51955847Sbostic 52060201Sbostic assert(MORE()); /* caller should have ensured this */ 52155847Sbostic c = GETNEXT(); 52260201Sbostic if (c == '\\') { 52360201Sbostic REQUIRE(MORE(), REG_EESCAPE); 52460201Sbostic c = BACKSL | (unsigned char)GETNEXT(); 52560201Sbostic } 52655847Sbostic switch (c) { 52755847Sbostic case '.': 52855847Sbostic if (p->g->cflags®_NEWLINE) 52955847Sbostic nonnewline(p); 53055847Sbostic else 53155847Sbostic EMIT(OANY, 0); 53255847Sbostic break; 53355847Sbostic case '[': 53455847Sbostic p_bracket(p); 53555847Sbostic break; 53655847Sbostic case BACKSL|'{': 53755847Sbostic SETERROR(REG_BADRPT); 53855847Sbostic break; 53955847Sbostic case BACKSL|'(': 54055847Sbostic p->g->nsub++; 54155847Sbostic subno = p->g->nsub; 54255847Sbostic if (subno < NPAREN) 54355847Sbostic p->pbegin[subno] = HERE(); 54455847Sbostic EMIT(OLPAREN, subno); 54560201Sbostic /* the MORE here is an error heuristic */ 54660201Sbostic if (MORE() && !SEETWO('\\', ')')) 54755847Sbostic p_bre(p, '\\', ')'); 54855847Sbostic if (subno < NPAREN) { 54955847Sbostic p->pend[subno] = HERE(); 55055847Sbostic assert(p->pend[subno] != 0); 55155847Sbostic } 55255847Sbostic EMIT(ORPAREN, subno); 55355847Sbostic REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 55455847Sbostic break; 55555847Sbostic case BACKSL|')': /* should not get here -- must be user */ 55655847Sbostic case BACKSL|'}': 55755847Sbostic SETERROR(REG_EPAREN); 55855847Sbostic break; 55955847Sbostic case BACKSL|'1': 56055847Sbostic case BACKSL|'2': 56155847Sbostic case BACKSL|'3': 56255847Sbostic case BACKSL|'4': 56355847Sbostic case BACKSL|'5': 56455847Sbostic case BACKSL|'6': 56555847Sbostic case BACKSL|'7': 56655847Sbostic case BACKSL|'8': 56755847Sbostic case BACKSL|'9': 56855847Sbostic i = (c&~BACKSL) - '0'; 56955847Sbostic assert(i < NPAREN); 57055847Sbostic if (p->pend[i] != 0) { 57155847Sbostic assert(i <= p->g->nsub); 57255847Sbostic EMIT(OBACK_, i); 57355847Sbostic assert(p->pbegin[i] != 0); 57455847Sbostic assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 57555847Sbostic assert(OP(p->strip[p->pend[i]]) == ORPAREN); 57655847Sbostic (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 57755847Sbostic EMIT(O_BACK, i); 57855847Sbostic } else 57955847Sbostic SETERROR(REG_ESUBREG); 58055847Sbostic p->g->backrefs = 1; 58155847Sbostic break; 58255847Sbostic case '*': 58355847Sbostic REQUIRE(starordinary, REG_BADRPT); 58455847Sbostic /* FALLTHROUGH */ 58555847Sbostic default: 58660201Sbostic ordinary(p, c &~ BACKSL); 58755847Sbostic break; 58855847Sbostic } 58955847Sbostic 59055847Sbostic if (EAT('*')) { /* implemented as +? */ 59166381Sbostic /* this case does not require the (y|) trick, noKLUDGE */ 59255847Sbostic INSERT(OPLUS_, pos); 59360201Sbostic ASTERN(O_PLUS, pos); 59455847Sbostic INSERT(OQUEST_, pos); 59560201Sbostic ASTERN(O_QUEST, pos); 59655847Sbostic } else if (EATTWO('\\', '{')) { 59755847Sbostic count = p_count(p); 59855847Sbostic if (EAT(',')) { 59960201Sbostic if (MORE() && isdigit(PEEK())) { 60055847Sbostic count2 = p_count(p); 60155847Sbostic REQUIRE(count <= count2, REG_BADBR); 60255847Sbostic } else /* single number with comma */ 60355847Sbostic count2 = INFINITY; 60455847Sbostic } else /* just a single number */ 60555847Sbostic count2 = count; 60655847Sbostic repeat(p, pos, count, count2); 60755847Sbostic if (!EATTWO('\\', '}')) { /* error heuristics */ 60860201Sbostic while (MORE() && !SEETWO('\\', '}')) 60955847Sbostic NEXT(); 61060201Sbostic REQUIRE(MORE(), REG_EBRACE); 61160201Sbostic SETERROR(REG_BADBR); 61255847Sbostic } 61360201Sbostic } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 61455847Sbostic return(1); 61555847Sbostic 61655847Sbostic return(0); 61755847Sbostic } 61855847Sbostic 61955847Sbostic /* 62055847Sbostic - p_count - parse a repetition count 62160201Sbostic == static int p_count(register struct parse *p); 62255847Sbostic */ 62355847Sbostic static int /* the value */ 62455847Sbostic p_count(p) 62555847Sbostic register struct parse *p; 62655847Sbostic { 62755847Sbostic register int count = 0; 62855847Sbostic register int ndigits = 0; 62955847Sbostic 63060201Sbostic while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 63155847Sbostic count = count*10 + (GETNEXT() - '0'); 63255847Sbostic ndigits++; 63355847Sbostic } 63455847Sbostic 63555847Sbostic REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 63655847Sbostic return(count); 63755847Sbostic } 63855847Sbostic 63955847Sbostic /* 64055847Sbostic - p_bracket - parse a bracketed character list 64160201Sbostic == static void p_bracket(register struct parse *p); 64255847Sbostic * 64355847Sbostic * Note a significant property of this code: if the allocset() did SETERROR, 64455847Sbostic * no set operations are done. 64555847Sbostic */ 64655847Sbostic static void 64755847Sbostic p_bracket(p) 64855847Sbostic register struct parse *p; 64955847Sbostic { 65060201Sbostic register char c; 65155847Sbostic register cset *cs = allocset(p); 65255847Sbostic register int invert = 0; 65355847Sbostic 65460201Sbostic /* Dept of Truly Sickening Special-Case Kludges */ 65560201Sbostic if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 65660201Sbostic EMIT(OBOW, 0); 65760201Sbostic NEXTn(6); 65860201Sbostic return; 65960201Sbostic } 66060201Sbostic if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 66160201Sbostic EMIT(OEOW, 0); 66260201Sbostic NEXTn(6); 66360201Sbostic return; 66460201Sbostic } 66560201Sbostic 66655847Sbostic if (EAT('^')) 66755847Sbostic invert++; /* make note to invert set at end */ 66855847Sbostic if (EAT(']')) 66955847Sbostic CHadd(cs, ']'); 67056618Sbostic else if (EAT('-')) 67156618Sbostic CHadd(cs, '-'); 67260201Sbostic while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 67355847Sbostic p_b_term(p, cs); 67455847Sbostic if (EAT('-')) 67555847Sbostic CHadd(cs, '-'); 67655847Sbostic MUSTEAT(']', REG_EBRACK); 67755847Sbostic 67860201Sbostic if (p->error != 0) /* don't mess things up further */ 67960201Sbostic return; 68060201Sbostic 68160201Sbostic if (p->g->cflags®_ICASE) { 68255847Sbostic register int i; 68360201Sbostic register int ci; 68455847Sbostic 68555847Sbostic for (i = p->g->csetsize - 1; i >= 0; i--) 68660201Sbostic if (CHIN(cs, i) && isalpha(i)) { 68760201Sbostic ci = othercase(i); 68860201Sbostic if (ci != i) 68960201Sbostic CHadd(cs, ci); 69060201Sbostic } 69160201Sbostic if (cs->multis != NULL) 69260201Sbostic mccase(p, cs); 69360201Sbostic } 69460201Sbostic if (invert) { 69560201Sbostic register int i; 69660201Sbostic 69760201Sbostic for (i = p->g->csetsize - 1; i >= 0; i--) 69855847Sbostic if (CHIN(cs, i)) 69955847Sbostic CHsub(cs, i); 70055847Sbostic else 70155847Sbostic CHadd(cs, i); 70255847Sbostic if (p->g->cflags®_NEWLINE) 70355847Sbostic CHsub(cs, '\n'); 70455847Sbostic if (cs->multis != NULL) 70555847Sbostic mcinvert(p, cs); 70655847Sbostic } 70760201Sbostic 70855847Sbostic assert(cs->multis == NULL); /* xxx */ 70960201Sbostic 71060201Sbostic if (nch(p, cs) == 1) { /* optimize singleton sets */ 71160201Sbostic ordinary(p, firstch(p, cs)); 71260201Sbostic freeset(p, cs); 71360201Sbostic } else 71460201Sbostic EMIT(OANYOF, freezeset(p, cs)); 71555847Sbostic } 71655847Sbostic 71755847Sbostic /* 71855847Sbostic - p_b_term - parse one term of a bracketed character list 71960201Sbostic == static void p_b_term(register struct parse *p, register cset *cs); 72055847Sbostic */ 72155847Sbostic static void 72255847Sbostic p_b_term(p, cs) 72355847Sbostic register struct parse *p; 72455847Sbostic register cset *cs; 72555847Sbostic { 72660201Sbostic register char c; 72760201Sbostic register char start, finish; 72855847Sbostic register int i; 72955847Sbostic 73055847Sbostic /* classify what we've got */ 73160201Sbostic switch ((MORE()) ? PEEK() : '\0') { 73255847Sbostic case '[': 73360201Sbostic c = (MORE2()) ? PEEK2() : '\0'; 73455847Sbostic break; 73555847Sbostic case '-': 73655847Sbostic SETERROR(REG_ERANGE); 73755847Sbostic return; /* NOTE RETURN */ 73855847Sbostic break; 73955847Sbostic default: 74055847Sbostic c = '\0'; 74155847Sbostic break; 74255847Sbostic } 74355847Sbostic 74455847Sbostic switch (c) { 74555847Sbostic case ':': /* character class */ 74655847Sbostic NEXT2(); 74760201Sbostic REQUIRE(MORE(), REG_EBRACK); 74855847Sbostic c = PEEK(); 74955847Sbostic REQUIRE(c != '-' && c != ']', REG_ECTYPE); 75055847Sbostic p_b_cclass(p, cs); 75160201Sbostic REQUIRE(MORE(), REG_EBRACK); 75255847Sbostic REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 75355847Sbostic break; 75455847Sbostic case '=': /* equivalence class */ 75555847Sbostic NEXT2(); 75660201Sbostic REQUIRE(MORE(), REG_EBRACK); 75755847Sbostic c = PEEK(); 75855847Sbostic REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 75955847Sbostic p_b_eclass(p, cs); 76060201Sbostic REQUIRE(MORE(), REG_EBRACK); 76155847Sbostic REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 76255847Sbostic break; 76355847Sbostic default: /* symbol, ordinary character, or range */ 76455847Sbostic /* xxx revision needed for multichar stuff */ 76555847Sbostic start = p_b_symbol(p); 76660201Sbostic if (SEE('-') && MORE2() && PEEK2() != ']') { 76755847Sbostic /* range */ 76855847Sbostic NEXT(); 76955847Sbostic if (EAT('-')) 77055847Sbostic finish = '-'; 77155847Sbostic else 77255847Sbostic finish = p_b_symbol(p); 77355847Sbostic } else 77455847Sbostic finish = start; 77560201Sbostic /* xxx what about signed chars here... */ 77655847Sbostic REQUIRE(start <= finish, REG_ERANGE); 77760201Sbostic for (i = start; i <= finish; i++) 77855847Sbostic CHadd(cs, i); 77955847Sbostic break; 78055847Sbostic } 78155847Sbostic } 78255847Sbostic 78355847Sbostic /* 78455847Sbostic - p_b_cclass - parse a character-class name and deal with it 78560201Sbostic == static void p_b_cclass(register struct parse *p, register cset *cs); 78655847Sbostic */ 78755847Sbostic static void 78855847Sbostic p_b_cclass(p, cs) 78955847Sbostic register struct parse *p; 79055847Sbostic register cset *cs; 79155847Sbostic { 79260201Sbostic register char *sp = p->next; 79355847Sbostic register struct cclass *cp; 79460201Sbostic register size_t len; 79560201Sbostic register char *u; 79660201Sbostic register char c; 79755847Sbostic 79860201Sbostic while (MORE() && isalpha(PEEK())) 79960201Sbostic NEXT(); 80060201Sbostic len = p->next - sp; 80155847Sbostic for (cp = cclasses; cp->name != NULL; cp++) 80260201Sbostic if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 80355847Sbostic break; 80455847Sbostic if (cp->name == NULL) { 80555847Sbostic /* oops, didn't find it */ 80655847Sbostic SETERROR(REG_ECTYPE); 80755847Sbostic return; 80855847Sbostic } 80955847Sbostic 81060201Sbostic u = cp->chars; 81155847Sbostic while ((c = *u++) != '\0') 81255847Sbostic CHadd(cs, c); 81360201Sbostic for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 81466362Sbostic MCadd(p, cs, u); 81555847Sbostic } 81655847Sbostic 81755847Sbostic /* 81855847Sbostic - p_b_eclass - parse an equivalence-class name and deal with it 81960201Sbostic == static void p_b_eclass(register struct parse *p, register cset *cs); 82055847Sbostic * 82155847Sbostic * This implementation is incomplete. xxx 82255847Sbostic */ 82355847Sbostic static void 82455847Sbostic p_b_eclass(p, cs) 82555847Sbostic register struct parse *p; 82655847Sbostic register cset *cs; 82755847Sbostic { 82860201Sbostic register char c; 82955847Sbostic 83055847Sbostic c = p_b_coll_elem(p, '='); 83155847Sbostic CHadd(cs, c); 83255847Sbostic } 83355847Sbostic 83455847Sbostic /* 83555847Sbostic - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 83660201Sbostic == static char p_b_symbol(register struct parse *p); 83755847Sbostic */ 83860201Sbostic static char /* value of symbol */ 83955847Sbostic p_b_symbol(p) 84055847Sbostic register struct parse *p; 84155847Sbostic { 84260201Sbostic register char value; 84355847Sbostic 84460201Sbostic REQUIRE(MORE(), REG_EBRACK); 84560201Sbostic if (!EATTWO('[', '.')) 84655847Sbostic return(GETNEXT()); 84755847Sbostic 84855847Sbostic /* collating symbol */ 84955847Sbostic value = p_b_coll_elem(p, '.'); 85055847Sbostic REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 85155847Sbostic return(value); 85255847Sbostic } 85355847Sbostic 85455847Sbostic /* 85555847Sbostic - p_b_coll_elem - parse a collating-element name and look it up 85660201Sbostic == static char p_b_coll_elem(register struct parse *p, int endc); 85755847Sbostic */ 85860201Sbostic static char /* value of collating element */ 85955847Sbostic p_b_coll_elem(p, endc) 86055847Sbostic register struct parse *p; 86160201Sbostic int endc; /* name ended by endc,']' */ 86255847Sbostic { 86360201Sbostic register char *sp = p->next; 86455847Sbostic register struct cname *cp; 86555847Sbostic register int len; 86660201Sbostic register char c; 86755847Sbostic 86860201Sbostic while (MORE() && !SEETWO(endc, ']')) 86955847Sbostic NEXT(); 87060201Sbostic if (!MORE()) { 87155847Sbostic SETERROR(REG_EBRACK); 87255847Sbostic return(0); 87355847Sbostic } 87455847Sbostic len = p->next - sp; 87555847Sbostic for (cp = cnames; cp->name != NULL; cp++) 87660201Sbostic if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 87755847Sbostic return(cp->code); /* known name */ 87855847Sbostic if (len == 1) 87955847Sbostic return(*sp); /* single character */ 88055847Sbostic SETERROR(REG_ECOLLATE); /* neither */ 88155847Sbostic return(0); 88255847Sbostic } 88355847Sbostic 88455847Sbostic /* 88555847Sbostic - othercase - return the case counterpart of an alphabetic 88660201Sbostic == static char othercase(int ch); 88755847Sbostic */ 88860201Sbostic static char /* if no counterpart, return ch */ 88955847Sbostic othercase(ch) 89060201Sbostic int ch; 89155847Sbostic { 89255847Sbostic assert(isalpha(ch)); 89355847Sbostic if (isupper(ch)) 89455847Sbostic return(tolower(ch)); 89555847Sbostic else if (islower(ch)) 89655847Sbostic return(toupper(ch)); 89755847Sbostic else /* peculiar, but could happen */ 89855847Sbostic return(ch); 89955847Sbostic } 90055847Sbostic 90155847Sbostic /* 90260201Sbostic - bothcases - emit a dualcase version of a two-case character 90360201Sbostic == static void bothcases(register struct parse *p, int ch); 90456355Sbostic * 90555847Sbostic * Boy, is this implementation ever a kludge... 90655847Sbostic */ 90755847Sbostic static void 90855847Sbostic bothcases(p, ch) 90955847Sbostic register struct parse *p; 91060201Sbostic int ch; 91155847Sbostic { 91260201Sbostic register char *oldnext = p->next; 91360201Sbostic register char *oldend = p->end; 91460201Sbostic char bracket[3]; 91555847Sbostic 91660201Sbostic assert(othercase(ch) != ch); /* p_bracket() would recurse */ 91755847Sbostic p->next = bracket; 91860201Sbostic p->end = bracket+2; 91955847Sbostic bracket[0] = ch; 92055847Sbostic bracket[1] = ']'; 92155847Sbostic bracket[2] = '\0'; 92255847Sbostic p_bracket(p); 92355847Sbostic assert(p->next == bracket+2); 92455847Sbostic p->next = oldnext; 92560201Sbostic p->end = oldend; 92655847Sbostic } 92755847Sbostic 92855847Sbostic /* 92955847Sbostic - ordinary - emit an ordinary character 93060201Sbostic == static void ordinary(register struct parse *p, register int ch); 93155847Sbostic */ 93255847Sbostic static void 93355847Sbostic ordinary(p, ch) 93455847Sbostic register struct parse *p; 93560201Sbostic register int ch; 93655847Sbostic { 93760201Sbostic register cat_t *cap = p->g->categories; 93855847Sbostic 93960201Sbostic if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 94055847Sbostic bothcases(p, ch); 94160201Sbostic else { 94260201Sbostic EMIT(OCHAR, (unsigned char)ch); 94360201Sbostic if (cap[ch] == 0) 94460201Sbostic cap[ch] = p->g->ncategories++; 94555847Sbostic } 94655847Sbostic } 94755847Sbostic 94855847Sbostic /* 94955847Sbostic - nonnewline - emit REG_NEWLINE version of OANY 95060201Sbostic == static void nonnewline(register struct parse *p); 95156355Sbostic * 95255847Sbostic * Boy, is this implementation ever a kludge... 95355847Sbostic */ 95455847Sbostic static void 95555847Sbostic nonnewline(p) 95655847Sbostic register struct parse *p; 95755847Sbostic { 95860201Sbostic register char *oldnext = p->next; 95960201Sbostic register char *oldend = p->end; 96060201Sbostic char bracket[4]; 96155847Sbostic 96255847Sbostic p->next = bracket; 96360201Sbostic p->end = bracket+3; 96455847Sbostic bracket[0] = '^'; 96555847Sbostic bracket[1] = '\n'; 96655847Sbostic bracket[2] = ']'; 96755847Sbostic bracket[3] = '\0'; 96855847Sbostic p_bracket(p); 96955847Sbostic assert(p->next == bracket+3); 97055847Sbostic p->next = oldnext; 97160201Sbostic p->end = oldend; 97255847Sbostic } 97355847Sbostic 97455847Sbostic /* 97555847Sbostic - repeat - generate code for a bounded repetition, recursively if needed 97660201Sbostic == static void repeat(register struct parse *p, sopno start, int from, int to); 97755847Sbostic */ 97855847Sbostic static void 97955847Sbostic repeat(p, start, from, to) 98055847Sbostic register struct parse *p; 98155847Sbostic sopno start; /* operand from here to end of strip */ 98255847Sbostic int from; /* repeated from this number */ 98355847Sbostic int to; /* to this number of times (maybe INFINITY) */ 98455847Sbostic { 98555847Sbostic register sopno finish = HERE(); 98655847Sbostic # define N 2 98755847Sbostic # define INF 3 98855847Sbostic # define REP(f, t) ((f)*8 + (t)) 98955847Sbostic # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 99055847Sbostic register sopno copy; 99155847Sbostic 99255847Sbostic if (p->error != 0) /* head off possible runaway recursion */ 99355847Sbostic return; 99455847Sbostic 99555847Sbostic assert(from <= to); 99655847Sbostic 99755847Sbostic switch (REP(MAP(from), MAP(to))) { 99855847Sbostic case REP(0, 0): /* must be user doing this */ 99955847Sbostic DROP(finish-start); /* drop the operand */ 100055847Sbostic break; 100155847Sbostic case REP(0, 1): /* as x{1,1}? */ 100255847Sbostic case REP(0, N): /* as x{1,n}? */ 100355847Sbostic case REP(0, INF): /* as x{1,}? */ 100466381Sbostic /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 100566381Sbostic INSERT(OCH_, start); /* offset is wrong... */ 100655847Sbostic repeat(p, start+1, 1, to); 100766381Sbostic ASTERN(OOR1, start); 100860201Sbostic AHEAD(start); /* ... fix it */ 100966381Sbostic EMIT(OOR2, 0); 101066381Sbostic AHEAD(THERE()); 101166381Sbostic ASTERN(O_CH, THERETHERE()); 101255847Sbostic break; 101355847Sbostic case REP(1, 1): /* trivial case */ 101455847Sbostic /* done */ 101555847Sbostic break; 101655847Sbostic case REP(1, N): /* as x?x{1,n-1} */ 101766381Sbostic /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 101866381Sbostic INSERT(OCH_, start); 101966381Sbostic ASTERN(OOR1, start); 102066381Sbostic AHEAD(start); 102166381Sbostic EMIT(OOR2, 0); /* offset very wrong... */ 102266381Sbostic AHEAD(THERE()); /* ...so fix it */ 102366381Sbostic ASTERN(O_CH, THERETHERE()); 102455847Sbostic copy = dupl(p, start+1, finish+1); 102566381Sbostic assert(copy == finish+4); 102655847Sbostic repeat(p, copy, 1, to-1); 102755847Sbostic break; 102855847Sbostic case REP(1, INF): /* as x+ */ 102955847Sbostic INSERT(OPLUS_, start); 103060201Sbostic ASTERN(O_PLUS, start); 103155847Sbostic break; 103255847Sbostic case REP(N, N): /* as xx{m-1,n-1} */ 103355847Sbostic copy = dupl(p, start, finish); 103455847Sbostic repeat(p, copy, from-1, to-1); 103555847Sbostic break; 103655847Sbostic case REP(N, INF): /* as xx{n-1,INF} */ 103755847Sbostic copy = dupl(p, start, finish); 103855847Sbostic repeat(p, copy, from-1, to); 103955847Sbostic break; 104055847Sbostic default: /* "can't happen" */ 104155847Sbostic SETERROR(REG_ASSERT); /* just in case */ 104255847Sbostic break; 104355847Sbostic } 104455847Sbostic } 104555847Sbostic 104655847Sbostic /* 104755847Sbostic - seterr - set an error condition 104860201Sbostic == static int seterr(register struct parse *p, int e); 104955847Sbostic */ 105055847Sbostic static int /* useless but makes type checking happy */ 105155847Sbostic seterr(p, e) 105255847Sbostic register struct parse *p; 105355847Sbostic int e; 105455847Sbostic { 105555847Sbostic if (p->error == 0) /* keep earliest error condition */ 105655847Sbostic p->error = e; 105755847Sbostic p->next = nuls; /* try to bring things to a halt */ 105860201Sbostic p->end = nuls; 105955847Sbostic return(0); /* make the return value well-defined */ 106055847Sbostic } 106155847Sbostic 106255847Sbostic /* 106355847Sbostic - allocset - allocate a set of characters for [] 106460201Sbostic == static cset *allocset(register struct parse *p); 106555847Sbostic */ 106655847Sbostic static cset * 106755847Sbostic allocset(p) 106855847Sbostic register struct parse *p; 106955847Sbostic { 107055847Sbostic register int no = p->g->ncsets++; 107155847Sbostic register size_t nc; 107255847Sbostic register size_t nbytes; 107355847Sbostic register cset *cs; 107455847Sbostic register size_t css = (size_t)p->g->csetsize; 107566362Sbostic register int i; 107655847Sbostic 107755847Sbostic if (no >= p->ncsalloc) { /* need another column of space */ 107855847Sbostic p->ncsalloc += CHAR_BIT; 107955847Sbostic nc = p->ncsalloc; 108055847Sbostic assert(nc % CHAR_BIT == 0); 108155847Sbostic nbytes = nc / CHAR_BIT * css; 108255847Sbostic if (p->g->sets == NULL) 108355847Sbostic p->g->sets = (cset *)malloc(nc * sizeof(cset)); 108455847Sbostic else 108555847Sbostic p->g->sets = (cset *)realloc((char *)p->g->sets, 108655847Sbostic nc * sizeof(cset)); 108755847Sbostic if (p->g->setbits == NULL) 108866362Sbostic p->g->setbits = (uch *)malloc(nbytes); 108966362Sbostic else { 109066362Sbostic p->g->setbits = (uch *)realloc((char *)p->g->setbits, 109155847Sbostic nbytes); 109266362Sbostic /* xxx this isn't right if setbits is now NULL */ 109366362Sbostic for (i = 0; i < no; i++) 109466362Sbostic p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 109566362Sbostic } 109655847Sbostic if (p->g->sets != NULL && p->g->setbits != NULL) 109755847Sbostic (void) memset((char *)p->g->setbits + (nbytes - css), 109855847Sbostic 0, css); 109955847Sbostic else { 110055847Sbostic no = 0; 110155847Sbostic SETERROR(REG_ESPACE); 110255847Sbostic /* caller's responsibility not to do set ops */ 110355847Sbostic } 110455847Sbostic } 110555847Sbostic 110655847Sbostic assert(p->g->sets != NULL); /* xxx */ 110755847Sbostic cs = &p->g->sets[no]; 110855847Sbostic cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 110955847Sbostic cs->mask = 1 << ((no) % CHAR_BIT); 111055847Sbostic cs->hash = 0; 111155847Sbostic cs->smultis = 0; 111255847Sbostic cs->multis = NULL; 111355847Sbostic 111455847Sbostic return(cs); 111555847Sbostic } 111655847Sbostic 111755847Sbostic /* 111860201Sbostic - freeset - free a now-unused set 111960201Sbostic == static void freeset(register struct parse *p, register cset *cs); 112060201Sbostic */ 112160201Sbostic static void 112260201Sbostic freeset(p, cs) 112360201Sbostic register struct parse *p; 112460201Sbostic register cset *cs; 112560201Sbostic { 112660201Sbostic register int i; 112760201Sbostic register cset *top = &p->g->sets[p->g->ncsets]; 112860201Sbostic register size_t css = (size_t)p->g->csetsize; 112960201Sbostic 113060201Sbostic for (i = 0; i < css; i++) 113160201Sbostic CHsub(cs, i); 113260201Sbostic if (cs == top-1) /* recover only the easy case */ 113360201Sbostic p->g->ncsets--; 113460201Sbostic } 113560201Sbostic 113660201Sbostic /* 113755847Sbostic - freezeset - final processing on a set of characters 113860201Sbostic == static int freezeset(register struct parse *p, register cset *cs); 113955847Sbostic * 114055847Sbostic * The main task here is merging identical sets. This is usually a waste 114155847Sbostic * of time (although the hash code minimizes the overhead), but can win 114255847Sbostic * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 114355847Sbostic * is done using addition rather than xor -- all ASCII [aA] sets xor to 114455847Sbostic * the same value! 114555847Sbostic */ 114655847Sbostic static int /* set number */ 114755847Sbostic freezeset(p, cs) 114855847Sbostic register struct parse *p; 114955847Sbostic register cset *cs; 115055847Sbostic { 115166362Sbostic register uch h = cs->hash; 115255847Sbostic register int i; 115355847Sbostic register cset *top = &p->g->sets[p->g->ncsets]; 115455847Sbostic register cset *cs2; 115555847Sbostic register size_t css = (size_t)p->g->csetsize; 115655847Sbostic 115755847Sbostic /* look for an earlier one which is the same */ 115855847Sbostic for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 115955847Sbostic if (cs2->hash == h && cs2 != cs) { 116055847Sbostic /* maybe */ 116155847Sbostic for (i = 0; i < css; i++) 116255847Sbostic if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 116355847Sbostic break; /* no */ 116455847Sbostic if (i == css) 116555847Sbostic break; /* yes */ 116655847Sbostic } 116755847Sbostic 116855847Sbostic if (cs2 < top) { /* found one */ 116960201Sbostic freeset(p, cs); 117055847Sbostic cs = cs2; 117155847Sbostic } 117255847Sbostic 117355847Sbostic return((int)(cs - p->g->sets)); 117455847Sbostic } 117555847Sbostic 117655847Sbostic /* 117760201Sbostic - firstch - return first character in a set (which must have at least one) 117860201Sbostic == static int firstch(register struct parse *p, register cset *cs); 117960201Sbostic */ 118060201Sbostic static int /* character; there is no "none" value */ 118160201Sbostic firstch(p, cs) 118260201Sbostic register struct parse *p; 118360201Sbostic register cset *cs; 118460201Sbostic { 118560201Sbostic register int i; 118660201Sbostic register size_t css = (size_t)p->g->csetsize; 118760201Sbostic 118860201Sbostic for (i = 0; i < css; i++) 118960201Sbostic if (CHIN(cs, i)) 119060201Sbostic return((char)i); 119160201Sbostic assert(never); 119260201Sbostic return(0); /* arbitrary */ 119360201Sbostic } 119460201Sbostic 119560201Sbostic /* 119660201Sbostic - nch - number of characters in a set 119760201Sbostic == static int nch(register struct parse *p, register cset *cs); 119860201Sbostic */ 119960201Sbostic static int 120060201Sbostic nch(p, cs) 120160201Sbostic register struct parse *p; 120260201Sbostic register cset *cs; 120360201Sbostic { 120460201Sbostic register int i; 120560201Sbostic register size_t css = (size_t)p->g->csetsize; 120660201Sbostic register int n = 0; 120760201Sbostic 120860201Sbostic for (i = 0; i < css; i++) 120960201Sbostic if (CHIN(cs, i)) 121060201Sbostic n++; 121160201Sbostic return(n); 121260201Sbostic } 121360201Sbostic 121460201Sbostic /* 121555847Sbostic - mcadd - add a collating element to a cset 121660201Sbostic == static void mcadd(register struct parse *p, register cset *cs, \ 121760201Sbostic == register char *cp); 121855847Sbostic */ 121955847Sbostic static void 122055847Sbostic mcadd(p, cs, cp) 122155847Sbostic register struct parse *p; 122255847Sbostic register cset *cs; 122360201Sbostic register char *cp; 122455847Sbostic { 122555847Sbostic register size_t oldend = cs->smultis; 122655847Sbostic 122760201Sbostic cs->smultis += strlen(cp) + 1; 122855847Sbostic if (cs->multis == NULL) 122960201Sbostic cs->multis = malloc(cs->smultis); 123055847Sbostic else 123160201Sbostic cs->multis = realloc(cs->multis, cs->smultis); 123255847Sbostic if (cs->multis == NULL) { 123355847Sbostic SETERROR(REG_ESPACE); 123455847Sbostic return; 123555847Sbostic } 123655847Sbostic 123760201Sbostic (void) strcpy(cs->multis + oldend - 1, cp); 123855847Sbostic cs->multis[cs->smultis - 1] = '\0'; 123955847Sbostic } 124055847Sbostic 124155847Sbostic /* 124255847Sbostic - mcsub - subtract a collating element from a cset 124360201Sbostic == static void mcsub(register cset *cs, register char *cp); 124455847Sbostic */ 124555847Sbostic static void 124660201Sbostic mcsub(cs, cp) 124755847Sbostic register cset *cs; 124860201Sbostic register char *cp; 124955847Sbostic { 125060201Sbostic register char *fp = mcfind(cs, cp); 125160201Sbostic register size_t len = strlen(fp); 125255847Sbostic 125360201Sbostic assert(fp != NULL); 125460201Sbostic (void) memmove(fp, fp + len + 1, 125555847Sbostic cs->smultis - (fp + len + 1 - cs->multis)); 125655847Sbostic cs->smultis -= len; 125755847Sbostic 125855847Sbostic if (cs->smultis == 0) { 125960201Sbostic free(cs->multis); 126055847Sbostic cs->multis = NULL; 126155847Sbostic return; 126255847Sbostic } 126355847Sbostic 126460201Sbostic cs->multis = realloc(cs->multis, cs->smultis); 126555847Sbostic assert(cs->multis != NULL); 126655847Sbostic } 126755847Sbostic 126855847Sbostic /* 126955847Sbostic - mcin - is a collating element in a cset? 127060201Sbostic == static int mcin(register cset *cs, register char *cp); 127155847Sbostic */ 127255847Sbostic static int 127360201Sbostic mcin(cs, cp) 127455847Sbostic register cset *cs; 127560201Sbostic register char *cp; 127655847Sbostic { 127755847Sbostic return(mcfind(cs, cp) != NULL); 127855847Sbostic } 127955847Sbostic 128055847Sbostic /* 128155847Sbostic - mcfind - find a collating element in a cset 128260201Sbostic == static char *mcfind(register cset *cs, register char *cp); 128355847Sbostic */ 128460201Sbostic static char * 128555847Sbostic mcfind(cs, cp) 128655847Sbostic register cset *cs; 128760201Sbostic register char *cp; 128855847Sbostic { 128960201Sbostic register char *p; 129055847Sbostic 129155847Sbostic if (cs->multis == NULL) 129255847Sbostic return(NULL); 129360201Sbostic for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 129460201Sbostic if (strcmp(cp, p) == 0) 129555847Sbostic return(p); 129655847Sbostic return(NULL); 129755847Sbostic } 129855847Sbostic 129955847Sbostic /* 130055847Sbostic - mcinvert - invert the list of collating elements in a cset 130166362Sbostic == static void mcinvert(register struct parse *p, register cset *cs); 130255847Sbostic * 130355847Sbostic * This would have to know the set of possibilities. Implementation 130455847Sbostic * is deferred. 130555847Sbostic */ 130655847Sbostic static void 130766362Sbostic mcinvert(p, cs) 130866362Sbostic register struct parse *p; 130955847Sbostic register cset *cs; 131055847Sbostic { 131155847Sbostic assert(cs->multis == NULL); /* xxx */ 131255847Sbostic } 131355847Sbostic 131455847Sbostic /* 131560201Sbostic - mccase - add case counterparts of the list of collating elements in a cset 131666362Sbostic == static void mccase(register struct parse *p, register cset *cs); 131760201Sbostic * 131860201Sbostic * This would have to know the set of possibilities. Implementation 131960201Sbostic * is deferred. 132060201Sbostic */ 132160201Sbostic static void 132266362Sbostic mccase(p, cs) 132366362Sbostic register struct parse *p; 132460201Sbostic register cset *cs; 132560201Sbostic { 132660201Sbostic assert(cs->multis == NULL); /* xxx */ 132760201Sbostic } 132860201Sbostic 132960201Sbostic /* 133055847Sbostic - isinsets - is this character in any sets? 133160201Sbostic == static int isinsets(register struct re_guts *g, int c); 133255847Sbostic */ 133355847Sbostic static int /* predicate */ 133455847Sbostic isinsets(g, c) 133555847Sbostic register struct re_guts *g; 133660201Sbostic int c; 133755847Sbostic { 133866362Sbostic register uch *col; 133955847Sbostic register int i; 134055847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 134160201Sbostic register unsigned uc = (unsigned char)c; 134255847Sbostic 134355847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 134460201Sbostic if (col[uc] != 0) 134555847Sbostic return(1); 134655847Sbostic return(0); 134755847Sbostic } 134855847Sbostic 134955847Sbostic /* 135055847Sbostic - samesets - are these two characters in exactly the same sets? 135160201Sbostic == static int samesets(register struct re_guts *g, int c1, int c2); 135255847Sbostic */ 135355847Sbostic static int /* predicate */ 135455847Sbostic samesets(g, c1, c2) 135555847Sbostic register struct re_guts *g; 135660201Sbostic int c1; 135760201Sbostic int c2; 135855847Sbostic { 135966362Sbostic register uch *col; 136055847Sbostic register int i; 136155847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 136260201Sbostic register unsigned uc1 = (unsigned char)c1; 136360201Sbostic register unsigned uc2 = (unsigned char)c2; 136455847Sbostic 136555847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 136660201Sbostic if (col[uc1] != col[uc2]) 136755847Sbostic return(0); 136855847Sbostic return(1); 136955847Sbostic } 137055847Sbostic 137155847Sbostic /* 137255847Sbostic - categorize - sort out character categories 137360201Sbostic == static void categorize(struct parse *p, register struct re_guts *g); 137455847Sbostic */ 137555847Sbostic static void 137655847Sbostic categorize(p, g) 137755847Sbostic struct parse *p; 137855847Sbostic register struct re_guts *g; 137955847Sbostic { 138060201Sbostic register cat_t *cats = g->categories; 138160201Sbostic register int c; 138260201Sbostic register int c2; 138360201Sbostic register cat_t cat; 138455847Sbostic 138555847Sbostic /* avoid making error situations worse */ 138655847Sbostic if (p->error != 0) 138755847Sbostic return; 138855847Sbostic 138960201Sbostic for (c = CHAR_MIN; c <= CHAR_MAX; c++) 139055847Sbostic if (cats[c] == 0 && isinsets(g, c)) { 139155847Sbostic cat = g->ncategories++; 139255847Sbostic cats[c] = cat; 139360201Sbostic for (c2 = c+1; c2 <= CHAR_MAX; c2++) 139455847Sbostic if (cats[c2] == 0 && samesets(g, c, c2)) 139555847Sbostic cats[c2] = cat; 139655847Sbostic } 139755847Sbostic } 139855847Sbostic 139955847Sbostic /* 140055847Sbostic - dupl - emit a duplicate of a bunch of sops 140160201Sbostic == static sopno dupl(register struct parse *p, sopno start, sopno finish); 140255847Sbostic */ 140355847Sbostic static sopno /* start of duplicate */ 140455847Sbostic dupl(p, start, finish) 140555847Sbostic register struct parse *p; 140655847Sbostic sopno start; /* from here */ 140755847Sbostic sopno finish; /* to this less one */ 140855847Sbostic { 140955847Sbostic register sopno ret = HERE(); 141055847Sbostic register sopno len = finish - start; 141155847Sbostic 141255847Sbostic assert(finish >= start); 141355847Sbostic if (len == 0) 141455847Sbostic return(ret); 141555847Sbostic enlarge(p, p->ssize + len); /* this many unexpected additions */ 141655847Sbostic assert(p->ssize >= p->slen + len); 141755847Sbostic (void) memcpy((char *)(p->strip + p->slen), 141855847Sbostic (char *)(p->strip + start), (size_t)len*sizeof(sop)); 141955847Sbostic p->slen += len; 142055847Sbostic return(ret); 142155847Sbostic } 142255847Sbostic 142355847Sbostic /* 142455847Sbostic - doemit - emit a strip operator 142560201Sbostic == static void doemit(register struct parse *p, sop op, size_t opnd); 142655847Sbostic * 142755847Sbostic * It might seem better to implement this as a macro with a function as 142855847Sbostic * hard-case backup, but it's just too big and messy unless there are 142955847Sbostic * some changes to the data structures. Maybe later. 143055847Sbostic */ 143155847Sbostic static void 143255847Sbostic doemit(p, op, opnd) 143355847Sbostic register struct parse *p; 143455847Sbostic sop op; 143555847Sbostic size_t opnd; 143655847Sbostic { 143755847Sbostic /* avoid making error situations worse */ 143855847Sbostic if (p->error != 0) 143955847Sbostic return; 144055847Sbostic 144155847Sbostic /* deal with oversize operands ("can't happen", more or less) */ 144255847Sbostic assert(opnd < 1<<OPSHIFT); 144355847Sbostic 144455847Sbostic /* deal with undersized strip */ 144555847Sbostic if (p->slen >= p->ssize) 144655847Sbostic enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 144755847Sbostic assert(p->slen < p->ssize); 144855847Sbostic 144955847Sbostic /* finally, it's all reduced to the easy case */ 145055847Sbostic p->strip[p->slen++] = SOP(op, opnd); 145155847Sbostic } 145255847Sbostic 145355847Sbostic /* 145455847Sbostic - doinsert - insert a sop into the strip 145560201Sbostic == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 145655847Sbostic */ 145755847Sbostic static void 145855847Sbostic doinsert(p, op, opnd, pos) 145955847Sbostic register struct parse *p; 146055847Sbostic sop op; 146155847Sbostic size_t opnd; 146255847Sbostic sopno pos; 146355847Sbostic { 146455847Sbostic register sopno sn; 146555847Sbostic register sop s; 146655847Sbostic register int i; 146755847Sbostic 146855847Sbostic /* avoid making error situations worse */ 146955847Sbostic if (p->error != 0) 147055847Sbostic return; 147155847Sbostic 147255847Sbostic sn = HERE(); 147355847Sbostic EMIT(op, opnd); /* do checks, ensure space */ 147455847Sbostic assert(HERE() == sn+1); 147555847Sbostic s = p->strip[sn]; 147655847Sbostic 147755847Sbostic /* adjust paren pointers */ 147855847Sbostic assert(pos > 0); 147955847Sbostic for (i = 1; i < NPAREN; i++) { 148055847Sbostic if (p->pbegin[i] >= pos) { 148155847Sbostic p->pbegin[i]++; 148255847Sbostic } 148355847Sbostic if (p->pend[i] >= pos) { 148455847Sbostic p->pend[i]++; 148555847Sbostic } 148655847Sbostic } 148755847Sbostic 148855847Sbostic memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 148955847Sbostic (HERE()-pos-1)*sizeof(sop)); 149055847Sbostic p->strip[pos] = s; 149155847Sbostic } 149255847Sbostic 149355847Sbostic /* 149455847Sbostic - dofwd - complete a forward reference 149560201Sbostic == static void dofwd(register struct parse *p, sopno pos, sop value); 149655847Sbostic */ 149755847Sbostic static void 149855847Sbostic dofwd(p, pos, value) 149955847Sbostic register struct parse *p; 150055847Sbostic register sopno pos; 150155847Sbostic sop value; 150255847Sbostic { 150355847Sbostic /* avoid making error situations worse */ 150455847Sbostic if (p->error != 0) 150555847Sbostic return; 150655847Sbostic 150755847Sbostic assert(value < 1<<OPSHIFT); 150855847Sbostic p->strip[pos] = OP(p->strip[pos]) | value; 150955847Sbostic } 151055847Sbostic 151155847Sbostic /* 151255847Sbostic - enlarge - enlarge the strip 151360201Sbostic == static void enlarge(register struct parse *p, sopno size); 151455847Sbostic */ 151555847Sbostic static void 151655847Sbostic enlarge(p, size) 151755847Sbostic register struct parse *p; 151855847Sbostic register sopno size; 151955847Sbostic { 152055847Sbostic register sop *sp; 152155847Sbostic 152255847Sbostic if (p->ssize >= size) 152355847Sbostic return; 152455847Sbostic 152555847Sbostic sp = (sop *)realloc(p->strip, size*sizeof(sop)); 152655847Sbostic if (sp == NULL) { 152755847Sbostic SETERROR(REG_ESPACE); 152855847Sbostic return; 152955847Sbostic } 153055847Sbostic p->strip = sp; 153155847Sbostic p->ssize = size; 153255847Sbostic } 153355847Sbostic 153455847Sbostic /* 153555847Sbostic - stripsnug - compact the strip 153660201Sbostic == static void stripsnug(register struct parse *p, register struct re_guts *g); 153755847Sbostic */ 153855847Sbostic static void 153955847Sbostic stripsnug(p, g) 154055847Sbostic register struct parse *p; 154155847Sbostic register struct re_guts *g; 154255847Sbostic { 154355847Sbostic g->nstates = p->slen; 154466362Sbostic g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 154555847Sbostic if (g->strip == NULL) { 154655847Sbostic SETERROR(REG_ESPACE); 154755847Sbostic g->strip = p->strip; 154855847Sbostic } 154955847Sbostic } 155055847Sbostic 155155847Sbostic /* 155255847Sbostic - findmust - fill in must and mlen with longest mandatory literal string 155360201Sbostic == static void findmust(register struct parse *p, register struct re_guts *g); 155455847Sbostic * 155555847Sbostic * This algorithm could do fancy things like analyzing the operands of | 155655847Sbostic * for common subsequences. Someday. This code is simple and finds most 155755847Sbostic * of the interesting cases. 155855847Sbostic * 155955847Sbostic * Note that must and mlen got initialized during setup. 156055847Sbostic */ 156156355Sbostic static void 156255847Sbostic findmust(p, g) 156355847Sbostic struct parse *p; 156455847Sbostic register struct re_guts *g; 156555847Sbostic { 156655847Sbostic register sop *scan; 156755847Sbostic sop *start; 156855847Sbostic register sop *newstart; 156955847Sbostic register sopno newlen; 157055847Sbostic register sop s; 157155847Sbostic register char *cp; 157255847Sbostic register sopno i; 157355847Sbostic 157455847Sbostic /* avoid making error situations worse */ 157555847Sbostic if (p->error != 0) 157655847Sbostic return; 157755847Sbostic 157855847Sbostic /* find the longest OCHAR sequence in strip */ 157955847Sbostic newlen = 0; 158055847Sbostic scan = g->strip + 1; 158155847Sbostic do { 158255847Sbostic s = *scan++; 158355847Sbostic switch (OP(s)) { 158455847Sbostic case OCHAR: /* sequence member */ 158555847Sbostic if (newlen == 0) /* new sequence */ 158655847Sbostic newstart = scan - 1; 158755847Sbostic newlen++; 158855847Sbostic break; 158955847Sbostic case OPLUS_: /* things that don't break one */ 159055847Sbostic case OLPAREN: 159155847Sbostic case ORPAREN: 159255847Sbostic break; 159355847Sbostic case OQUEST_: /* things that must be skipped */ 159455847Sbostic case OCH_: 159555847Sbostic scan--; 159655847Sbostic do { 159755847Sbostic scan += OPND(s); 159855847Sbostic s = *scan; 159955847Sbostic /* assert() interferes w debug printouts */ 160055847Sbostic if (OP(s) != O_QUEST && OP(s) != O_CH && 160155847Sbostic OP(s) != OOR2) { 160255847Sbostic g->iflags |= BAD; 160355847Sbostic return; 160455847Sbostic } 160555847Sbostic } while (OP(s) != O_QUEST && OP(s) != O_CH); 160655847Sbostic /* fallthrough */ 160755847Sbostic default: /* things that break a sequence */ 160855847Sbostic if (newlen > g->mlen) { /* ends one */ 160955847Sbostic start = newstart; 161055847Sbostic g->mlen = newlen; 161155847Sbostic } 161255847Sbostic newlen = 0; 161355847Sbostic break; 161455847Sbostic } 161555847Sbostic } while (OP(s) != OEND); 161655847Sbostic 161755847Sbostic if (g->mlen == 0) /* there isn't one */ 161855847Sbostic return; 161955847Sbostic 162055847Sbostic /* turn it into a character string */ 162155847Sbostic g->must = malloc((size_t)g->mlen + 1); 162255847Sbostic if (g->must == NULL) { /* argh; just forget it */ 162355847Sbostic g->mlen = 0; 162455847Sbostic return; 162555847Sbostic } 162655847Sbostic cp = g->must; 162755847Sbostic scan = start; 162855847Sbostic for (i = g->mlen; i > 0; i--) { 162955847Sbostic while (OP(s = *scan++) != OCHAR) 163055847Sbostic continue; 163166362Sbostic assert(cp < g->must + g->mlen); 163260201Sbostic *cp++ = (char)OPND(s); 163355847Sbostic } 163466362Sbostic assert(cp == g->must + g->mlen); 163555847Sbostic *cp++ = '\0'; /* just on general principles */ 163655847Sbostic } 163755847Sbostic 163855847Sbostic /* 163955847Sbostic - pluscount - count + nesting 164060201Sbostic == static sopno pluscount(register struct parse *p, register struct re_guts *g); 164155847Sbostic */ 164256355Sbostic static sopno /* nesting depth */ 164355847Sbostic pluscount(p, g) 164455847Sbostic struct parse *p; 164555847Sbostic register struct re_guts *g; 164655847Sbostic { 164755847Sbostic register sop *scan; 164855847Sbostic register sop s; 164955847Sbostic register sopno plusnest = 0; 165055847Sbostic register sopno maxnest = 0; 165155847Sbostic 165255847Sbostic if (p->error != 0) 165355847Sbostic return(0); /* there may not be an OEND */ 165455847Sbostic 165555847Sbostic scan = g->strip + 1; 165655847Sbostic do { 165755847Sbostic s = *scan++; 165855847Sbostic switch (OP(s)) { 165955847Sbostic case OPLUS_: 166055847Sbostic plusnest++; 166155847Sbostic break; 166255847Sbostic case O_PLUS: 166355847Sbostic if (plusnest > maxnest) 166455847Sbostic maxnest = plusnest; 166555847Sbostic plusnest--; 166655847Sbostic break; 166755847Sbostic } 166855847Sbostic } while (OP(s) != OEND); 166955847Sbostic if (plusnest != 0) 167055847Sbostic g->iflags |= BAD; 167155847Sbostic return(maxnest); 167255847Sbostic } 1673