1*55847Sbostic /*- 2*55847Sbostic * Copyright (c) 1992 Henry Spencer. 3*55847Sbostic * Copyright (c) 1992 The Regents of the University of California. 4*55847Sbostic * All rights reserved. 5*55847Sbostic * 6*55847Sbostic * This code is derived from software contributed to Berkeley by 7*55847Sbostic * Henry Spencer of the University of Toronto. 8*55847Sbostic * 9*55847Sbostic * %sccs.include.redist.c% 10*55847Sbostic * 11*55847Sbostic * @(#)regcomp.c 5.1 (Berkeley) 08/06/92 12*55847Sbostic */ 13*55847Sbostic 14*55847Sbostic #if defined(LIBC_SCCS) && !defined(lint) 15*55847Sbostic static char sccsid[] = "@(#)regcomp.c 5.1 (Berkeley) 08/06/92"; 16*55847Sbostic #endif /* LIBC_SCCS and not lint */ 17*55847Sbostic 18*55847Sbostic #include <sys/types.h> 19*55847Sbostic 20*55847Sbostic #include <stdio.h> 21*55847Sbostic #include <string.h> 22*55847Sbostic #include <ctype.h> 23*55847Sbostic #include <limits.h> 24*55847Sbostic #include <stdlib.h> 25*55847Sbostic #include <assert.h> 26*55847Sbostic #include <regex.h> 27*55847Sbostic 28*55847Sbostic #include "utils.h" 29*55847Sbostic #include "regex2.h" 30*55847Sbostic 31*55847Sbostic #include "cclass.h" 32*55847Sbostic #include "cname.h" 33*55847Sbostic 34*55847Sbostic static uchar nuls[10]; /* place to point scanner in event of error */ 35*55847Sbostic 36*55847Sbostic /* 37*55847Sbostic * parse structure, passed up and down to avoid global variables and 38*55847Sbostic * other clumsinesses 39*55847Sbostic */ 40*55847Sbostic struct parse { 41*55847Sbostic uchar *next; /* next character in RE */ 42*55847Sbostic int error; /* has an error been seen? */ 43*55847Sbostic sop *strip; /* malloced strip */ 44*55847Sbostic sopno ssize; /* malloced strip size (allocated) */ 45*55847Sbostic sopno slen; /* malloced strip length (used) */ 46*55847Sbostic int ncsalloc; /* number of csets allocated */ 47*55847Sbostic struct re_guts *g; 48*55847Sbostic # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 49*55847Sbostic sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 50*55847Sbostic sopno pend[NPAREN]; /* -> ) ([0] unused) */ 51*55847Sbostic }; 52*55847Sbostic 53*55847Sbostic STATIC void doemit __P((struct parse *, sop, size_t)); 54*55847Sbostic STATIC void dofwd __P((struct parse *, sopno, sop)); 55*55847Sbostic STATIC void doinsert __P((struct parse *, sop, size_t, sopno)); 56*55847Sbostic STATIC sopno dupl __P((struct parse *, sopno, sopno)); 57*55847Sbostic STATIC void enlarge __P((struct parse *, sopno)); 58*55847Sbostic STATIC void mcadd __P((struct parse *, cset *, uchar *)); 59*55847Sbostic STATIC uchar *mcfind __P((cset *, uchar *)); 60*55847Sbostic STATIC int mcin __P((struct parse *, cset *, uchar *)); 61*55847Sbostic STATIC void mcinvert __P((struct parse *, cset *)); 62*55847Sbostic STATIC void mcsub __P((struct parse *, cset *, uchar *)); 63*55847Sbostic STATIC int seterr __P((struct parse *, int)); 64*55847Sbostic 65*55847Sbostic /* 66*55847Sbostic * macros for use with parse structure 67*55847Sbostic * BEWARE: these know that the parse structure is named `p' !!! 68*55847Sbostic */ 69*55847Sbostic #define PEEK() ((uchar)*p->next) 70*55847Sbostic #define PEEK2() ((uchar)*(p->next+1)) 71*55847Sbostic #define SEE(c) (PEEK() == (c)) 72*55847Sbostic #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 73*55847Sbostic #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 74*55847Sbostic #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 75*55847Sbostic #define NEXT() (p->next++) 76*55847Sbostic #define NEXT2() (p->next += 2) 77*55847Sbostic #define NEXTn(n) (p->next += (n)) 78*55847Sbostic #define GETNEXT() ((uchar)*p->next++) 79*55847Sbostic #define SETERROR(e) seterr(p, (e)) 80*55847Sbostic #define REQUIRE(co, e) ((co) || SETERROR(e)) 81*55847Sbostic #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 82*55847Sbostic #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 83*55847Sbostic #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 84*55847Sbostic #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 85*55847Sbostic #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 86*55847Sbostic #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 87*55847Sbostic #define BACK(sop, pos) EMIT(sop, HERE()-pos) 88*55847Sbostic #define HERE() (p->slen) 89*55847Sbostic #define THERE() (p->slen - 1) 90*55847Sbostic #define DROP(n) (p->slen -= (n)) 91*55847Sbostic 92*55847Sbostic /* 93*55847Sbostic - regcomp - interface for parser and compilation 94*55847Sbostic */ 95*55847Sbostic int /* 0 success, otherwise REG_something */ 96*55847Sbostic regcomp(preg, pattern, cflags) 97*55847Sbostic regex_t *preg; 98*55847Sbostic const char *pattern; 99*55847Sbostic int cflags; 100*55847Sbostic { 101*55847Sbostic struct parse pa; 102*55847Sbostic register struct re_guts *g; 103*55847Sbostic register struct parse *p = &pa; 104*55847Sbostic sopno nstates; 105*55847Sbostic register int i; 106*55847Sbostic STATIC void p_ere(); 107*55847Sbostic STATIC void p_bre(); 108*55847Sbostic STATIC void stripsnug(); 109*55847Sbostic STATIC void categorize(); 110*55847Sbostic STATIC void findmust(); 111*55847Sbostic STATIC sopno pluscount(); 112*55847Sbostic 113*55847Sbostic /* do the mallocs early so failure handling is easy */ 114*55847Sbostic /* the +NUC here is for the category table */ 115*55847Sbostic g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 116*55847Sbostic if (g == NULL) 117*55847Sbostic return(REG_ESPACE); 118*55847Sbostic p->ssize = strlen(pattern)/2*3 + 1; 119*55847Sbostic p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 120*55847Sbostic p->slen = 0; 121*55847Sbostic if (p->strip == NULL) { 122*55847Sbostic free((char *)g); 123*55847Sbostic return(REG_ESPACE); 124*55847Sbostic } 125*55847Sbostic 126*55847Sbostic /* set things up */ 127*55847Sbostic p->g = g; 128*55847Sbostic p->next = (uchar *)pattern; 129*55847Sbostic p->error = 0; 130*55847Sbostic p->ncsalloc = 0; 131*55847Sbostic for (i = 0; i < NPAREN; i++) { 132*55847Sbostic p->pbegin[i] = 0; 133*55847Sbostic p->pend[i] = 0; 134*55847Sbostic } 135*55847Sbostic g->csetsize = NUC; 136*55847Sbostic g->sets = NULL; 137*55847Sbostic g->setbits = NULL; 138*55847Sbostic g->ncsets = 0; 139*55847Sbostic g->cflags = cflags; 140*55847Sbostic g->iflags = 0; 141*55847Sbostic g->must = NULL; 142*55847Sbostic g->mlen = 0; 143*55847Sbostic g->nsub = 0; 144*55847Sbostic g->ncategories = 1; /* category 0 is "everything else" */ 145*55847Sbostic g->categories = (uchar *)g + sizeof(struct re_guts); 146*55847Sbostic (void) memset((char *)g->categories, 0, NUC); 147*55847Sbostic g->backrefs = 0; 148*55847Sbostic g->nplus = 0; 149*55847Sbostic 150*55847Sbostic /* do it */ 151*55847Sbostic EMIT(OEND, 0); 152*55847Sbostic g->firststate = THERE(); 153*55847Sbostic if (cflags®_EXTENDED) 154*55847Sbostic p_ere(p, '\0'); 155*55847Sbostic else 156*55847Sbostic p_bre(p, '\0', '\0'); 157*55847Sbostic EMIT(OEND, 0); 158*55847Sbostic g->laststate = THERE(); 159*55847Sbostic 160*55847Sbostic /* tidy up loose ends and fill things in */ 161*55847Sbostic categorize(p, g); 162*55847Sbostic stripsnug(p, g); 163*55847Sbostic findmust(p, g); 164*55847Sbostic g->nplus = pluscount(p, g); 165*55847Sbostic g->magic = MAGIC2; 166*55847Sbostic preg->re_nsub = g->nsub; 167*55847Sbostic preg->re_g = g; 168*55847Sbostic preg->re_magic = MAGIC1; 169*55847Sbostic #ifdef NDEBUG 170*55847Sbostic /* not debugging, so can't rely on the assert() in regexec() */ 171*55847Sbostic if (g->iflags&BAD) 172*55847Sbostic SETERROR(REG_ASSERT); 173*55847Sbostic #endif 174*55847Sbostic 175*55847Sbostic /* win or lose, we're done */ 176*55847Sbostic if (p->error != 0) /* lose */ 177*55847Sbostic regfree(preg); 178*55847Sbostic return(p->error); 179*55847Sbostic } 180*55847Sbostic 181*55847Sbostic /* 182*55847Sbostic - p_ere - ERE parser top level, concatenation and alternation 183*55847Sbostic */ 184*55847Sbostic static void 185*55847Sbostic p_ere(p, stop) 186*55847Sbostic register struct parse *p; 187*55847Sbostic uchar stop; /* character this ERE should end at */ 188*55847Sbostic { 189*55847Sbostic STATIC void p_ere_exp(); 190*55847Sbostic register uchar c; 191*55847Sbostic register sopno prevback; 192*55847Sbostic register sopno prevfwd; 193*55847Sbostic register sopno conc; 194*55847Sbostic register int first = 1; /* is this the first alternative? */ 195*55847Sbostic 196*55847Sbostic for (;;) { 197*55847Sbostic /* do a bunch of concatenated expressions */ 198*55847Sbostic conc = HERE(); 199*55847Sbostic while ((c = PEEK()) != '|' && c != stop && c != '\0') 200*55847Sbostic p_ere_exp(p); 201*55847Sbostic REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 202*55847Sbostic 203*55847Sbostic if (!EAT('|')) 204*55847Sbostic break; /* NOTE BREAK OUT */ 205*55847Sbostic 206*55847Sbostic if (first) { 207*55847Sbostic INSERT(OCH_, conc); /* offset is wrong */ 208*55847Sbostic prevfwd = conc; 209*55847Sbostic prevback = conc; 210*55847Sbostic first = 0; 211*55847Sbostic } 212*55847Sbostic BACK(OOR1, prevback); 213*55847Sbostic prevback = THERE(); 214*55847Sbostic FWD(prevfwd); /* fix previous offset */ 215*55847Sbostic prevfwd = HERE(); 216*55847Sbostic EMIT(OOR2, 0); /* offset is very wrong */ 217*55847Sbostic } 218*55847Sbostic 219*55847Sbostic if (!first) { /* tail-end fixups */ 220*55847Sbostic FWD(prevfwd); 221*55847Sbostic BACK(O_CH, prevback); 222*55847Sbostic } 223*55847Sbostic 224*55847Sbostic assert(SEE(stop) || SEE('\0')); 225*55847Sbostic } 226*55847Sbostic 227*55847Sbostic /* 228*55847Sbostic - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 229*55847Sbostic */ 230*55847Sbostic static void 231*55847Sbostic p_ere_exp(p) 232*55847Sbostic register struct parse *p; 233*55847Sbostic { 234*55847Sbostic STATIC int p_count(); 235*55847Sbostic STATIC void p_bracket(); 236*55847Sbostic STATIC void ordinary(); 237*55847Sbostic STATIC void nonnewline(); 238*55847Sbostic STATIC void repeat(); 239*55847Sbostic register uchar c; 240*55847Sbostic register sopno pos; 241*55847Sbostic register sopno sub; 242*55847Sbostic register int count; 243*55847Sbostic register int count2; 244*55847Sbostic register sopno subno; 245*55847Sbostic int wascaret = 0; 246*55847Sbostic /* we call { a repetition if followed by a digit */ 247*55847Sbostic # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 248*55847Sbostic (c1 == '{' && isdigit(c2))) 249*55847Sbostic 250*55847Sbostic c = GETNEXT(); 251*55847Sbostic assert(c != '\0'); /* caller should have ensured this */ 252*55847Sbostic 253*55847Sbostic pos = HERE(); 254*55847Sbostic switch (c) { 255*55847Sbostic case '(': 256*55847Sbostic MUSTNOTSEE('\0', REG_EPAREN); 257*55847Sbostic p->g->nsub++; 258*55847Sbostic subno = p->g->nsub; 259*55847Sbostic if (subno < NPAREN) 260*55847Sbostic p->pbegin[subno] = HERE(); 261*55847Sbostic EMIT(OLPAREN, subno); 262*55847Sbostic if (!SEE(')')) 263*55847Sbostic p_ere(p, ')'); 264*55847Sbostic if (subno < NPAREN) { 265*55847Sbostic p->pend[subno] = HERE(); 266*55847Sbostic assert(p->pend[subno] != 0); 267*55847Sbostic } 268*55847Sbostic EMIT(ORPAREN, subno); 269*55847Sbostic MUSTEAT(')', REG_EPAREN); 270*55847Sbostic break; 271*55847Sbostic #ifndef POSIX_MISTAKE 272*55847Sbostic case ')': /* happens only if no current unmatched ( */ 273*55847Sbostic /* 274*55847Sbostic * You may ask, why the ifndef? Because I didn't notice 275*55847Sbostic * this until slightly too late for 1003.2, and none of the 276*55847Sbostic * other 1003.2 regular-expression reviewers noticed it at 277*55847Sbostic * all. So an unmatched ) is legal POSIX, at least until 278*55847Sbostic * we can get it fixed. 279*55847Sbostic */ 280*55847Sbostic SETERROR(REG_EPAREN); 281*55847Sbostic break; 282*55847Sbostic #endif 283*55847Sbostic case '^': 284*55847Sbostic EMIT(OBOL, 0); 285*55847Sbostic p->g->iflags |= USEBOL; 286*55847Sbostic wascaret = 1; 287*55847Sbostic break; 288*55847Sbostic case '$': 289*55847Sbostic EMIT(OEOL, 0); 290*55847Sbostic p->g->iflags |= USEEOL; 291*55847Sbostic break; 292*55847Sbostic case '|': 293*55847Sbostic SETERROR(REG_EMPTY); 294*55847Sbostic break; 295*55847Sbostic case '*': 296*55847Sbostic case '+': 297*55847Sbostic case '?': 298*55847Sbostic SETERROR(REG_BADRPT); 299*55847Sbostic break; 300*55847Sbostic case '.': 301*55847Sbostic if (p->g->cflags®_NEWLINE) 302*55847Sbostic nonnewline(p); 303*55847Sbostic else 304*55847Sbostic EMIT(OANY, 0); 305*55847Sbostic break; 306*55847Sbostic case '[': 307*55847Sbostic p_bracket(p); 308*55847Sbostic break; 309*55847Sbostic case '\\': 310*55847Sbostic c = GETNEXT(); 311*55847Sbostic if (c == '^' || c == '.' || c == '[' || c == '$' || 312*55847Sbostic c == '(' || c == ')' || c == '|' || 313*55847Sbostic c == '*' || c == '+' || c == '?' || 314*55847Sbostic c == '{' || c == '\\') 315*55847Sbostic ordinary(p, c); 316*55847Sbostic else 317*55847Sbostic SETERROR(REG_EESCAPE); 318*55847Sbostic break; 319*55847Sbostic case '{': /* okay as ordinary except if digit follows */ 320*55847Sbostic REQUIRE(!isdigit(PEEK()), REG_BADRPT); 321*55847Sbostic /* FALLTHROUGH */ 322*55847Sbostic default: 323*55847Sbostic ordinary(p, c); 324*55847Sbostic break; 325*55847Sbostic } 326*55847Sbostic 327*55847Sbostic c = PEEK(); 328*55847Sbostic if (!ISRPT(c, PEEK2())) 329*55847Sbostic return; /* no repetition, we're done */ 330*55847Sbostic NEXT(); 331*55847Sbostic 332*55847Sbostic REQUIRE(!wascaret, REG_BADRPT); 333*55847Sbostic switch (c) { 334*55847Sbostic case '*': /* implemented as +? */ 335*55847Sbostic INSERT(OPLUS_, pos); 336*55847Sbostic BACK(O_PLUS, pos); 337*55847Sbostic INSERT(OQUEST_, pos); 338*55847Sbostic BACK(O_QUEST, pos); 339*55847Sbostic break; 340*55847Sbostic case '+': 341*55847Sbostic INSERT(OPLUS_, pos); 342*55847Sbostic BACK(O_PLUS, pos); 343*55847Sbostic break; 344*55847Sbostic case '?': 345*55847Sbostic INSERT(OQUEST_, pos); 346*55847Sbostic BACK(O_QUEST, pos); 347*55847Sbostic break; 348*55847Sbostic case '{': 349*55847Sbostic count = p_count(p); 350*55847Sbostic if (EAT(',')) { 351*55847Sbostic if (isdigit(PEEK())) { 352*55847Sbostic count2 = p_count(p); 353*55847Sbostic REQUIRE(count <= count2, REG_BADBR); 354*55847Sbostic } else /* single number with comma */ 355*55847Sbostic count2 = INFINITY; 356*55847Sbostic } else /* just a single number */ 357*55847Sbostic count2 = count; 358*55847Sbostic repeat(p, pos, count, count2); 359*55847Sbostic if (!EAT('}')) { /* error heuristics */ 360*55847Sbostic while ((c = PEEK()) != '\0' && c != '}') 361*55847Sbostic NEXT(); 362*55847Sbostic if (c == '\0') 363*55847Sbostic SETERROR(REG_EBRACE); 364*55847Sbostic else 365*55847Sbostic SETERROR(REG_BADBR); 366*55847Sbostic } 367*55847Sbostic break; 368*55847Sbostic } 369*55847Sbostic 370*55847Sbostic c = PEEK(); 371*55847Sbostic REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 372*55847Sbostic } 373*55847Sbostic 374*55847Sbostic /* 375*55847Sbostic - p_bre - BRE parser top level, anchoring and concatenation 376*55847Sbostic * 377*55847Sbostic * Giving end1 as '\0' essentially eliminates the end1/end2 check. 378*55847Sbostic */ 379*55847Sbostic static void 380*55847Sbostic p_bre(p, end1, end2) 381*55847Sbostic register struct parse *p; 382*55847Sbostic register uchar end1; /* first terminating character */ 383*55847Sbostic register uchar end2; /* second terminating character */ 384*55847Sbostic { 385*55847Sbostic STATIC int p_simp_re(); 386*55847Sbostic register sopno start = HERE(); 387*55847Sbostic register int first = 1; /* first subexpression? */ 388*55847Sbostic register int wasdollar; 389*55847Sbostic 390*55847Sbostic if (EAT('^')) { 391*55847Sbostic EMIT(OBOL, 0); 392*55847Sbostic p->g->iflags |= USEBOL; 393*55847Sbostic } 394*55847Sbostic while (!SEE('\0') && !SEETWO(end1, end2)) { 395*55847Sbostic wasdollar = p_simp_re(p, first); 396*55847Sbostic first = 0; 397*55847Sbostic } 398*55847Sbostic if (wasdollar) { /* oops, that was a trailing anchor */ 399*55847Sbostic DROP(1); 400*55847Sbostic EMIT(OEOL, 0); 401*55847Sbostic p->g->iflags |= USEEOL; 402*55847Sbostic } 403*55847Sbostic 404*55847Sbostic REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 405*55847Sbostic } 406*55847Sbostic 407*55847Sbostic /* 408*55847Sbostic - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 409*55847Sbostic */ 410*55847Sbostic static int /* was the simple RE an unbackslashed $? */ 411*55847Sbostic p_simp_re(p, starordinary) 412*55847Sbostic register struct parse *p; 413*55847Sbostic int starordinary; /* is a leading * an ordinary character? */ 414*55847Sbostic { 415*55847Sbostic STATIC int p_count(); 416*55847Sbostic STATIC void p_bracket(); 417*55847Sbostic STATIC void ordinary(); 418*55847Sbostic STATIC void nonnewline(); 419*55847Sbostic STATIC void repeat(); 420*55847Sbostic register int c; 421*55847Sbostic register int count; 422*55847Sbostic register int count2; 423*55847Sbostic register sopno pos; 424*55847Sbostic register sopno sub; 425*55847Sbostic register int i; 426*55847Sbostic register sopno subno; 427*55847Sbostic # define BACKSL (1<<CHAR_BIT) 428*55847Sbostic 429*55847Sbostic pos = HERE(); /* repetion op, if any, covers from here */ 430*55847Sbostic 431*55847Sbostic c = GETNEXT(); 432*55847Sbostic assert(c != '\0'); /* caller should have ensured this */ 433*55847Sbostic if (c == '\\') 434*55847Sbostic c = BACKSL | PEEK(); 435*55847Sbostic switch (c) { 436*55847Sbostic case '.': 437*55847Sbostic if (p->g->cflags®_NEWLINE) 438*55847Sbostic nonnewline(p); 439*55847Sbostic else 440*55847Sbostic EMIT(OANY, 0); 441*55847Sbostic break; 442*55847Sbostic case '[': 443*55847Sbostic p_bracket(p); 444*55847Sbostic break; 445*55847Sbostic case BACKSL|'^': 446*55847Sbostic case BACKSL|'.': 447*55847Sbostic case BACKSL|'*': 448*55847Sbostic case BACKSL|'[': 449*55847Sbostic case BACKSL|'$': 450*55847Sbostic case BACKSL|'\\': 451*55847Sbostic ordinary(p, c&~BACKSL); 452*55847Sbostic NEXT(); 453*55847Sbostic break; 454*55847Sbostic case BACKSL|'{': 455*55847Sbostic SETERROR(REG_BADRPT); 456*55847Sbostic break; 457*55847Sbostic case BACKSL|'(': 458*55847Sbostic NEXT(); 459*55847Sbostic p->g->nsub++; 460*55847Sbostic subno = p->g->nsub; 461*55847Sbostic if (subno < NPAREN) 462*55847Sbostic p->pbegin[subno] = HERE(); 463*55847Sbostic EMIT(OLPAREN, subno); 464*55847Sbostic /* the SEE here is an error heuristic */ 465*55847Sbostic if (!SEE('\0') && !SEETWO('\\', ')')) 466*55847Sbostic p_bre(p, '\\', ')'); 467*55847Sbostic if (subno < NPAREN) { 468*55847Sbostic p->pend[subno] = HERE(); 469*55847Sbostic assert(p->pend[subno] != 0); 470*55847Sbostic } 471*55847Sbostic EMIT(ORPAREN, subno); 472*55847Sbostic REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 473*55847Sbostic break; 474*55847Sbostic case BACKSL|')': /* should not get here -- must be user */ 475*55847Sbostic case BACKSL|'}': 476*55847Sbostic SETERROR(REG_EPAREN); 477*55847Sbostic break; 478*55847Sbostic case BACKSL|'1': 479*55847Sbostic case BACKSL|'2': 480*55847Sbostic case BACKSL|'3': 481*55847Sbostic case BACKSL|'4': 482*55847Sbostic case BACKSL|'5': 483*55847Sbostic case BACKSL|'6': 484*55847Sbostic case BACKSL|'7': 485*55847Sbostic case BACKSL|'8': 486*55847Sbostic case BACKSL|'9': 487*55847Sbostic i = (c&~BACKSL) - '0'; 488*55847Sbostic assert(i < NPAREN); 489*55847Sbostic if (p->pend[i] != 0) { 490*55847Sbostic assert(i <= p->g->nsub); 491*55847Sbostic EMIT(OBACK_, i); 492*55847Sbostic assert(p->pbegin[i] != 0); 493*55847Sbostic assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 494*55847Sbostic assert(OP(p->strip[p->pend[i]]) == ORPAREN); 495*55847Sbostic (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 496*55847Sbostic EMIT(O_BACK, i); 497*55847Sbostic } else 498*55847Sbostic SETERROR(REG_ESUBREG); 499*55847Sbostic p->g->backrefs = 1; 500*55847Sbostic NEXT(); 501*55847Sbostic break; 502*55847Sbostic case '*': 503*55847Sbostic REQUIRE(starordinary, REG_BADRPT); 504*55847Sbostic /* FALLTHROUGH */ 505*55847Sbostic default: 506*55847Sbostic if (c & BACKSL) { 507*55847Sbostic SETERROR(REG_EESCAPE); 508*55847Sbostic c &= ~BACKSL; 509*55847Sbostic } 510*55847Sbostic ordinary(p, (uchar)c); 511*55847Sbostic break; 512*55847Sbostic } 513*55847Sbostic 514*55847Sbostic if (EAT('*')) { /* implemented as +? */ 515*55847Sbostic INSERT(OPLUS_, pos); 516*55847Sbostic BACK(O_PLUS, pos); 517*55847Sbostic INSERT(OQUEST_, pos); 518*55847Sbostic BACK(O_QUEST, pos); 519*55847Sbostic } else if (EATTWO('\\', '{')) { 520*55847Sbostic count = p_count(p); 521*55847Sbostic if (EAT(',')) { 522*55847Sbostic if (isdigit(PEEK())) { 523*55847Sbostic count2 = p_count(p); 524*55847Sbostic REQUIRE(count <= count2, REG_BADBR); 525*55847Sbostic } else /* single number with comma */ 526*55847Sbostic count2 = INFINITY; 527*55847Sbostic } else /* just a single number */ 528*55847Sbostic count2 = count; 529*55847Sbostic repeat(p, pos, count, count2); 530*55847Sbostic if (!EATTWO('\\', '}')) { /* error heuristics */ 531*55847Sbostic while (!SEE('\0') && !SEETWO('\\', '}')) 532*55847Sbostic NEXT(); 533*55847Sbostic if (SEE('\0')) 534*55847Sbostic SETERROR(REG_EBRACE); 535*55847Sbostic else 536*55847Sbostic SETERROR(REG_BADBR); 537*55847Sbostic } 538*55847Sbostic } else if (c == '$') /* unbackslashed $ not followed by reptn */ 539*55847Sbostic return(1); 540*55847Sbostic 541*55847Sbostic return(0); 542*55847Sbostic } 543*55847Sbostic 544*55847Sbostic /* 545*55847Sbostic - p_count - parse a repetition count 546*55847Sbostic */ 547*55847Sbostic static int /* the value */ 548*55847Sbostic p_count(p) 549*55847Sbostic register struct parse *p; 550*55847Sbostic { 551*55847Sbostic register int count = 0; 552*55847Sbostic register int ndigits = 0; 553*55847Sbostic 554*55847Sbostic while (isdigit(PEEK()) && count <= DUPMAX) { 555*55847Sbostic count = count*10 + (GETNEXT() - '0'); 556*55847Sbostic ndigits++; 557*55847Sbostic } 558*55847Sbostic 559*55847Sbostic REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 560*55847Sbostic return(count); 561*55847Sbostic } 562*55847Sbostic 563*55847Sbostic /* 564*55847Sbostic - p_bracket - parse a bracketed character list 565*55847Sbostic * 566*55847Sbostic * Note a significant property of this code: if the allocset() did SETERROR, 567*55847Sbostic * no set operations are done. 568*55847Sbostic */ 569*55847Sbostic static void 570*55847Sbostic p_bracket(p) 571*55847Sbostic register struct parse *p; 572*55847Sbostic { 573*55847Sbostic STATIC void p_b_term(); 574*55847Sbostic STATIC cset *allocset(); 575*55847Sbostic STATIC int freezeset(); 576*55847Sbostic register uchar c; 577*55847Sbostic register cset *cs = allocset(p); 578*55847Sbostic register int invert = 0; 579*55847Sbostic 580*55847Sbostic if (EAT('^')) 581*55847Sbostic invert++; /* make note to invert set at end */ 582*55847Sbostic if (EAT(']')) 583*55847Sbostic CHadd(cs, ']'); 584*55847Sbostic while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 585*55847Sbostic p_b_term(p, cs); 586*55847Sbostic if (EAT('-')) 587*55847Sbostic CHadd(cs, '-'); 588*55847Sbostic MUSTEAT(']', REG_EBRACK); 589*55847Sbostic 590*55847Sbostic if (invert && p->error == 0) { 591*55847Sbostic register int i; 592*55847Sbostic 593*55847Sbostic for (i = p->g->csetsize - 1; i >= 0; i--) 594*55847Sbostic if (CHIN(cs, i)) 595*55847Sbostic CHsub(cs, i); 596*55847Sbostic else 597*55847Sbostic CHadd(cs, i); 598*55847Sbostic if (p->g->cflags®_NEWLINE) 599*55847Sbostic CHsub(cs, '\n'); 600*55847Sbostic if (cs->multis != NULL) 601*55847Sbostic mcinvert(p, cs); 602*55847Sbostic } 603*55847Sbostic assert(cs->multis == NULL); /* xxx */ 604*55847Sbostic EMIT(OANYOF, freezeset(p, cs)); 605*55847Sbostic } 606*55847Sbostic 607*55847Sbostic /* 608*55847Sbostic - p_b_term - parse one term of a bracketed character list 609*55847Sbostic */ 610*55847Sbostic static void 611*55847Sbostic p_b_term(p, cs) 612*55847Sbostic register struct parse *p; 613*55847Sbostic register cset *cs; 614*55847Sbostic { 615*55847Sbostic STATIC uchar p_b_symbol(); 616*55847Sbostic STATIC void p_b_cclass(); 617*55847Sbostic STATIC void p_b_eclass(); 618*55847Sbostic STATIC uchar othercase(); 619*55847Sbostic register uchar c; 620*55847Sbostic register uchar start, finish; 621*55847Sbostic register int i; 622*55847Sbostic 623*55847Sbostic /* classify what we've got */ 624*55847Sbostic switch (PEEK()) { 625*55847Sbostic case '[': 626*55847Sbostic c = PEEK2(); 627*55847Sbostic break; 628*55847Sbostic case '-': 629*55847Sbostic SETERROR(REG_ERANGE); 630*55847Sbostic return; /* NOTE RETURN */ 631*55847Sbostic break; 632*55847Sbostic default: 633*55847Sbostic c = '\0'; 634*55847Sbostic break; 635*55847Sbostic } 636*55847Sbostic 637*55847Sbostic switch (c) { 638*55847Sbostic case ':': /* character class */ 639*55847Sbostic NEXT2(); 640*55847Sbostic c = PEEK(); 641*55847Sbostic REQUIRE(c != '\0', REG_EBRACK); 642*55847Sbostic REQUIRE(c != '-' && c != ']', REG_ECTYPE); 643*55847Sbostic p_b_cclass(p, cs); 644*55847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 645*55847Sbostic REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 646*55847Sbostic break; 647*55847Sbostic case '=': /* equivalence class */ 648*55847Sbostic NEXT2(); 649*55847Sbostic c = PEEK(); 650*55847Sbostic REQUIRE(c != '\0', REG_EBRACK); 651*55847Sbostic REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 652*55847Sbostic p_b_eclass(p, cs); 653*55847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 654*55847Sbostic REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 655*55847Sbostic break; 656*55847Sbostic default: /* symbol, ordinary character, or range */ 657*55847Sbostic /* xxx revision needed for multichar stuff */ 658*55847Sbostic start = p_b_symbol(p); 659*55847Sbostic if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 660*55847Sbostic /* range */ 661*55847Sbostic NEXT(); 662*55847Sbostic if (EAT('-')) 663*55847Sbostic finish = '-'; 664*55847Sbostic else 665*55847Sbostic finish = p_b_symbol(p); 666*55847Sbostic } else 667*55847Sbostic finish = start; 668*55847Sbostic REQUIRE(start <= finish, REG_ERANGE); 669*55847Sbostic for (i = start; i <= finish; i++) { 670*55847Sbostic CHadd(cs, i); 671*55847Sbostic if ((p->g->cflags®_ICASE) && isalpha(i)) { 672*55847Sbostic c = othercase((uchar)i); 673*55847Sbostic CHadd(cs, c); 674*55847Sbostic } 675*55847Sbostic } 676*55847Sbostic break; 677*55847Sbostic } 678*55847Sbostic } 679*55847Sbostic 680*55847Sbostic /* 681*55847Sbostic - p_b_cclass - parse a character-class name and deal with it 682*55847Sbostic */ 683*55847Sbostic static void 684*55847Sbostic p_b_cclass(p, cs) 685*55847Sbostic register struct parse *p; 686*55847Sbostic register cset *cs; 687*55847Sbostic { 688*55847Sbostic register uchar *sb = p->next; 689*55847Sbostic register uchar *se = sb; 690*55847Sbostic register struct cclass *cp; 691*55847Sbostic register int len; 692*55847Sbostic register uchar *u; 693*55847Sbostic register uchar c; 694*55847Sbostic 695*55847Sbostic while (isalpha(*se)) 696*55847Sbostic se++; 697*55847Sbostic len = se - sb; 698*55847Sbostic NEXTn(len); 699*55847Sbostic for (cp = cclasses; cp->name != NULL; cp++) 700*55847Sbostic if (strncmp(cp->name, (char *)sb, len) == 0 && 701*55847Sbostic cp->name[len] == '\0') 702*55847Sbostic break; 703*55847Sbostic if (cp->name == NULL) { 704*55847Sbostic /* oops, didn't find it */ 705*55847Sbostic SETERROR(REG_ECTYPE); 706*55847Sbostic return; 707*55847Sbostic } 708*55847Sbostic 709*55847Sbostic u = (uchar *)cp->chars; 710*55847Sbostic while ((c = *u++) != '\0') 711*55847Sbostic CHadd(cs, c); 712*55847Sbostic for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 713*55847Sbostic MCadd(cs, u); 714*55847Sbostic } 715*55847Sbostic 716*55847Sbostic /* 717*55847Sbostic - p_b_eclass - parse an equivalence-class name and deal with it 718*55847Sbostic * 719*55847Sbostic * This implementation is incomplete. xxx 720*55847Sbostic */ 721*55847Sbostic static void 722*55847Sbostic p_b_eclass(p, cs) 723*55847Sbostic register struct parse *p; 724*55847Sbostic register cset *cs; 725*55847Sbostic { 726*55847Sbostic register uchar c; 727*55847Sbostic STATIC uchar p_b_coll_elem(); 728*55847Sbostic 729*55847Sbostic c = p_b_coll_elem(p, '='); 730*55847Sbostic CHadd(cs, c); 731*55847Sbostic } 732*55847Sbostic 733*55847Sbostic /* 734*55847Sbostic - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 735*55847Sbostic */ 736*55847Sbostic static uchar /* value of symbol */ 737*55847Sbostic p_b_symbol(p) 738*55847Sbostic register struct parse *p; 739*55847Sbostic { 740*55847Sbostic STATIC uchar p_b_coll_elem(); 741*55847Sbostic register uchar value; 742*55847Sbostic 743*55847Sbostic if (!EATTWO('[', '.')) { 744*55847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 745*55847Sbostic return(GETNEXT()); 746*55847Sbostic } 747*55847Sbostic 748*55847Sbostic /* collating symbol */ 749*55847Sbostic MUSTNOTSEE('\0', REG_EBRACK); 750*55847Sbostic value = p_b_coll_elem(p, '.'); 751*55847Sbostic REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 752*55847Sbostic return(value); 753*55847Sbostic } 754*55847Sbostic 755*55847Sbostic /* 756*55847Sbostic - p_b_coll_elem - parse a collating-element name and look it up 757*55847Sbostic */ 758*55847Sbostic static uchar /* value of collating element */ 759*55847Sbostic p_b_coll_elem(p, endc) 760*55847Sbostic register struct parse *p; 761*55847Sbostic uchar endc; /* name ended by endc,']' */ 762*55847Sbostic { 763*55847Sbostic register uchar *sp = p->next; 764*55847Sbostic register struct cname *cp; 765*55847Sbostic register int len; 766*55847Sbostic register uchar c; 767*55847Sbostic 768*55847Sbostic while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 769*55847Sbostic NEXT(); 770*55847Sbostic if (c == '\0') { 771*55847Sbostic SETERROR(REG_EBRACK); 772*55847Sbostic return(0); 773*55847Sbostic } 774*55847Sbostic len = p->next - sp; 775*55847Sbostic for (cp = cnames; cp->name != NULL; cp++) 776*55847Sbostic if (strncmp(cp->name, (char *)sp, len) == 0 && 777*55847Sbostic cp->name[len] == '\0') 778*55847Sbostic return(cp->code); /* known name */ 779*55847Sbostic if (len == 1) 780*55847Sbostic return(*sp); /* single character */ 781*55847Sbostic SETERROR(REG_ECOLLATE); /* neither */ 782*55847Sbostic return(0); 783*55847Sbostic } 784*55847Sbostic 785*55847Sbostic /* 786*55847Sbostic - othercase - return the case counterpart of an alphabetic 787*55847Sbostic */ 788*55847Sbostic static uchar 789*55847Sbostic othercase(ch) 790*55847Sbostic uchar ch; 791*55847Sbostic { 792*55847Sbostic assert(isalpha(ch)); 793*55847Sbostic if (isupper(ch)) 794*55847Sbostic return(tolower(ch)); 795*55847Sbostic else if (islower(ch)) 796*55847Sbostic return(toupper(ch)); 797*55847Sbostic else /* peculiar, but could happen */ 798*55847Sbostic return(ch); 799*55847Sbostic } 800*55847Sbostic 801*55847Sbostic /* 802*55847Sbostic - bothcases - emit a dualcase version of a character 803*55847Sbostic * Boy, is this implementation ever a kludge... 804*55847Sbostic */ 805*55847Sbostic static void 806*55847Sbostic bothcases(p, ch) 807*55847Sbostic register struct parse *p; 808*55847Sbostic uchar ch; 809*55847Sbostic { 810*55847Sbostic register uchar *oldnext; 811*55847Sbostic uchar bracket[3]; 812*55847Sbostic 813*55847Sbostic oldnext = p->next; 814*55847Sbostic p->next = bracket; 815*55847Sbostic bracket[0] = ch; 816*55847Sbostic bracket[1] = ']'; 817*55847Sbostic bracket[2] = '\0'; 818*55847Sbostic p_bracket(p); 819*55847Sbostic assert(p->next == bracket+2); 820*55847Sbostic p->next = oldnext; 821*55847Sbostic } 822*55847Sbostic 823*55847Sbostic /* 824*55847Sbostic - ordinary - emit an ordinary character 825*55847Sbostic */ 826*55847Sbostic static void 827*55847Sbostic ordinary(p, ch) 828*55847Sbostic register struct parse *p; 829*55847Sbostic register uchar ch; 830*55847Sbostic { 831*55847Sbostic register uchar *cap = p->g->categories; 832*55847Sbostic 833*55847Sbostic if ((p->g->cflags®_ICASE) && isalpha(ch)) { 834*55847Sbostic bothcases(p, ch); 835*55847Sbostic return; 836*55847Sbostic } 837*55847Sbostic 838*55847Sbostic EMIT(OCHAR, ch); 839*55847Sbostic if (cap[ch] == 0) 840*55847Sbostic cap[ch] = p->g->ncategories++; 841*55847Sbostic } 842*55847Sbostic 843*55847Sbostic /* 844*55847Sbostic - nonnewline - emit REG_NEWLINE version of OANY 845*55847Sbostic * Boy, is this implementation ever a kludge... 846*55847Sbostic */ 847*55847Sbostic static void 848*55847Sbostic nonnewline(p) 849*55847Sbostic register struct parse *p; 850*55847Sbostic { 851*55847Sbostic register uchar *oldnext; 852*55847Sbostic uchar bracket[4]; 853*55847Sbostic 854*55847Sbostic oldnext = p->next; 855*55847Sbostic p->next = bracket; 856*55847Sbostic bracket[0] = '^'; 857*55847Sbostic bracket[1] = '\n'; 858*55847Sbostic bracket[2] = ']'; 859*55847Sbostic bracket[3] = '\0'; 860*55847Sbostic p_bracket(p); 861*55847Sbostic assert(p->next == bracket+3); 862*55847Sbostic p->next = oldnext; 863*55847Sbostic } 864*55847Sbostic 865*55847Sbostic /* 866*55847Sbostic - repeat - generate code for a bounded repetition, recursively if needed 867*55847Sbostic */ 868*55847Sbostic static void 869*55847Sbostic repeat(p, start, from, to) 870*55847Sbostic register struct parse *p; 871*55847Sbostic sopno start; /* operand from here to end of strip */ 872*55847Sbostic int from; /* repeated from this number */ 873*55847Sbostic int to; /* to this number of times (maybe INFINITY) */ 874*55847Sbostic { 875*55847Sbostic register sopno finish = HERE(); 876*55847Sbostic # define N 2 877*55847Sbostic # define INF 3 878*55847Sbostic # define REP(f, t) ((f)*8 + (t)) 879*55847Sbostic # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 880*55847Sbostic register sopno copy; 881*55847Sbostic 882*55847Sbostic if (p->error != 0) /* head off possible runaway recursion */ 883*55847Sbostic return; 884*55847Sbostic 885*55847Sbostic assert(from <= to); 886*55847Sbostic 887*55847Sbostic switch (REP(MAP(from), MAP(to))) { 888*55847Sbostic case REP(0, 0): /* must be user doing this */ 889*55847Sbostic DROP(finish-start); /* drop the operand */ 890*55847Sbostic break; 891*55847Sbostic case REP(0, 1): /* as x{1,1}? */ 892*55847Sbostic case REP(0, N): /* as x{1,n}? */ 893*55847Sbostic case REP(0, INF): /* as x{1,}? */ 894*55847Sbostic INSERT(OQUEST_, start); /* offset is wrong... */ 895*55847Sbostic repeat(p, start+1, 1, to); 896*55847Sbostic FWD(start); /* ... fix it */ 897*55847Sbostic BACK(O_QUEST, start); 898*55847Sbostic break; 899*55847Sbostic case REP(1, 1): /* trivial case */ 900*55847Sbostic /* done */ 901*55847Sbostic break; 902*55847Sbostic case REP(1, N): /* as x?x{1,n-1} */ 903*55847Sbostic INSERT(OQUEST_, start); 904*55847Sbostic BACK(O_QUEST, start); 905*55847Sbostic copy = dupl(p, start+1, finish+1); 906*55847Sbostic assert(copy == finish+2); 907*55847Sbostic repeat(p, copy, 1, to-1); 908*55847Sbostic break; 909*55847Sbostic case REP(1, INF): /* as x+ */ 910*55847Sbostic INSERT(OPLUS_, start); 911*55847Sbostic BACK(O_PLUS, start); 912*55847Sbostic break; 913*55847Sbostic case REP(N, N): /* as xx{m-1,n-1} */ 914*55847Sbostic copy = dupl(p, start, finish); 915*55847Sbostic repeat(p, copy, from-1, to-1); 916*55847Sbostic break; 917*55847Sbostic case REP(N, INF): /* as xx{n-1,INF} */ 918*55847Sbostic copy = dupl(p, start, finish); 919*55847Sbostic repeat(p, copy, from-1, to); 920*55847Sbostic break; 921*55847Sbostic default: /* "can't happen" */ 922*55847Sbostic SETERROR(REG_ASSERT); /* just in case */ 923*55847Sbostic break; 924*55847Sbostic } 925*55847Sbostic } 926*55847Sbostic 927*55847Sbostic /* 928*55847Sbostic - seterr - set an error condition 929*55847Sbostic */ 930*55847Sbostic static int /* useless but makes type checking happy */ 931*55847Sbostic seterr(p, e) 932*55847Sbostic register struct parse *p; 933*55847Sbostic int e; 934*55847Sbostic { 935*55847Sbostic if (p->error == 0) /* keep earliest error condition */ 936*55847Sbostic p->error = e; 937*55847Sbostic p->next = nuls; /* try to bring things to a halt */ 938*55847Sbostic return(0); /* make the return value well-defined */ 939*55847Sbostic } 940*55847Sbostic 941*55847Sbostic /* 942*55847Sbostic - allocset - allocate a set of characters for [] 943*55847Sbostic */ 944*55847Sbostic static cset * 945*55847Sbostic allocset(p) 946*55847Sbostic register struct parse *p; 947*55847Sbostic { 948*55847Sbostic register int no = p->g->ncsets++; 949*55847Sbostic register size_t nc; 950*55847Sbostic register size_t nbytes; 951*55847Sbostic register cset *cs; 952*55847Sbostic register int i; 953*55847Sbostic register size_t css = (size_t)p->g->csetsize; 954*55847Sbostic 955*55847Sbostic if (no >= p->ncsalloc) { /* need another column of space */ 956*55847Sbostic p->ncsalloc += CHAR_BIT; 957*55847Sbostic nc = p->ncsalloc; 958*55847Sbostic assert(nc % CHAR_BIT == 0); 959*55847Sbostic nbytes = nc / CHAR_BIT * css; 960*55847Sbostic if (p->g->sets == NULL) 961*55847Sbostic p->g->sets = (cset *)malloc(nc * sizeof(cset)); 962*55847Sbostic else 963*55847Sbostic p->g->sets = (cset *)realloc((char *)p->g->sets, 964*55847Sbostic nc * sizeof(cset)); 965*55847Sbostic if (p->g->setbits == NULL) 966*55847Sbostic p->g->setbits = (uchar *)malloc(nbytes); 967*55847Sbostic else 968*55847Sbostic p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 969*55847Sbostic nbytes); 970*55847Sbostic if (p->g->sets != NULL && p->g->setbits != NULL) 971*55847Sbostic (void) memset((char *)p->g->setbits + (nbytes - css), 972*55847Sbostic 0, css); 973*55847Sbostic else { 974*55847Sbostic no = 0; 975*55847Sbostic SETERROR(REG_ESPACE); 976*55847Sbostic /* caller's responsibility not to do set ops */ 977*55847Sbostic } 978*55847Sbostic } 979*55847Sbostic 980*55847Sbostic assert(p->g->sets != NULL); /* xxx */ 981*55847Sbostic cs = &p->g->sets[no]; 982*55847Sbostic cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 983*55847Sbostic cs->mask = 1 << ((no) % CHAR_BIT); 984*55847Sbostic cs->hash = 0; 985*55847Sbostic cs->smultis = 0; 986*55847Sbostic cs->multis = NULL; 987*55847Sbostic 988*55847Sbostic return(cs); 989*55847Sbostic } 990*55847Sbostic 991*55847Sbostic /* 992*55847Sbostic - freezeset - final processing on a set of characters 993*55847Sbostic * 994*55847Sbostic * The main task here is merging identical sets. This is usually a waste 995*55847Sbostic * of time (although the hash code minimizes the overhead), but can win 996*55847Sbostic * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 997*55847Sbostic * is done using addition rather than xor -- all ASCII [aA] sets xor to 998*55847Sbostic * the same value! 999*55847Sbostic */ 1000*55847Sbostic static int /* set number */ 1001*55847Sbostic freezeset(p, cs) 1002*55847Sbostic register struct parse *p; 1003*55847Sbostic register cset *cs; 1004*55847Sbostic { 1005*55847Sbostic register uchar h = cs->hash; 1006*55847Sbostic register int i; 1007*55847Sbostic register cset *top = &p->g->sets[p->g->ncsets]; 1008*55847Sbostic register uchar c; 1009*55847Sbostic register cset *cs2; 1010*55847Sbostic register size_t css = (size_t)p->g->csetsize; 1011*55847Sbostic 1012*55847Sbostic /* look for an earlier one which is the same */ 1013*55847Sbostic for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1014*55847Sbostic if (cs2->hash == h && cs2 != cs) { 1015*55847Sbostic /* maybe */ 1016*55847Sbostic for (i = 0; i < css; i++) 1017*55847Sbostic if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1018*55847Sbostic break; /* no */ 1019*55847Sbostic if (i == css) 1020*55847Sbostic break; /* yes */ 1021*55847Sbostic } 1022*55847Sbostic 1023*55847Sbostic if (cs2 < top) { /* found one */ 1024*55847Sbostic assert(cs == top-1); 1025*55847Sbostic p->g->ncsets--; 1026*55847Sbostic for (i = 0; i < css; i++) 1027*55847Sbostic CHsub(cs, i); 1028*55847Sbostic cs = cs2; 1029*55847Sbostic } 1030*55847Sbostic 1031*55847Sbostic return((int)(cs - p->g->sets)); 1032*55847Sbostic } 1033*55847Sbostic 1034*55847Sbostic /* 1035*55847Sbostic - mcadd - add a collating element to a cset 1036*55847Sbostic */ 1037*55847Sbostic static void 1038*55847Sbostic mcadd(p, cs, cp) 1039*55847Sbostic register struct parse *p; 1040*55847Sbostic register cset *cs; 1041*55847Sbostic register uchar *cp; 1042*55847Sbostic { 1043*55847Sbostic register size_t oldend = cs->smultis; 1044*55847Sbostic 1045*55847Sbostic cs->smultis += strlen((char *)cp) + 1; 1046*55847Sbostic if (cs->multis == NULL) 1047*55847Sbostic cs->multis = (uchar *)malloc(cs->smultis); 1048*55847Sbostic else 1049*55847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1050*55847Sbostic if (cs->multis == NULL) { 1051*55847Sbostic SETERROR(REG_ESPACE); 1052*55847Sbostic return; 1053*55847Sbostic } 1054*55847Sbostic 1055*55847Sbostic (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 1056*55847Sbostic cs->multis[cs->smultis - 1] = '\0'; 1057*55847Sbostic } 1058*55847Sbostic 1059*55847Sbostic /* 1060*55847Sbostic - mcsub - subtract a collating element from a cset 1061*55847Sbostic */ 1062*55847Sbostic static void 1063*55847Sbostic mcsub(p, cs, cp) 1064*55847Sbostic register struct parse *p; 1065*55847Sbostic register cset *cs; 1066*55847Sbostic register uchar *cp; 1067*55847Sbostic { 1068*55847Sbostic register uchar *fp = mcfind(cs, cp); 1069*55847Sbostic register size_t len = strlen((char *)fp); 1070*55847Sbostic 1071*55847Sbostic assert(p != NULL); 1072*55847Sbostic (void) memmove((char *)fp, (char *)(fp + len + 1), 1073*55847Sbostic cs->smultis - (fp + len + 1 - cs->multis)); 1074*55847Sbostic cs->smultis -= len; 1075*55847Sbostic 1076*55847Sbostic if (cs->smultis == 0) { 1077*55847Sbostic free((char *)cs->multis); 1078*55847Sbostic cs->multis = NULL; 1079*55847Sbostic return; 1080*55847Sbostic } 1081*55847Sbostic 1082*55847Sbostic cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1083*55847Sbostic assert(cs->multis != NULL); 1084*55847Sbostic } 1085*55847Sbostic 1086*55847Sbostic /* 1087*55847Sbostic - mcin - is a collating element in a cset? 1088*55847Sbostic */ 1089*55847Sbostic static int 1090*55847Sbostic mcin(p, cs, cp) 1091*55847Sbostic register struct parse *p; 1092*55847Sbostic register cset *cs; 1093*55847Sbostic register uchar *cp; 1094*55847Sbostic { 1095*55847Sbostic return(mcfind(cs, cp) != NULL); 1096*55847Sbostic } 1097*55847Sbostic 1098*55847Sbostic /* 1099*55847Sbostic - mcfind - find a collating element in a cset 1100*55847Sbostic */ 1101*55847Sbostic static uchar * 1102*55847Sbostic mcfind(cs, cp) 1103*55847Sbostic register cset *cs; 1104*55847Sbostic register uchar *cp; 1105*55847Sbostic { 1106*55847Sbostic register uchar *p; 1107*55847Sbostic 1108*55847Sbostic if (cs->multis == NULL) 1109*55847Sbostic return(NULL); 1110*55847Sbostic for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 1111*55847Sbostic if (strcmp((char *)cp, (char *)p) == 0) 1112*55847Sbostic return(p); 1113*55847Sbostic return(NULL); 1114*55847Sbostic } 1115*55847Sbostic 1116*55847Sbostic /* 1117*55847Sbostic - mcinvert - invert the list of collating elements in a cset 1118*55847Sbostic * 1119*55847Sbostic * This would have to know the set of possibilities. Implementation 1120*55847Sbostic * is deferred. 1121*55847Sbostic */ 1122*55847Sbostic static void 1123*55847Sbostic mcinvert(p, cs) 1124*55847Sbostic register struct parse *p; 1125*55847Sbostic register cset *cs; 1126*55847Sbostic { 1127*55847Sbostic assert(cs->multis == NULL); /* xxx */ 1128*55847Sbostic } 1129*55847Sbostic 1130*55847Sbostic /* 1131*55847Sbostic - isinsets - is this character in any sets? 1132*55847Sbostic */ 1133*55847Sbostic static int /* predicate */ 1134*55847Sbostic isinsets(g, c) 1135*55847Sbostic register struct re_guts *g; 1136*55847Sbostic uchar c; 1137*55847Sbostic { 1138*55847Sbostic register uchar *col; 1139*55847Sbostic register int i; 1140*55847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1141*55847Sbostic 1142*55847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1143*55847Sbostic if (col[c] != 0) 1144*55847Sbostic return(1); 1145*55847Sbostic return(0); 1146*55847Sbostic } 1147*55847Sbostic 1148*55847Sbostic /* 1149*55847Sbostic - samesets - are these two characters in exactly the same sets? 1150*55847Sbostic */ 1151*55847Sbostic static int /* predicate */ 1152*55847Sbostic samesets(g, c1, c2) 1153*55847Sbostic register struct re_guts *g; 1154*55847Sbostic register uchar c1; 1155*55847Sbostic register uchar c2; 1156*55847Sbostic { 1157*55847Sbostic register uchar *col; 1158*55847Sbostic register int i; 1159*55847Sbostic register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1160*55847Sbostic 1161*55847Sbostic for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1162*55847Sbostic if (col[c1] != col[c2]) 1163*55847Sbostic return(0); 1164*55847Sbostic return(1); 1165*55847Sbostic } 1166*55847Sbostic 1167*55847Sbostic /* 1168*55847Sbostic - categorize - sort out character categories 1169*55847Sbostic */ 1170*55847Sbostic static void 1171*55847Sbostic categorize(p, g) 1172*55847Sbostic struct parse *p; 1173*55847Sbostic register struct re_guts *g; 1174*55847Sbostic { 1175*55847Sbostic register uchar *cats = g->categories; 1176*55847Sbostic register unsigned c; 1177*55847Sbostic register unsigned c2; 1178*55847Sbostic register uchar cat; 1179*55847Sbostic 1180*55847Sbostic /* avoid making error situations worse */ 1181*55847Sbostic if (p->error != 0) 1182*55847Sbostic return; 1183*55847Sbostic 1184*55847Sbostic for (c = 0; c < g->csetsize; c++) 1185*55847Sbostic if (cats[c] == 0 && isinsets(g, c)) { 1186*55847Sbostic cat = g->ncategories++; 1187*55847Sbostic cats[c] = cat; 1188*55847Sbostic for (c2 = c+1; c2 < g->csetsize; c2++) 1189*55847Sbostic if (cats[c2] == 0 && samesets(g, c, c2)) 1190*55847Sbostic cats[c2] = cat; 1191*55847Sbostic } 1192*55847Sbostic } 1193*55847Sbostic 1194*55847Sbostic /* 1195*55847Sbostic - dupl - emit a duplicate of a bunch of sops 1196*55847Sbostic */ 1197*55847Sbostic static sopno /* start of duplicate */ 1198*55847Sbostic dupl(p, start, finish) 1199*55847Sbostic register struct parse *p; 1200*55847Sbostic sopno start; /* from here */ 1201*55847Sbostic sopno finish; /* to this less one */ 1202*55847Sbostic { 1203*55847Sbostic register int i; 1204*55847Sbostic register sopno ret = HERE(); 1205*55847Sbostic register sopno len = finish - start; 1206*55847Sbostic 1207*55847Sbostic assert(finish >= start); 1208*55847Sbostic if (len == 0) 1209*55847Sbostic return(ret); 1210*55847Sbostic enlarge(p, p->ssize + len); /* this many unexpected additions */ 1211*55847Sbostic assert(p->ssize >= p->slen + len); 1212*55847Sbostic (void) memcpy((char *)(p->strip + p->slen), 1213*55847Sbostic (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1214*55847Sbostic p->slen += len; 1215*55847Sbostic return(ret); 1216*55847Sbostic } 1217*55847Sbostic 1218*55847Sbostic /* 1219*55847Sbostic - doemit - emit a strip operator 1220*55847Sbostic * 1221*55847Sbostic * It might seem better to implement this as a macro with a function as 1222*55847Sbostic * hard-case backup, but it's just too big and messy unless there are 1223*55847Sbostic * some changes to the data structures. Maybe later. 1224*55847Sbostic */ 1225*55847Sbostic static void 1226*55847Sbostic doemit(p, op, opnd) 1227*55847Sbostic register struct parse *p; 1228*55847Sbostic sop op; 1229*55847Sbostic size_t opnd; 1230*55847Sbostic { 1231*55847Sbostic /* avoid making error situations worse */ 1232*55847Sbostic if (p->error != 0) 1233*55847Sbostic return; 1234*55847Sbostic 1235*55847Sbostic /* deal with oversize operands ("can't happen", more or less) */ 1236*55847Sbostic assert(opnd < 1<<OPSHIFT); 1237*55847Sbostic 1238*55847Sbostic /* deal with undersized strip */ 1239*55847Sbostic if (p->slen >= p->ssize) 1240*55847Sbostic enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1241*55847Sbostic assert(p->slen < p->ssize); 1242*55847Sbostic 1243*55847Sbostic /* finally, it's all reduced to the easy case */ 1244*55847Sbostic p->strip[p->slen++] = SOP(op, opnd); 1245*55847Sbostic } 1246*55847Sbostic 1247*55847Sbostic /* 1248*55847Sbostic - doinsert - insert a sop into the strip 1249*55847Sbostic */ 1250*55847Sbostic static void 1251*55847Sbostic doinsert(p, op, opnd, pos) 1252*55847Sbostic register struct parse *p; 1253*55847Sbostic sop op; 1254*55847Sbostic size_t opnd; 1255*55847Sbostic sopno pos; 1256*55847Sbostic { 1257*55847Sbostic register sopno sn; 1258*55847Sbostic register sop s; 1259*55847Sbostic register int i; 1260*55847Sbostic 1261*55847Sbostic /* avoid making error situations worse */ 1262*55847Sbostic if (p->error != 0) 1263*55847Sbostic return; 1264*55847Sbostic 1265*55847Sbostic sn = HERE(); 1266*55847Sbostic EMIT(op, opnd); /* do checks, ensure space */ 1267*55847Sbostic assert(HERE() == sn+1); 1268*55847Sbostic s = p->strip[sn]; 1269*55847Sbostic 1270*55847Sbostic /* adjust paren pointers */ 1271*55847Sbostic assert(pos > 0); 1272*55847Sbostic for (i = 1; i < NPAREN; i++) { 1273*55847Sbostic if (p->pbegin[i] >= pos) { 1274*55847Sbostic p->pbegin[i]++; 1275*55847Sbostic } 1276*55847Sbostic if (p->pend[i] >= pos) { 1277*55847Sbostic p->pend[i]++; 1278*55847Sbostic } 1279*55847Sbostic } 1280*55847Sbostic 1281*55847Sbostic memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1282*55847Sbostic (HERE()-pos-1)*sizeof(sop)); 1283*55847Sbostic p->strip[pos] = s; 1284*55847Sbostic } 1285*55847Sbostic 1286*55847Sbostic /* 1287*55847Sbostic - dofwd - complete a forward reference 1288*55847Sbostic */ 1289*55847Sbostic static void 1290*55847Sbostic dofwd(p, pos, value) 1291*55847Sbostic register struct parse *p; 1292*55847Sbostic register sopno pos; 1293*55847Sbostic sop value; 1294*55847Sbostic { 1295*55847Sbostic /* avoid making error situations worse */ 1296*55847Sbostic if (p->error != 0) 1297*55847Sbostic return; 1298*55847Sbostic 1299*55847Sbostic assert(value < 1<<OPSHIFT); 1300*55847Sbostic p->strip[pos] = OP(p->strip[pos]) | value; 1301*55847Sbostic } 1302*55847Sbostic 1303*55847Sbostic /* 1304*55847Sbostic - enlarge - enlarge the strip 1305*55847Sbostic */ 1306*55847Sbostic static void 1307*55847Sbostic enlarge(p, size) 1308*55847Sbostic register struct parse *p; 1309*55847Sbostic register sopno size; 1310*55847Sbostic { 1311*55847Sbostic register sop *sp; 1312*55847Sbostic 1313*55847Sbostic if (p->ssize >= size) 1314*55847Sbostic return; 1315*55847Sbostic 1316*55847Sbostic sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1317*55847Sbostic if (sp == NULL) { 1318*55847Sbostic SETERROR(REG_ESPACE); 1319*55847Sbostic return; 1320*55847Sbostic } 1321*55847Sbostic p->strip = sp; 1322*55847Sbostic p->ssize = size; 1323*55847Sbostic } 1324*55847Sbostic 1325*55847Sbostic /* 1326*55847Sbostic - stripsnug - compact the strip 1327*55847Sbostic */ 1328*55847Sbostic static void 1329*55847Sbostic stripsnug(p, g) 1330*55847Sbostic register struct parse *p; 1331*55847Sbostic register struct re_guts *g; 1332*55847Sbostic { 1333*55847Sbostic g->nstates = p->slen; 1334*55847Sbostic g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 1335*55847Sbostic if (g->strip == NULL) { 1336*55847Sbostic SETERROR(REG_ESPACE); 1337*55847Sbostic g->strip = p->strip; 1338*55847Sbostic } 1339*55847Sbostic } 1340*55847Sbostic 1341*55847Sbostic /* 1342*55847Sbostic - findmust - fill in must and mlen with longest mandatory literal string 1343*55847Sbostic * 1344*55847Sbostic * This algorithm could do fancy things like analyzing the operands of | 1345*55847Sbostic * for common subsequences. Someday. This code is simple and finds most 1346*55847Sbostic * of the interesting cases. 1347*55847Sbostic * 1348*55847Sbostic * Note that must and mlen got initialized during setup. 1349*55847Sbostic */ 1350*55847Sbostic STATIC void 1351*55847Sbostic findmust(p, g) 1352*55847Sbostic struct parse *p; 1353*55847Sbostic register struct re_guts *g; 1354*55847Sbostic { 1355*55847Sbostic register sop *scan; 1356*55847Sbostic sop *start; 1357*55847Sbostic register sop *newstart; 1358*55847Sbostic register sopno newlen; 1359*55847Sbostic register sop s; 1360*55847Sbostic register char *cp; 1361*55847Sbostic register sopno i; 1362*55847Sbostic 1363*55847Sbostic /* avoid making error situations worse */ 1364*55847Sbostic if (p->error != 0) 1365*55847Sbostic return; 1366*55847Sbostic 1367*55847Sbostic /* find the longest OCHAR sequence in strip */ 1368*55847Sbostic newlen = 0; 1369*55847Sbostic scan = g->strip + 1; 1370*55847Sbostic do { 1371*55847Sbostic s = *scan++; 1372*55847Sbostic switch (OP(s)) { 1373*55847Sbostic case OCHAR: /* sequence member */ 1374*55847Sbostic if (newlen == 0) /* new sequence */ 1375*55847Sbostic newstart = scan - 1; 1376*55847Sbostic newlen++; 1377*55847Sbostic break; 1378*55847Sbostic case OPLUS_: /* things that don't break one */ 1379*55847Sbostic case OLPAREN: 1380*55847Sbostic case ORPAREN: 1381*55847Sbostic break; 1382*55847Sbostic case OQUEST_: /* things that must be skipped */ 1383*55847Sbostic case OCH_: 1384*55847Sbostic scan--; 1385*55847Sbostic do { 1386*55847Sbostic scan += OPND(s); 1387*55847Sbostic s = *scan; 1388*55847Sbostic /* assert() interferes w debug printouts */ 1389*55847Sbostic if (OP(s) != O_QUEST && OP(s) != O_CH && 1390*55847Sbostic OP(s) != OOR2) { 1391*55847Sbostic g->iflags |= BAD; 1392*55847Sbostic return; 1393*55847Sbostic } 1394*55847Sbostic } while (OP(s) != O_QUEST && OP(s) != O_CH); 1395*55847Sbostic /* fallthrough */ 1396*55847Sbostic default: /* things that break a sequence */ 1397*55847Sbostic if (newlen > g->mlen) { /* ends one */ 1398*55847Sbostic start = newstart; 1399*55847Sbostic g->mlen = newlen; 1400*55847Sbostic } 1401*55847Sbostic newlen = 0; 1402*55847Sbostic break; 1403*55847Sbostic } 1404*55847Sbostic } while (OP(s) != OEND); 1405*55847Sbostic 1406*55847Sbostic if (g->mlen == 0) /* there isn't one */ 1407*55847Sbostic return; 1408*55847Sbostic 1409*55847Sbostic /* turn it into a character string */ 1410*55847Sbostic g->must = malloc((size_t)g->mlen + 1); 1411*55847Sbostic if (g->must == NULL) { /* argh; just forget it */ 1412*55847Sbostic g->mlen = 0; 1413*55847Sbostic return; 1414*55847Sbostic } 1415*55847Sbostic cp = g->must; 1416*55847Sbostic scan = start; 1417*55847Sbostic for (i = g->mlen; i > 0; i--) { 1418*55847Sbostic while (OP(s = *scan++) != OCHAR) 1419*55847Sbostic continue; 1420*55847Sbostic *cp++ = OPND(s); 1421*55847Sbostic } 1422*55847Sbostic *cp++ = '\0'; /* just on general principles */ 1423*55847Sbostic } 1424*55847Sbostic 1425*55847Sbostic /* 1426*55847Sbostic - pluscount - count + nesting 1427*55847Sbostic */ 1428*55847Sbostic STATIC sopno /* nesting depth */ 1429*55847Sbostic pluscount(p, g) 1430*55847Sbostic struct parse *p; 1431*55847Sbostic register struct re_guts *g; 1432*55847Sbostic { 1433*55847Sbostic register sop *scan; 1434*55847Sbostic register sop s; 1435*55847Sbostic register sopno plusnest = 0; 1436*55847Sbostic register sopno maxnest = 0; 1437*55847Sbostic 1438*55847Sbostic if (p->error != 0) 1439*55847Sbostic return(0); /* there may not be an OEND */ 1440*55847Sbostic 1441*55847Sbostic scan = g->strip + 1; 1442*55847Sbostic do { 1443*55847Sbostic s = *scan++; 1444*55847Sbostic switch (OP(s)) { 1445*55847Sbostic case OPLUS_: 1446*55847Sbostic plusnest++; 1447*55847Sbostic break; 1448*55847Sbostic case O_PLUS: 1449*55847Sbostic if (plusnest > maxnest) 1450*55847Sbostic maxnest = plusnest; 1451*55847Sbostic plusnest--; 1452*55847Sbostic break; 1453*55847Sbostic } 1454*55847Sbostic } while (OP(s) != OEND); 1455*55847Sbostic if (plusnest != 0) 1456*55847Sbostic g->iflags |= BAD; 1457*55847Sbostic return(maxnest); 1458*55847Sbostic } 1459