14887Schin /***********************************************************************
24887Schin * *
34887Schin * This software is part of the ast package *
4*12068SRoger.Faulkner@Oracle.COM * Copyright (c) 1985-2010 AT&T Intellectual Property *
54887Schin * and is licensed under the *
64887Schin * Common Public License, Version 1.0 *
78462SApril.Chin@Sun.COM * by AT&T Intellectual Property *
84887Schin * *
94887Schin * A copy of the License is available at *
104887Schin * http://www.opensource.org/licenses/cpl1.0.txt *
114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
124887Schin * *
134887Schin * Information and Software Systems Research *
144887Schin * AT&T Research *
154887Schin * Florham Park NJ *
164887Schin * *
174887Schin * Glenn Fowler <gsf@research.att.com> *
184887Schin * David Korn <dgk@research.att.com> *
194887Schin * Phong Vo <kpv@research.att.com> *
204887Schin * *
214887Schin ***********************************************************************/
224887Schin #pragma prototyped
234887Schin
244887Schin /*
254887Schin * posix regex compiler
264887Schin */
274887Schin
284887Schin #include "reglib.h"
294887Schin
304887Schin #if _PACKAGE_ast
314887Schin #include "lclib.h"
324887Schin #endif
334887Schin
344887Schin #define serialize re_serialize /* hp.ia64 <unistd.h>! */
354887Schin
364887Schin #define C_ESC (-1)
374887Schin #define C_MB (-2)
384887Schin
394887Schin #if _AST_REGEX_DEBUG
404887Schin
414887Schin #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
424887Schin #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
434887Schin #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
444887Schin
454887Schin static unsigned long debug;
464887Schin static unsigned long debug_flag;
474887Schin
484887Schin #else
494887Schin
504887Schin #define DEBUG_INIT()
514887Schin #define DEBUG_TEST(f,y,n) (n)
524887Schin #define DEBUG_CODE(f,y,n) do {n} while(0)
534887Schin
544887Schin #endif
554887Schin
564887Schin #if _PACKAGE_ast
574887Schin
584887Schin typedef struct Cchr_s
594887Schin {
604887Schin Dtlink_t lnk;
614887Schin unsigned char nam[2];
624887Schin Ckey_t key;
634887Schin } Cchr_t;
644887Schin
654887Schin #endif
664887Schin
674887Schin #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
684887Schin
694887Schin /*
704887Schin * determine whether greedy matching will work, i.e. produce
714887Schin * the best match first. such expressions are "easy", and
724887Schin * need no backtracking once a complete match is found.
734887Schin * if an expression has backreferences or alts it's hard
744887Schin * else if it has only one closure it's easy
754887Schin * else if all closures are simple (i.e. one-character) it's easy
764887Schin * else it's hard.
774887Schin */
784887Schin
794887Schin typedef struct Stats_s
804887Schin {
814887Schin unsigned long l; /* min length to left of x */
824887Schin unsigned long k; /* min length to left of y */
834887Schin unsigned long m; /* min length */
844887Schin unsigned long n; /* max length */
854887Schin unsigned short a; /* number of alternations */
864887Schin unsigned short b; /* number of backrefs */
874887Schin unsigned short c; /* number of closures */
88*12068SRoger.Faulkner@Oracle.COM unsigned short e; /* $ */
894887Schin unsigned short i; /* number of negations */
904887Schin unsigned short p; /* number of named subexpressions */
914887Schin unsigned short s; /* number of simple closures */
924887Schin unsigned short t; /* number of tries */
934887Schin unsigned short u; /* number of unnamed subexpressions */
944887Schin Rex_t* x; /* max length REX_STRING */
954887Schin Rex_t* y; /* max length REX_TRIE */
964887Schin } Stats_t;
974887Schin
984887Schin typedef struct Token_s
994887Schin {
1004887Schin unsigned long min;
1014887Schin unsigned long max;
1024887Schin short lex;
1034887Schin short len;
1044887Schin short esc;
1054887Schin short att;
1064887Schin short push;
1074887Schin } Token_t;
1084887Schin
1094887Schin typedef struct Cenv_s
1104887Schin {
1114887Schin int delimiter; /* pattern delimiter */
1124887Schin int error; /* last error */
1134887Schin int explicit; /* explicit match on this char */
1144887Schin int mappeddot; /* inverse mapped '.' */
1154887Schin int mappednewline; /* inverse mapped '\n' */
1164887Schin int mappedslash; /* inverse mapped '/' */
1174887Schin regflags_t flags; /* flags arg to regcomp */
1184887Schin int type; /* BRE,ERE,ARE,SRE,KRE */
1194887Schin unsigned char* cursor; /* curent point in re */
1204887Schin unsigned char* pattern; /* the original pattern */
1214887Schin unsigned char* literal; /* literal restart pattern */
1224887Schin int parno; /* number of last open paren */
1234887Schin int parnest; /* paren nest count */
1244887Schin int posixkludge; /* to make * nonspecial */
12510898Sroland.mainz@nrubsig.org int regexp; /* <regexp.h> compatibility */
1264887Schin Token_t token; /* token lookahead */
1274887Schin Stats_t stats; /* RE statistics */
1284887Schin int terminator; /* pattern terminator */
1294887Schin Rex_t* paren[2*(BACK_REF_MAX+2)];
1304887Schin /* paren[i]!=0 if \i defined */
1314887Schin regex_t* regex; /* user handle */
1324887Schin regdisc_t* disc; /* user discipline */
1334887Schin unsigned char* map; /* external to native ccode map */
1344887Schin unsigned char* MAP; /* fold and/or map */
1354887Schin } Cenv_t;
1364887Schin
1374887Schin /*
1384887Schin * allocate a new Rex_t node
1394887Schin */
1404887Schin
1414887Schin static Rex_t*
node(Cenv_t * env,int type,int lo,int hi,size_t extra)1424887Schin node(Cenv_t* env, int type, int lo, int hi, size_t extra)
1434887Schin {
1444887Schin register Rex_t* e;
1454887Schin
1464887Schin if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra))
1474887Schin {
1484887Schin memset(e, 0, sizeof(Rex_t) + extra);
1494887Schin e->type = type;
1504887Schin e->marked = 0;
1514887Schin e->lo = lo;
1524887Schin e->hi = hi;
1534887Schin e->flags = env->flags;
1544887Schin e->map = (e->flags & REG_ICASE) ? env->MAP : env->map;
1554887Schin e->explicit = env->explicit;
1564887Schin if (extra)
1574887Schin e->re.data = (char*)e + sizeof(Rex_t);
1584887Schin }
1594887Schin return e;
1604887Schin }
1614887Schin
1624887Schin /*
1634887Schin * free a Trie_node_t node
1644887Schin */
1654887Schin
1664887Schin static void
triedrop(regdisc_t * disc,Trie_node_t * e)1674887Schin triedrop(regdisc_t* disc, Trie_node_t* e)
1684887Schin {
1694887Schin if (e)
1704887Schin {
1714887Schin triedrop(disc, e->son);
1724887Schin triedrop(disc, e->sib);
1734887Schin alloc(disc, e, 0);
1744887Schin }
1754887Schin }
1764887Schin
1774887Schin /*
1784887Schin * free a Rex_t node
1794887Schin */
1804887Schin
1814887Schin void
drop(regdisc_t * disc,Rex_t * e)1824887Schin drop(regdisc_t* disc, Rex_t* e)
1834887Schin {
1844887Schin int i;
1854887Schin Rex_t* x;
1864887Schin
1874887Schin if (e && !(disc->re_flags & REG_NOFREE))
1884887Schin do
1894887Schin {
1904887Schin switch (e->type)
1914887Schin {
1924887Schin case REX_ALT:
1934887Schin case REX_CONJ:
1944887Schin drop(disc, e->re.group.expr.binary.left);
1954887Schin drop(disc, e->re.group.expr.binary.right);
1964887Schin break;
1974887Schin case REX_GROUP:
1984887Schin case REX_GROUP_AHEAD:
1994887Schin case REX_GROUP_AHEAD_NOT:
2004887Schin case REX_GROUP_BEHIND:
2014887Schin case REX_GROUP_BEHIND_NOT:
2024887Schin case REX_GROUP_CUT:
2034887Schin case REX_NEG:
2044887Schin case REX_REP:
2054887Schin drop(disc, e->re.group.expr.rex);
2064887Schin break;
2074887Schin case REX_TRIE:
2084887Schin for (i = 0; i <= UCHAR_MAX; i++)
2094887Schin triedrop(disc, e->re.trie.root[i]);
2104887Schin break;
2114887Schin }
2124887Schin x = e->next;
2134887Schin alloc(disc, e, 0);
2144887Schin } while (e = x);
2154887Schin }
2164887Schin
2174887Schin /*
2184887Schin * mark e and descendants minimal
2194887Schin */
2204887Schin
2214887Schin static void
mark(register Rex_t * e,int set)2224887Schin mark(register Rex_t* e, int set)
2234887Schin {
2244887Schin if (e && !e->marked)
2254887Schin do
2264887Schin {
2274887Schin e->marked = 1;
2284887Schin if (set)
2294887Schin e->flags |= REG_MINIMAL;
2304887Schin else
2314887Schin e->flags &= ~REG_MINIMAL;
2324887Schin switch (e->type)
2334887Schin {
2344887Schin case REX_ALT:
2354887Schin case REX_CONJ:
2364887Schin case REX_GROUP_COND:
2374887Schin if (e->re.group.expr.binary.left)
2384887Schin mark(e->re.group.expr.binary.left, set);
2394887Schin if (e->re.group.expr.binary.right)
2404887Schin mark(e->re.group.expr.binary.right, set);
2414887Schin break;
2424887Schin case REX_GROUP:
2434887Schin case REX_GROUP_AHEAD:
2444887Schin case REX_GROUP_AHEAD_NOT:
2454887Schin case REX_GROUP_BEHIND:
2464887Schin case REX_GROUP_BEHIND_NOT:
2474887Schin case REX_GROUP_CUT:
2484887Schin case REX_NEG:
2494887Schin case REX_REP:
2504887Schin case REX_TRIE:
2514887Schin mark(e->re.group.expr.rex, set);
2524887Schin break;
2534887Schin }
2544887Schin } while (e = e->next);
2554887Schin }
2564887Schin
2574887Schin /*
2584887Schin * assign subexpression numbers by a preorder tree walk
2594887Schin */
2604887Schin
2614887Schin static int
serialize(Cenv_t * env,Rex_t * e,int n)2624887Schin serialize(Cenv_t* env, Rex_t* e, int n)
2634887Schin {
2644887Schin do
2654887Schin {
2664887Schin e->serial = n++;
2674887Schin switch (e->type)
2684887Schin {
2694887Schin case REX_ALT:
2704887Schin case REX_GROUP_COND:
2714887Schin if (e->re.group.expr.binary.left)
2724887Schin n = serialize(env, e->re.group.expr.binary.left, n);
2734887Schin e->re.group.expr.binary.serial = n++;
2744887Schin if (e->re.group.expr.binary.right)
2754887Schin n = serialize(env, e->re.group.expr.binary.right, n);
2764887Schin break;
2774887Schin case REX_CONJ:
2784887Schin n = serialize(env, e->re.group.expr.binary.left, n);
2794887Schin n = serialize(env, e->re.group.expr.binary.right, n);
2804887Schin break;
2814887Schin case REX_GROUP:
2824887Schin case REX_GROUP_AHEAD:
2834887Schin case REX_GROUP_AHEAD_NOT:
2844887Schin case REX_GROUP_BEHIND:
2854887Schin case REX_GROUP_BEHIND_NOT:
2864887Schin case REX_GROUP_CUT:
2874887Schin case REX_NEG:
2884887Schin case REX_REP:
2894887Schin n = serialize(env, e->re.group.expr.rex, n);
2904887Schin break;
2914887Schin }
2924887Schin } while (e = e->next);
2934887Schin return n;
2944887Schin }
2954887Schin
2964887Schin /*
2974887Schin * catenate e and f into a sequence, collapsing them if possible
2984887Schin */
2994887Schin
3004887Schin static Rex_t*
cat(Cenv_t * env,Rex_t * e,Rex_t * f)3014887Schin cat(Cenv_t* env, Rex_t* e, Rex_t* f)
3024887Schin {
3034887Schin Rex_t* g;
3044887Schin
3054887Schin if (!f)
3064887Schin {
3074887Schin drop(env->disc, e);
3084887Schin return 0;
3094887Schin }
3104887Schin if (e->type == REX_NULL)
3114887Schin {
3124887Schin drop(env->disc, e);
3134887Schin return f;
3144887Schin }
3154887Schin if (f->type == REX_NULL)
3164887Schin {
3174887Schin g = f->next;
3184887Schin f->next = 0;
3194887Schin drop(env->disc, f);
3204887Schin f = g;
3214887Schin }
3224887Schin else if (e->type == REX_DOT && f->type == REX_DOT)
3234887Schin {
3244887Schin unsigned int m = e->lo + f->lo;
3254887Schin unsigned int n = e->hi + f->hi;
3264887Schin
3274887Schin if (m <= RE_DUP_MAX)
3284887Schin {
3294887Schin if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX)
3304887Schin {
3314887Schin n = RE_DUP_INF;
3324887Schin goto combine;
3334887Schin }
3344887Schin else if (n <= RE_DUP_MAX)
3354887Schin {
3364887Schin combine:
3374887Schin e->lo = m;
3384887Schin e->hi = n;
3394887Schin g = f->next;
3404887Schin f->next = 0;
3414887Schin drop(env->disc, f);
3424887Schin f = g;
3434887Schin }
3444887Schin }
3454887Schin }
3464887Schin e->next = f;
3474887Schin return e;
3484887Schin }
3494887Schin
3504887Schin /*
3514887Schin * collect re statistics
3524887Schin */
3534887Schin
3544887Schin static int
stats(register Cenv_t * env,register Rex_t * e)3554887Schin stats(register Cenv_t* env, register Rex_t* e)
3564887Schin {
3574887Schin register unsigned long n;
3584887Schin register unsigned long m;
3594887Schin unsigned long cm;
3604887Schin unsigned long nm;
3614887Schin unsigned long cn;
3624887Schin unsigned long nn;
3634887Schin unsigned long l;
3644887Schin unsigned long k;
3654887Schin unsigned long t;
3664887Schin Rex_t* q;
3674887Schin Rex_t* x;
3684887Schin Rex_t* y;
3694887Schin unsigned char c;
3704887Schin unsigned char b;
3714887Schin
3724887Schin do
3734887Schin {
3744887Schin switch (e->type)
3754887Schin {
3764887Schin case REX_ALT:
3774887Schin x = env->stats.x;
3784887Schin l = env->stats.l;
3794887Schin y = env->stats.y;
3804887Schin k = env->stats.k;
3814887Schin t = env->stats.t;
3824887Schin if (++env->stats.a <= 0)
3834887Schin return 1;
3844887Schin cm = env->stats.m;
3854887Schin env->stats.m = 0;
3864887Schin cn = env->stats.n;
3874887Schin env->stats.n = 0;
3884887Schin if (stats(env, e->re.group.expr.binary.left))
3894887Schin return 1;
3904887Schin m = env->stats.m;
3914887Schin env->stats.m = 0;
3924887Schin n = env->stats.n;
3934887Schin env->stats.n = 0;
3944887Schin if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right))
3954887Schin return 1;
3964887Schin if (env->stats.m > m)
3974887Schin env->stats.m = m;
3984887Schin else
3994887Schin m = env->stats.m;
4004887Schin if ((env->stats.m += cm) < m)
4014887Schin return 1;
4024887Schin if (env->stats.n < n)
4034887Schin env->stats.n = n;
4044887Schin else
4054887Schin n = env->stats.n;
4064887Schin if ((env->stats.n += cn) < n)
4074887Schin return 1;
4084887Schin env->stats.x = x;
4094887Schin env->stats.l = l;
4104887Schin env->stats.y = y;
4114887Schin env->stats.k = k;
4124887Schin env->stats.t = t;
4134887Schin break;
4144887Schin case REX_BACK:
4154887Schin if (++env->stats.b <= 0)
4164887Schin return 1;
4174887Schin break;
4184887Schin case REX_CLASS:
4194887Schin case REX_COLL_CLASS:
4204887Schin case REX_DOT:
4214887Schin case REX_ONECHAR:
4224887Schin n = env->stats.m;
4234887Schin if ((env->stats.m += e->lo) < n)
4244887Schin return 1;
4254887Schin if (e->hi != RE_DUP_INF)
4264887Schin {
4274887Schin n = env->stats.n;
4284887Schin if ((env->stats.n += e->hi) < n)
4294887Schin return 1;
4304887Schin }
4314887Schin if (e->lo != e->hi)
4324887Schin {
4334887Schin if (++env->stats.c <= 0)
4344887Schin return 1;
4354887Schin if (++env->stats.s <= 0)
4364887Schin return 1;
4374887Schin }
4384887Schin break;
4394887Schin case REX_CONJ:
4404887Schin cm = env->stats.m;
4414887Schin env->stats.m = 0;
4424887Schin cn = env->stats.n;
4434887Schin env->stats.n = 0;
4444887Schin if (stats(env, e->re.group.expr.binary.left))
4454887Schin return 1;
4464887Schin nm = env->stats.m;
4474887Schin env->stats.m = 0;
4484887Schin nn = env->stats.n;
4494887Schin env->stats.n = 0;
4504887Schin if (stats(env, e->re.group.expr.binary.right))
4514887Schin return 1;
4524887Schin if (env->stats.m < nm)
4534887Schin env->stats.m = nm;
4544887Schin else
4554887Schin nm = env->stats.m;
4564887Schin if ((env->stats.m += cm) < nm)
4574887Schin return 1;
4584887Schin if (env->stats.n < nn)
4594887Schin env->stats.n = nn;
4604887Schin else
4614887Schin nn = env->stats.n;
4624887Schin if ((env->stats.n += cn) < nn)
4634887Schin return 1;
4644887Schin break;
465*12068SRoger.Faulkner@Oracle.COM case REX_END:
466*12068SRoger.Faulkner@Oracle.COM env->stats.e = 1;
467*12068SRoger.Faulkner@Oracle.COM break;
4684887Schin case REX_GROUP:
4694887Schin if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0)
4704887Schin return 1;
4714887Schin if (stats(env, e->re.group.expr.rex))
4724887Schin return 1;
4734887Schin break;
4744887Schin case REX_GROUP_AHEAD:
4754887Schin case REX_GROUP_AHEAD_NOT:
4764887Schin case REX_GROUP_BEHIND:
4774887Schin case REX_GROUP_BEHIND_NOT:
4784887Schin m = env->stats.m;
4794887Schin n = env->stats.n;
4804887Schin x = env->stats.x;
4814887Schin y = env->stats.y;
4824887Schin if (stats(env, e->re.group.expr.rex))
4834887Schin return 1;
4844887Schin env->stats.m = m;
4854887Schin env->stats.n = n;
4864887Schin env->stats.x = x;
4874887Schin env->stats.y = y;
4884887Schin switch (e->type)
4894887Schin {
4904887Schin case REX_GROUP_AHEAD:
4914887Schin case REX_GROUP_BEHIND:
4924887Schin if (++env->stats.u <= 0)
4934887Schin return 1;
4944887Schin break;
4954887Schin }
4964887Schin break;
4974887Schin case REX_GROUP_COND:
4984887Schin if (++env->stats.u <= 0)
4994887Schin return 1;
5004887Schin m = env->stats.m;
5014887Schin n = env->stats.n;
5024887Schin x = env->stats.x;
5034887Schin y = env->stats.y;
5044887Schin if (e->re.group.size > 0 && ++env->stats.b <= 0)
5054887Schin return 1;
5064887Schin if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left))
5074887Schin return 1;
5084887Schin if (q = e->re.group.expr.binary.right)
5094887Schin {
5104887Schin if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left))
5114887Schin return 1;
5124887Schin if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right))
5134887Schin return 1;
5144887Schin }
5154887Schin env->stats.m = m;
5164887Schin env->stats.n = n;
5174887Schin env->stats.x = x;
5184887Schin env->stats.y = y;
5194887Schin break;
5204887Schin case REX_GROUP_CUT:
5214887Schin if (++env->stats.u <= 0)
5224887Schin return 1;
5234887Schin m = env->stats.m;
5244887Schin n = env->stats.n;
5254887Schin x = env->stats.x;
5264887Schin y = env->stats.y;
5274887Schin if (stats(env, e->re.group.expr.rex))
5284887Schin return 1;
5294887Schin env->stats.m = m;
5304887Schin env->stats.n = n;
5314887Schin env->stats.x = x;
5324887Schin env->stats.y = y;
5334887Schin break;
5344887Schin case REX_NEG:
5354887Schin env->stats.i++;
5364887Schin x = env->stats.x;
5374887Schin l = env->stats.l;
5384887Schin y = env->stats.y;
5394887Schin k = env->stats.k;
5404887Schin t = env->stats.t;
5414887Schin cm = env->stats.m;
5424887Schin env->stats.m = 0;
5434887Schin if (stats(env, e->re.group.expr.rex))
5444887Schin return 1;
5454887Schin env->stats.m = !env->stats.m;
5464887Schin if ((env->stats.m += cm) < cm)
5474887Schin return 1;
5484887Schin env->stats.x = x;
5494887Schin env->stats.l = l;
5504887Schin env->stats.y = y;
5514887Schin env->stats.k = k;
5524887Schin env->stats.t = t;
5534887Schin break;
5544887Schin case REX_REP:
5554887Schin x = env->stats.x;
5564887Schin l = env->stats.l;
5574887Schin y = env->stats.y;
5584887Schin k = env->stats.k;
5594887Schin t = env->stats.t;
5604887Schin if (++env->stats.c <= 0)
5614887Schin return 1;
5624887Schin b = env->stats.b;
5634887Schin c = env->stats.c;
5644887Schin cm = env->stats.m;
5654887Schin env->stats.m = 0;
5664887Schin if (stats(env, e->re.group.expr.rex))
5674887Schin return 1;
5684887Schin if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0)
5694887Schin return 1;
5704887Schin if (e->lo < 1)
5714887Schin {
5724887Schin env->stats.x = x;
5734887Schin env->stats.l = l;
5744887Schin env->stats.y = y;
5754887Schin env->stats.k = k;
5764887Schin env->stats.t = t;
5774887Schin env->stats.m = cm;
5784887Schin }
5794887Schin else
5804887Schin {
5814887Schin m = env->stats.m;
5824887Schin if ((env->stats.m *= e->lo) > 0 && env->stats.m < m)
5834887Schin return 1;
5844887Schin m = env->stats.m;
5854887Schin if ((env->stats.m += cm) < m)
5864887Schin return 1;
5874887Schin if (env->stats.x != x)
5884887Schin env->stats.l = cm;
5894887Schin if (env->stats.y != y)
5904887Schin env->stats.k = cm;
5914887Schin }
5924887Schin break;
5934887Schin case REX_STRING:
5948462SApril.Chin@Sun.COM if (!e->map)
5954887Schin {
5968462SApril.Chin@Sun.COM cm = env->stats.m;
5978462SApril.Chin@Sun.COM if ((env->stats.m += e->re.string.size) < cm)
5988462SApril.Chin@Sun.COM return 1;
5998462SApril.Chin@Sun.COM cn = env->stats.n;
6008462SApril.Chin@Sun.COM if ((env->stats.n += e->re.string.size) < cn)
6018462SApril.Chin@Sun.COM return 1;
6028462SApril.Chin@Sun.COM if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size)
6038462SApril.Chin@Sun.COM {
6048462SApril.Chin@Sun.COM env->stats.x = e;
6058462SApril.Chin@Sun.COM env->stats.l = cm;
6068462SApril.Chin@Sun.COM }
6074887Schin }
6084887Schin break;
6094887Schin case REX_TRIE:
6104887Schin if (++env->stats.s <= 0)
6114887Schin return 1;
6124887Schin cm = env->stats.m;
6134887Schin if ((env->stats.m += e->re.trie.min) < cm)
6144887Schin return 1;
6154887Schin cn = env->stats.n;
6164887Schin if ((env->stats.n += e->re.trie.max) < cn)
6174887Schin return 1;
6184887Schin env->stats.t++;
6194887Schin if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min)
6204887Schin {
6214887Schin env->stats.y = e;
6224887Schin env->stats.k = cm;
6234887Schin }
6244887Schin break;
6254887Schin }
6264887Schin } while (e = e->next);
6274887Schin return 0;
6284887Schin }
6294887Schin
6304887Schin static int token(Cenv_t*);
6314887Schin
6324887Schin static int
magic(register Cenv_t * env,register int c,int escaped)6334887Schin magic(register Cenv_t* env, register int c, int escaped)
6344887Schin {
6354887Schin register char* sp;
6364887Schin register int n;
6374887Schin int o = c;
6384887Schin int e = env->error;
6394887Schin int l = env->token.len;
6404887Schin short* mp;
6414887Schin char* ep;
6424887Schin
6434887Schin if (mp = state.magic[c])
6444887Schin {
6454887Schin c = mp[env->type+escaped];
6464887Schin if (c >= T_META)
6474887Schin {
6484887Schin sp = (char*)env->cursor + env->token.len;
6494887Schin switch (c)
6504887Schin {
6514887Schin case T_LEFT:
6524887Schin n = 0;
6534887Schin ep = sp;
6544887Schin while (*sp >= '0' && *sp <= '9')
6554887Schin {
6564887Schin if (n > (INT_MAX / 10))
6574887Schin {
6584887Schin env->error = REG_BADBR;
6594887Schin goto bad;
6604887Schin }
6614887Schin n = n * 10 + *sp++ - '0';
6624887Schin }
6634887Schin if (sp == ep)
6644887Schin {
6658462SApril.Chin@Sun.COM if (env->type < SRE || *sp != ',')
6668462SApril.Chin@Sun.COM {
6678462SApril.Chin@Sun.COM env->error = *sp ? REG_BADBR : REG_EBRACE;
6688462SApril.Chin@Sun.COM goto bad;
6698462SApril.Chin@Sun.COM }
6704887Schin }
6714887Schin else if (n > RE_DUP_MAX)
6724887Schin {
6734887Schin env->error = REG_BADBR;
6744887Schin goto bad;
6754887Schin }
6764887Schin env->token.min = n;
6774887Schin if (*sp == ',')
6784887Schin {
6794887Schin n = 0;
6804887Schin ep = ++sp;
6814887Schin while (*sp >= '0' && *sp <= '9')
6824887Schin {
6834887Schin if (n > (INT_MAX / 10))
6844887Schin {
6854887Schin env->error = REG_BADBR;
6864887Schin goto bad;
6874887Schin }
6884887Schin n = n * 10 + *sp++ - '0';
6894887Schin }
6904887Schin if (sp == ep)
6914887Schin n = RE_DUP_INF;
6924887Schin else if (n < env->token.min)
6934887Schin {
6944887Schin env->error = REG_BADBR;
6954887Schin goto bad;
6964887Schin }
6974887Schin }
6984887Schin env->token.max = n;
6994887Schin switch (*sp)
7004887Schin {
7014887Schin case 0:
7024887Schin env->error = REG_EBRACE;
7034887Schin goto bad;
7044887Schin case '\\':
7054887Schin if (!escaped)
7064887Schin {
7074887Schin env->error = REG_BADBR;
7084887Schin goto bad;
7094887Schin }
7104887Schin sp++;
7114887Schin break;
7124887Schin default:
7134887Schin if (escaped)
7144887Schin {
7154887Schin env->error = REG_BADBR;
7164887Schin goto bad;
7174887Schin }
7184887Schin break;
7194887Schin }
7204887Schin switch (*sp++)
7214887Schin {
7224887Schin case 0:
7234887Schin env->error = REG_EBRACE;
7244887Schin goto bad;
7254887Schin case '}':
7264887Schin break;
7274887Schin default:
7284887Schin env->error = REG_BADBR;
7294887Schin goto bad;
7304887Schin }
7314887Schin env->token.len = sp - (char*)env->cursor;
7324887Schin break;
7334887Schin case T_RIGHT:
7344887Schin env->error = REG_EBRACE;
7354887Schin goto bad;
7364887Schin case T_OPEN:
7374887Schin if (env->type < SRE && *sp == '?')
7384887Schin {
7394887Schin env->token.len++;
7404887Schin env->token.lex = 0;
7414887Schin goto group;
7424887Schin }
7434887Schin break;
7444887Schin case T_ESCAPE:
7454887Schin c = chresc(sp - 2, &ep);
7464887Schin if (ep < sp)
7474887Schin goto bad;
7484887Schin env->token.len += ep - sp;
7494887Schin if (c >= T_META)
7504887Schin {
7514887Schin env->token.lex = c;
7524887Schin c = C_ESC;
7534887Schin }
7544887Schin return c;
7554887Schin case T_BACK+0:
7564887Schin case T_BACK+1:
7574887Schin case T_BACK+2:
7584887Schin case T_BACK+3:
7594887Schin case T_BACK+4:
7604887Schin case T_BACK+5:
7614887Schin case T_BACK+6:
7624887Schin case T_BACK+7:
7634887Schin n = chresc(sp - 2, &ep);
7644887Schin if (ep > sp + 1)
7654887Schin {
7664887Schin env->token.len += ep - sp;
7674887Schin return n;
7684887Schin }
7694887Schin /*FALLTHROUGH*/
7704887Schin case T_BACK+8:
7714887Schin case T_BACK+9:
7724887Schin if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT))
7734887Schin {
7744887Schin env->error = REG_BADESC;
7754887Schin goto bad;
7764887Schin }
7774887Schin if ((env->flags & REG_MULTIREF) && isdigit(*sp))
7784887Schin {
7794887Schin c = (c - T_BACK) * 10 + (*sp - '0');
7804887Schin if (c > 0 && c <= env->parno && env->paren[c])
7814887Schin c += T_BACK;
7824887Schin else
7834887Schin c = chresc(sp - 2, &ep);
7844887Schin env->token.len++;
7854887Schin }
7864887Schin if (c == T_BACK)
7874887Schin c = 0;
7884887Schin break;
7894887Schin case T_BAD:
7904887Schin if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META)
7914887Schin return c;
7924887Schin goto bad;
7934887Schin }
7944887Schin if (env->type >= SRE)
7954887Schin {
7964887Schin if (c == T_DOT)
7974887Schin c = '.';
7984887Schin else if (c < T_OPEN)
7994887Schin {
8004887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(')
8014887Schin {
8024887Schin env->token.len++;
8034887Schin env->token.att = 1;
8044887Schin }
8054887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '(')
8064887Schin {
8074887Schin env->token.len++;
8084887Schin switch (c)
8094887Schin {
8104887Schin case T_AT:
8114887Schin break;
8124887Schin case T_PERCENT:
8134887Schin env->token.lex = c;
8144887Schin goto group;
8154887Schin case T_TILDE:
8164887Schin env->token.lex = 0;
8174887Schin goto group;
8184887Schin default:
8194887Schin env->token.lex = c;
8204887Schin break;
8214887Schin }
8224887Schin c = T_OPEN;
8234887Schin }
8244887Schin else if (c == T_STAR)
8254887Schin c = T_DOTSTAR;
8264887Schin else if (c == T_QUES)
8274887Schin c = T_DOT;
8284887Schin else
8294887Schin {
8304887Schin c = o;
8314887Schin env->token.len = l;
8324887Schin }
8334887Schin }
8344887Schin else if (c > T_BACK)
8354887Schin {
8364887Schin c = (c - T_BACK) * 2 - 1;
8374887Schin c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c;
8384887Schin }
8394887Schin else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP))
8404887Schin {
8414887Schin if (c == T_AND)
8424887Schin c = '&';
8434887Schin else if (c == T_BAR)
8444887Schin c = '|';
8454887Schin else if (c == T_OPEN)
8464887Schin c = '(';
8474887Schin }
8484887Schin }
8494887Schin }
8504887Schin }
8514887Schin else if (escaped == 2)
8524887Schin {
8534887Schin if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter))
8544887Schin /*ok*/;
8554887Schin else
8564887Schin {
8574887Schin env->error = REG_BADESC;
8584887Schin goto bad;
8594887Schin }
8604887Schin }
8614887Schin else if (escaped && !(env->flags & REG_LENIENT) && c != ']')
8624887Schin {
8634887Schin env->error = REG_BADESC;
8644887Schin goto bad;
8654887Schin }
8664887Schin return c;
8674887Schin group:
8684887Schin sp = (char*)env->cursor + env->token.len;
8694887Schin switch (*sp++)
8704887Schin {
8714887Schin case ')':
8724887Schin break;
8734887Schin case '#':
8744887Schin for (;;)
8754887Schin {
8764887Schin switch (*sp++)
8774887Schin {
8784887Schin case 0:
8794887Schin env->error = REG_EPAREN;
8804887Schin return T_BAD;
8814887Schin case ')':
8824887Schin break;
8834887Schin default:
8844887Schin continue;
8854887Schin }
8864887Schin break;
8874887Schin }
8884887Schin break;
8894887Schin default:
8904887Schin return T_GROUP;
8914887Schin }
8924887Schin env->cursor = (unsigned char*)sp;
8934887Schin return token(env);
8944887Schin bad:
8954887Schin if (escaped == 2)
8964887Schin env->error = e;
8974887Schin else if (env->flags & REG_LENIENT)
8984887Schin return o;
8994887Schin else if (escaped == 1 && !env->error)
9004887Schin {
9014887Schin if (mp || o == ']')
9024887Schin return o;
9034887Schin env->error = REG_BADESC;
9044887Schin }
9054887Schin return T_BAD;
9064887Schin }
9074887Schin
9084887Schin static int
token(register Cenv_t * env)9094887Schin token(register Cenv_t* env)
9104887Schin {
9114887Schin int c;
9124887Schin int posixkludge;
9134887Schin
9144887Schin if (env->token.push)
9154887Schin return env->token.lex;
9164887Schin env->token.att = env->token.esc = 0;
9174887Schin if ((env->token.len = MBSIZE(env->cursor)) > 1)
9184887Schin return env->token.lex = C_MB;
9194887Schin env->token.lex = 0;
9204887Schin for (;;)
9214887Schin {
9224887Schin c = *env->cursor;
9234887Schin if (c == 0 || c == env->delimiter || c == env->terminator)
9244887Schin return T_END;
9254887Schin if (!(env->flags & REG_COMMENT))
9264887Schin break;
9274887Schin if (c == '#')
9284887Schin {
9294887Schin do
9304887Schin {
9314887Schin c = *++env->cursor;
9324887Schin if (c == 0 || c == env->delimiter)
9334887Schin return T_END;
9344887Schin } while (c != '\n');
9354887Schin }
9364887Schin else if (!isspace(c))
9374887Schin break;
9384887Schin env->cursor++;
9394887Schin }
9404887Schin if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter)
9414887Schin {
9424887Schin if (env->parnest)
9434887Schin {
9444887Schin env->error = REG_EPAREN;
9454887Schin return T_BAD;
9464887Schin }
9474887Schin env->parno = 0;
9484887Schin env->pattern = env->cursor + 1;
9494887Schin return T_BAR;
9504887Schin }
9514887Schin if (env->flags & REG_LITERAL)
9524887Schin return c;
9534887Schin if (posixkludge = env->posixkludge)
9544887Schin {
9554887Schin env->posixkludge = 0;
9564887Schin if (c == '*')
9574887Schin return c;
9584887Schin }
9594887Schin if (c == '\\')
9604887Schin {
9614887Schin if (env->flags & REG_SHELL_ESCAPED)
9624887Schin return c;
9634887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator)
9644887Schin {
9654887Schin if (env->flags & REG_LENIENT)
9664887Schin {
9674887Schin if (c)
9684887Schin {
9694887Schin env->token.esc = env->token.len;
9704887Schin env->token.len += MBSIZE(env->cursor + 1);
9714887Schin return c;
9724887Schin }
9734887Schin return '\\';
9744887Schin }
9754887Schin env->error = REG_EESCAPE;
9764887Schin return T_BAD;
9774887Schin }
9784887Schin env->token.esc = env->token.len;
9794887Schin env->token.len += MBSIZE(env->cursor + 1);
9804887Schin if (env->delimiter && c == 'n')
9814887Schin return '\n';
9824887Schin else if (c == env->delimiter)
9834887Schin return magic(env, c, 0);
9844887Schin else if (c == '(' && env->type == BRE)
9854887Schin env->posixkludge = 1;
9864887Schin else if (c == ')' && env->type == BRE && env->parnest <= 0)
9874887Schin {
9884887Schin env->error = REG_EPAREN;
9894887Schin return T_BAD;
9904887Schin }
9914887Schin else if (isspace(c) && (env->flags & REG_COMMENT))
9924887Schin return c;
9934887Schin return magic(env, c, 1);
9944887Schin }
9954887Schin else if (c == '$')
9964887Schin {
9974887Schin if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n')
9984887Schin return T_DOLL;
9994887Schin }
10004887Schin else if (c == '^')
10014887Schin {
1002*12068SRoger.Faulkner@Oracle.COM if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1))
10034887Schin {
1004*12068SRoger.Faulkner@Oracle.COM env->posixkludge = 2;
10054887Schin return T_CFLX;
10064887Schin }
10074887Schin }
10084887Schin else if (c == ')')
10094887Schin {
10104887Schin if (env->type != BRE && env->parnest <= 0)
10114887Schin return c;
10124887Schin }
10134887Schin else if (c == '/' && env->explicit == env->mappedslash)
10144887Schin {
10154887Schin while (*(env->cursor + env->token.len) == c)
10164887Schin env->token.len++;
10174887Schin return T_SLASHPLUS;
10184887Schin }
10194887Schin return magic(env, c, 0);
10204887Schin }
10214887Schin
10224887Schin static Celt_t*
col(Celt_t * ce,int ic,unsigned char * bp,int bw,int bc,unsigned char * ep,int ew,int ec)10234887Schin col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec)
10244887Schin {
10258462SApril.Chin@Sun.COM register char* s;
10264887Schin register unsigned char* k;
10274887Schin register unsigned char* e;
10284887Schin register int c;
10294887Schin register int cc;
10304887Schin int bt;
10314887Schin int et;
10324887Schin Ckey_t key;
10334887Schin
10344887Schin cc = 0;
10354887Schin for (;;)
10364887Schin {
10374887Schin k = key;
10384887Schin if (bw == 1)
10394887Schin {
10404887Schin c = bc;
10414887Schin if (ic)
10424887Schin {
10438462SApril.Chin@Sun.COM if (isupper(c))
10448462SApril.Chin@Sun.COM {
10458462SApril.Chin@Sun.COM c = tolower(c);
10468462SApril.Chin@Sun.COM cc = -1;
10478462SApril.Chin@Sun.COM }
10488462SApril.Chin@Sun.COM else if (islower(c))
10498462SApril.Chin@Sun.COM {
10508462SApril.Chin@Sun.COM c = toupper(c);
10518462SApril.Chin@Sun.COM cc = -1;
10528462SApril.Chin@Sun.COM }
10538462SApril.Chin@Sun.COM }
10548462SApril.Chin@Sun.COM *k++ = c;
10558462SApril.Chin@Sun.COM }
10568462SApril.Chin@Sun.COM else if (bw < COLL_KEY_MAX)
10578462SApril.Chin@Sun.COM {
10588462SApril.Chin@Sun.COM s = (char*)bp;
10598462SApril.Chin@Sun.COM if (ic)
10608462SApril.Chin@Sun.COM {
10618462SApril.Chin@Sun.COM c = mbchar(s);
10624887Schin if (iswupper(c))
10634887Schin {
10644887Schin c = towlower(c);
10654887Schin cc = 1;
10664887Schin }
10674887Schin else if (iswlower(c))
10684887Schin {
10694887Schin c = towupper(c);
10704887Schin cc = 1;
10714887Schin }
10724887Schin }
10738462SApril.Chin@Sun.COM if (cc > 0)
10744887Schin {
10758462SApril.Chin@Sun.COM cc = -1;
1076*12068SRoger.Faulkner@Oracle.COM k += mbconv((char*)k, c);
10774887Schin }
10788462SApril.Chin@Sun.COM else
10798462SApril.Chin@Sun.COM for (e = k + bw; k < e; *k++ = *s++);
10804887Schin }
10814887Schin *k = 0;
10824887Schin mbxfrm(ce->beg, key, COLL_KEY_MAX);
10834887Schin if (ep)
10844887Schin {
10854887Schin k = key;
10864887Schin c = mbchar(k);
10874887Schin if (iswupper(c))
10884887Schin bt = COLL_range_uc;
10894887Schin else if (iswlower(c))
10904887Schin bt = COLL_range_lc;
10914887Schin else
10924887Schin bt = COLL_range;
10934887Schin k = key;
10944887Schin if (ew == 1)
10954887Schin {
10964887Schin c = ec;
10974887Schin if (ic)
10984887Schin {
10998462SApril.Chin@Sun.COM if (isupper(c))
11008462SApril.Chin@Sun.COM {
11018462SApril.Chin@Sun.COM c = tolower(c);
11028462SApril.Chin@Sun.COM cc = -1;
11038462SApril.Chin@Sun.COM }
11048462SApril.Chin@Sun.COM else if (islower(c))
11058462SApril.Chin@Sun.COM {
11068462SApril.Chin@Sun.COM c = toupper(c);
11078462SApril.Chin@Sun.COM cc = -1;
11088462SApril.Chin@Sun.COM }
11098462SApril.Chin@Sun.COM }
11108462SApril.Chin@Sun.COM *k++ = c;
11118462SApril.Chin@Sun.COM }
11128462SApril.Chin@Sun.COM else if (ew < COLL_KEY_MAX)
11138462SApril.Chin@Sun.COM {
11148462SApril.Chin@Sun.COM s = (char*)ep;
11158462SApril.Chin@Sun.COM if (ic)
11168462SApril.Chin@Sun.COM {
11178462SApril.Chin@Sun.COM c = mbchar(s);
11184887Schin if (iswupper(c))
11194887Schin {
11204887Schin c = towlower(c);
11214887Schin cc = 1;
11224887Schin }
11234887Schin else if (iswlower(c))
11244887Schin {
11254887Schin c = towupper(c);
11264887Schin cc = 1;
11274887Schin }
11284887Schin }
11298462SApril.Chin@Sun.COM if (cc > 0)
11304887Schin {
11318462SApril.Chin@Sun.COM cc = -1;
1132*12068SRoger.Faulkner@Oracle.COM k += mbconv((char*)k, c);
11334887Schin }
11348462SApril.Chin@Sun.COM else
11358462SApril.Chin@Sun.COM for (e = k + ew; k < e; *k++ = *s++);
11364887Schin }
11374887Schin *k = 0;
11384887Schin mbxfrm(ce->end, key, COLL_KEY_MAX);
11394887Schin k = key;
11404887Schin c = mbchar(k);
11414887Schin if (iswupper(c))
11424887Schin et = COLL_range_uc;
11434887Schin else if (iswlower(c))
11444887Schin et = COLL_range_lc;
11454887Schin else
11464887Schin et = COLL_range;
11474887Schin ce->typ = bt == et ? bt : COLL_range;
11484887Schin }
11494887Schin else
11504887Schin ce->typ = COLL_char;
11514887Schin ce++;
11524887Schin if (!ic || !cc)
11534887Schin break;
11544887Schin ic = 0;
11554887Schin }
11564887Schin return ce;
11574887Schin }
11584887Schin
11594887Schin static Rex_t*
bra(Cenv_t * env)11604887Schin bra(Cenv_t* env)
11614887Schin {
11624887Schin Rex_t* e;
11634887Schin int c;
11644887Schin int i;
11654887Schin int w;
11664887Schin int neg;
11674887Schin int last;
11684887Schin int inrange;
11694887Schin int complicated;
11704887Schin int collate;
11714887Schin int elements;
11724887Schin unsigned char* first;
11734887Schin unsigned char* start;
11744887Schin unsigned char* begin;
11754887Schin unsigned char* s;
11764887Schin regclass_t f;
11774887Schin unsigned char buf[4 * (COLL_KEY_MAX + 1)];
11784887Schin #if _PACKAGE_ast
11794887Schin int ic;
11804887Schin char mbc[COLL_KEY_MAX + 1];
11814887Schin #endif
11824887Schin
11834887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
11844887Schin return 0;
11854887Schin collate = complicated = elements = 0;
11864887Schin if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!')
11874887Schin {
11884887Schin env->cursor++;
11894887Schin neg = 1;
11904887Schin }
11914887Schin else
11924887Schin neg = 0;
11938462SApril.Chin@Sun.COM first = env->cursor;
11948462SApril.Chin@Sun.COM start = first + MBSIZE(first);
11954887Schin if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter))
11964887Schin goto error;
11974887Schin begin = env->cursor + MBSIZE(env->cursor);
11984887Schin
11994887Schin /*
12004887Schin * inrange: 0=no, 1=possibly, 2=definitely
12014887Schin */
12024887Schin
12034887Schin inrange = 0;
12044887Schin for (;;)
12054887Schin {
1206*12068SRoger.Faulkner@Oracle.COM if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE))
12074887Schin goto error;
12084887Schin env->cursor += (w = MBSIZE(env->cursor));
1209*12068SRoger.Faulkner@Oracle.COM if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
12104887Schin {
12114887Schin if (*env->cursor)
12124887Schin {
12134887Schin if (*env->cursor == 'n')
12144887Schin {
12154887Schin env->cursor++;
12164887Schin c = '\n';
12174887Schin }
12184887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
12194887Schin {
12204887Schin env->token.len = 1;
12214887Schin w = magic(env, *env->cursor, 2);
12224887Schin if (env->token.len > 1 || w != T_BAD)
12234887Schin {
12244887Schin if (env->token.len == 1 && (f = classfun(w)))
12254887Schin {
12264887Schin if (inrange > 1)
12274887Schin {
12284887Schin if (env->type < SRE && !(env->flags & REG_LENIENT))
12294887Schin goto erange;
12304887Schin inrange = 0;
12314887Schin }
12324887Schin env->cursor++;
12334887Schin for (c = 0; c <= UCHAR_MAX; c++)
12344887Schin if ((*f)(c))
12354887Schin setadd(e->re.charclass, c);
12364887Schin complicated++;
12374887Schin elements++;
12384887Schin continue;
12394887Schin }
12404887Schin if (env->token.len > 1 || w >= 0 && w < T_META)
12414887Schin {
12424887Schin c = w;
12434887Schin if (c > UCHAR_MAX)
12444887Schin {
12454887Schin if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide())
12464887Schin goto erange;
12474887Schin c = UCHAR_MAX;
12484887Schin }
12494887Schin env->cursor += env->token.len;
12504887Schin }
12514887Schin }
12524887Schin }
12534887Schin }
12544887Schin }
12554887Schin else if (c == ']')
12564887Schin {
12574887Schin if (env->cursor == begin)
12584887Schin {
12594887Schin last = c;
12604887Schin inrange = 1;
12614887Schin continue;
12624887Schin }
12634887Schin if (inrange != 0)
12644887Schin {
12654887Schin setadd(e->re.charclass, last);
12664887Schin elements++;
12674887Schin if (inrange == 2)
12684887Schin {
12694887Schin setadd(e->re.charclass, '-');
12704887Schin elements++;
12714887Schin }
12724887Schin }
12734887Schin break;
12744887Schin }
12754887Schin else if (c == '-')
12764887Schin {
12774887Schin if (!inrange && env->cursor != begin && *env->cursor != ']')
12784887Schin {
12794887Schin if (env->type < SRE && !(env->flags & REG_LENIENT))
12804887Schin goto erange;
12814887Schin continue;
12824887Schin }
12834887Schin else if (inrange == 1)
12844887Schin {
12854887Schin inrange = 2;
12864887Schin complicated++;
12874887Schin continue;
12884887Schin }
12894887Schin }
12904887Schin else if (c == '[')
12914887Schin {
12924887Schin switch (*env->cursor)
12934887Schin {
12944887Schin case 0:
12954887Schin goto error;
12964887Schin case ':':
129710898Sroland.mainz@nrubsig.org if (env->regexp)
129810898Sroland.mainz@nrubsig.org goto normal;
12994887Schin if (inrange == 1)
13004887Schin {
13014887Schin setadd(e->re.charclass, last);
13024887Schin elements++;
13034887Schin }
13044887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
13054887Schin {
13064887Schin if (env->cursor == start && (c = *(env->cursor + 1)))
13074887Schin {
13084887Schin s = start = env->cursor + 1;
13094887Schin while (*++s && *s != ':');
13104887Schin if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']')
13114887Schin {
13124887Schin if ((i = (s - start)) == 1)
13134887Schin {
13144887Schin switch (c)
13154887Schin {
13164887Schin case '<':
13174887Schin i = REX_WBEG;
13184887Schin break;
13194887Schin case '>':
13204887Schin i = REX_WEND;
13214887Schin break;
13224887Schin default:
13234887Schin i = 0;
13244887Schin break;
13254887Schin }
13264887Schin if (i)
13274887Schin {
13284887Schin env->cursor = s + 3;
13294887Schin drop(env->disc, e);
13304887Schin return node(env, i, 0, 0, 0);
13314887Schin }
13324887Schin }
13334887Schin }
13344887Schin }
13354887Schin env->error = REG_ECTYPE;
13364887Schin goto error;
13374887Schin }
13384887Schin for (c = 0; c <= UCHAR_MAX; c++)
13394887Schin if ((*f)(c))
13404887Schin setadd(e->re.charclass, c);
13414887Schin inrange = 0;
13424887Schin complicated++;
13434887Schin elements++;
13444887Schin continue;
13454887Schin case '=':
134610898Sroland.mainz@nrubsig.org if (env->regexp)
134710898Sroland.mainz@nrubsig.org goto normal;
13484887Schin if (inrange == 2)
13494887Schin goto erange;
13504887Schin if (inrange == 1)
13514887Schin {
13524887Schin setadd(e->re.charclass, last);
13534887Schin elements++;
13544887Schin }
13554887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
13564887Schin goto ecollate;
13574887Schin if (c > 1)
13584887Schin collate++;
13594887Schin else
13604887Schin setadd(e->re.charclass, buf[0]);
13614887Schin c = buf[0];
13624887Schin inrange = 0;
13634887Schin complicated++;
13644887Schin elements++;
13654887Schin continue;
13664887Schin case '.':
136710898Sroland.mainz@nrubsig.org if (env->regexp)
136810898Sroland.mainz@nrubsig.org goto normal;
13694887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
13704887Schin goto ecollate;
13714887Schin if (c > 1)
13724887Schin collate++;
13734887Schin c = buf[0];
13744887Schin complicated++;
13754887Schin break;
13764887Schin default:
137710898Sroland.mainz@nrubsig.org normal:
13784887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
13794887Schin goto error;
13804887Schin break;
13814887Schin }
13824887Schin }
13834887Schin else if (w > 1)
13844887Schin complicated++;
13854887Schin if (inrange == 2)
13864887Schin {
1387*12068SRoger.Faulkner@Oracle.COM if (last <= c)
13884887Schin {
1389*12068SRoger.Faulkner@Oracle.COM for (i = last; i <= c; i++)
1390*12068SRoger.Faulkner@Oracle.COM setadd(e->re.charclass, i);
1391*12068SRoger.Faulkner@Oracle.COM inrange = env->type >= SRE || (env->flags & REG_LENIENT);
1392*12068SRoger.Faulkner@Oracle.COM elements += 2;
1393*12068SRoger.Faulkner@Oracle.COM }
1394*12068SRoger.Faulkner@Oracle.COM else if (env->type >= SRE)
1395*12068SRoger.Faulkner@Oracle.COM {
13964887Schin setadd(e->re.charclass, last);
13974887Schin setadd(e->re.charclass, c);
1398*12068SRoger.Faulkner@Oracle.COM elements += 2;
1399*12068SRoger.Faulkner@Oracle.COM inrange = 1;
14004887Schin }
1401*12068SRoger.Faulkner@Oracle.COM else if (!(env->flags & REG_LENIENT))
1402*12068SRoger.Faulkner@Oracle.COM goto erange;
14034887Schin else
1404*12068SRoger.Faulkner@Oracle.COM inrange = 0;
14054887Schin }
14064887Schin else if (inrange == 1)
14074887Schin {
14084887Schin setadd(e->re.charclass, last);
14094887Schin elements++;
14104887Schin }
14114887Schin else
14124887Schin inrange = 1;
14134887Schin last = c;
14144887Schin }
14154887Schin #if _PACKAGE_ast
14164887Schin if (complicated && mbcoll())
14174887Schin {
14184887Schin Dt_t* dt;
14194887Schin Cchr_t* cc;
14204887Schin Cchr_t* tc;
14214887Schin Cchr_t* xc;
14224887Schin Celt_t* ce;
14234887Schin Cchr_t key;
14244887Schin int rw;
14254887Schin int rc;
14268462SApril.Chin@Sun.COM int wc;
14274887Schin unsigned char* rp;
14284887Schin unsigned char* pp;
14298462SApril.Chin@Sun.COM char* wp;
14308462SApril.Chin@Sun.COM char cb[2][COLL_KEY_MAX+1];
14314887Schin
14324887Schin static Dtdisc_t disc;
14334887Schin
14344887Schin static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
14354887Schin
14364887Schin if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data))
14374887Schin {
14384887Schin disc.key = offsetof(Cchr_t, key);
14394887Schin if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree)))
14404887Schin {
14414887Schin for (i = 0; i < elementsof(primary) - 1; i++, cc++)
14424887Schin {
14434887Schin cc->nam[0] = primary[i];
14444887Schin mbxfrm(cc->key, cc->nam, COLL_KEY_MAX);
14454887Schin dtinsert(dt, cc);
14464887Schin }
14474887Schin for (i = 0; i < elementsof(cc->key); i++)
14484887Schin cc->key[i] = ~0;
14494887Schin dtinsert(dt, cc);
14504887Schin LCINFO(AST_LC_COLLATE)->data = (void*)dt;
14514887Schin }
14524887Schin else
14534887Schin {
14544887Schin if (cc)
14554887Schin free(cc);
14564887Schin drop(env->disc, e);
14574887Schin return 0;
14584887Schin }
14594887Schin }
14604887Schin if (dt)
14614887Schin {
14624887Schin drop(env->disc, e);
14634887Schin if (ic = env->flags & REG_ICASE)
14644887Schin elements *= 2;
14658462SApril.Chin@Sun.COM if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t))))
14664887Schin return 0;
14674887Schin ce = (Celt_t*)e->re.data;
14684887Schin e->re.collate.invert = neg;
14694887Schin e->re.collate.elements = ce;
14704887Schin env->cursor = first;
14714887Schin inrange = 0;
14724887Schin for (;;)
14734887Schin {
14744887Schin if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter)
14754887Schin goto error;
14764887Schin pp = env->cursor;
14774887Schin env->cursor += (w = MBSIZE(env->cursor));
1478*12068SRoger.Faulkner@Oracle.COM if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
14794887Schin {
14804887Schin if (*env->cursor)
14814887Schin {
14824887Schin if (*env->cursor == 'n')
14834887Schin {
14844887Schin pp = env->cursor++;
14854887Schin c = '\n';
14864887Schin }
14874887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
14884887Schin {
14894887Schin env->token.len = 1;
14904887Schin w = magic(env, *env->cursor, 2);
14914887Schin if (env->token.len > 1 || w != T_BAD)
14924887Schin {
14934887Schin if (env->token.len == 1 && (f = classfun(w)))
14944887Schin {
14954887Schin if (inrange > 1)
14964887Schin {
14974887Schin if (env->type < SRE && !(env->flags & REG_LENIENT))
14984887Schin goto erange;
14994887Schin inrange = 0;
15004887Schin }
15014887Schin env->cursor++;
15024887Schin ce->fun = f;
15034887Schin ce->typ = COLL_call;
15044887Schin ce++;
15054887Schin continue;
15064887Schin }
15074887Schin if (env->token.len > 1 || w >= 0 && w < T_META)
15084887Schin {
15094887Schin c = w;
1510*12068SRoger.Faulkner@Oracle.COM w = mbconv(mbc, c);
15114887Schin pp = (unsigned char*)mbc;
15124887Schin env->cursor += env->token.len;
15134887Schin }
15144887Schin }
15154887Schin }
15164887Schin }
15174887Schin }
15184887Schin else if (c == ']')
15194887Schin {
15204887Schin if (env->cursor == begin)
15214887Schin {
15224887Schin rp = pp;
15234887Schin rw = w;
15244887Schin inrange = 1;
15254887Schin continue;
15264887Schin }
15274887Schin if (inrange != 0)
15284887Schin {
15294887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
15304887Schin if (inrange == 2)
15314887Schin ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0);
15324887Schin }
15334887Schin break;
15344887Schin }
15354887Schin else if (c == '-')
15364887Schin {
15374887Schin if (!inrange && env->cursor != begin && *env->cursor != ']')
15384887Schin {
15394887Schin if (env->type < SRE && !(env->flags & REG_LENIENT))
15404887Schin goto erange;
15414887Schin continue;
15424887Schin }
15434887Schin else if (inrange == 1)
15444887Schin {
15454887Schin inrange = 2;
15464887Schin continue;
15474887Schin }
15484887Schin }
15494887Schin else if (c == '[')
15504887Schin {
15514887Schin switch (*env->cursor)
15524887Schin {
15534887Schin case 0:
15544887Schin goto error;
15554887Schin case ':':
155610898Sroland.mainz@nrubsig.org if (env->regexp)
155710898Sroland.mainz@nrubsig.org goto complicated_normal;
15584887Schin if (inrange == 1)
15594887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
15604887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
15614887Schin {
15624887Schin if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']')
15634887Schin {
15644887Schin switch (c)
15654887Schin {
15664887Schin case '<':
15674887Schin i = REX_WBEG;
15684887Schin break;
15694887Schin case '>':
15704887Schin i = REX_WEND;
15714887Schin break;
15724887Schin default:
15734887Schin i = 0;
15744887Schin break;
15754887Schin }
15764887Schin if (i)
15774887Schin {
15784887Schin env->cursor += 5;
15794887Schin drop(env->disc, e);
15804887Schin return node(env, i, 0, 0, 0);
15814887Schin }
15824887Schin }
15834887Schin env->error = REG_ECTYPE;
15844887Schin goto error;
15854887Schin }
15864887Schin ce->fun = f;
15874887Schin ce->typ = COLL_call;
15884887Schin ce++;
15894887Schin inrange = 0;
15904887Schin continue;
15914887Schin case '=':
159210898Sroland.mainz@nrubsig.org if (env->regexp)
159310898Sroland.mainz@nrubsig.org goto complicated_normal;
15944887Schin if (inrange == 2)
15954887Schin goto erange;
15964887Schin if (inrange == 1)
15974887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
15988462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange];
15994887Schin rp = env->cursor + 1;
16008462SApril.Chin@Sun.COM if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
16014887Schin goto ecollate;
16028462SApril.Chin@Sun.COM wp = (char*)pp;
16038462SApril.Chin@Sun.COM wc = mbchar(wp);
16044887Schin c = 0;
16054887Schin if (ic)
16068462SApril.Chin@Sun.COM {
16078462SApril.Chin@Sun.COM if (iswupper(wc))
16088462SApril.Chin@Sun.COM {
16098462SApril.Chin@Sun.COM wc = towlower(wc);
1610*12068SRoger.Faulkner@Oracle.COM rw = mbconv((char*)pp, wc);
16118462SApril.Chin@Sun.COM c = 'u';
16128462SApril.Chin@Sun.COM }
16138462SApril.Chin@Sun.COM else if (iswlower(wc))
16148462SApril.Chin@Sun.COM c = 'l';
16158462SApril.Chin@Sun.COM }
16164887Schin for (;;)
16174887Schin {
16188462SApril.Chin@Sun.COM mbxfrm(key.key, (char*)pp, COLL_KEY_MAX);
16198462SApril.Chin@Sun.COM if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key)))
16208462SApril.Chin@Sun.COM goto ecollate;
16218462SApril.Chin@Sun.COM xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc;
16228462SApril.Chin@Sun.COM if (c == 'l' || c == 'L' && !(c = 0))
16234887Schin ce->typ = COLL_range_lc;
16248462SApril.Chin@Sun.COM else if (c == 'u' || c == 'U' && !(c = 0))
16254887Schin ce->typ = COLL_range_uc;
16264887Schin else
16274887Schin ce->typ = COLL_range;
16284887Schin strcpy((char*)ce->beg, (char*)xc->key);
16294887Schin if (!(cc = (Cchr_t*)dtnext(dt, cc)))
16304887Schin goto ecollate;
16314887Schin if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc)))
16324887Schin cc = tc;
16334887Schin strcpy((char*)ce->end, (char*)cc->key);
16344887Schin ce->max = -1;
16354887Schin ce++;
16364887Schin if (!c)
16374887Schin break;
16388462SApril.Chin@Sun.COM if (c == 'u')
16398462SApril.Chin@Sun.COM {
16408462SApril.Chin@Sun.COM wc = towlower(wc);
16418462SApril.Chin@Sun.COM c = 'L';
16428462SApril.Chin@Sun.COM }
16438462SApril.Chin@Sun.COM else
16448462SApril.Chin@Sun.COM {
16458462SApril.Chin@Sun.COM wc = towupper(wc);
16468462SApril.Chin@Sun.COM c = 'U';
16478462SApril.Chin@Sun.COM }
1648*12068SRoger.Faulkner@Oracle.COM rw = mbconv((char*)pp, wc);
16494887Schin }
16504887Schin inrange = 0;
16518462SApril.Chin@Sun.COM c = *pp;
16524887Schin continue;
16534887Schin case '.':
165410898Sroland.mainz@nrubsig.org if (env->regexp)
165510898Sroland.mainz@nrubsig.org goto complicated_normal;
16568462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange];
16578462SApril.Chin@Sun.COM if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
16584887Schin goto ecollate;
16594887Schin c = buf[0];
16604887Schin break;
16614887Schin default:
166210898Sroland.mainz@nrubsig.org complicated_normal:
16634887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
16644887Schin goto error;
16654887Schin break;
16664887Schin }
16674887Schin }
16684887Schin if (inrange == 2)
16694887Schin {
16704887Schin ce = col(ce, ic, rp, rw, rc, pp, w, c);
16714887Schin if (strcmp((char*)ce->beg, (char*)ce->end) > 0)
16724887Schin {
16734887Schin if (env->type < SRE && !(env->flags & REG_LENIENT))
16744887Schin goto erange;
16754887Schin (ce-1)->typ = COLL_char;
16764887Schin strcpy((char*)ce->beg, (char*)(ce-1)->end);
16774887Schin ce->typ = COLL_char;
16784887Schin ce++;
16794887Schin }
16804887Schin inrange = env->type >= SRE || (env->flags & REG_LENIENT);
16814887Schin }
16824887Schin else if (inrange == 1)
16834887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
16844887Schin else
16854887Schin inrange = 1;
16864887Schin rp = pp;
16874887Schin rw = w;
16884887Schin rc = c;
16894887Schin }
16904887Schin ce->typ = COLL_end;
16914887Schin return e;
16924887Schin }
16934887Schin }
16944887Schin #endif
16954887Schin if (collate)
16964887Schin goto ecollate;
16974887Schin if (env->flags & REG_ICASE)
16984887Schin for (i = 0; i <= UCHAR_MAX; i++)
16994887Schin if (settst(e->re.charclass, i))
17004887Schin {
17014887Schin if (isupper(i))
17024887Schin c = tolower(i);
17034887Schin else if (islower(i))
17044887Schin c = toupper(i);
17054887Schin else
17064887Schin continue;
17074887Schin setadd(e->re.charclass, c);
17084887Schin }
17094887Schin if (neg)
17104887Schin {
17114887Schin for (i = 0; i < elementsof(e->re.charclass->bits); i++)
17124887Schin e->re.charclass->bits[i] ^= ~0;
17134887Schin if (env->explicit >= 0)
17144887Schin setclr(e->re.charclass, env->explicit);
17154887Schin }
17164887Schin return e;
17174887Schin ecollate:
17184887Schin env->error = REG_ECOLLATE;
17194887Schin goto error;
17204887Schin erange:
17214887Schin env->error = REG_ERANGE;
17224887Schin error:
17234887Schin drop(env->disc, e);
17244887Schin if (!env->error)
17254887Schin env->error = REG_EBRACK;
17264887Schin return 0;
17274887Schin }
17284887Schin
17294887Schin static Rex_t*
ccl(Cenv_t * env,int type)17304887Schin ccl(Cenv_t* env, int type)
17314887Schin {
17324887Schin int i;
17334887Schin Rex_t* e;
17344887Schin Celt_t* ce;
17354887Schin regclass_t f;
17364887Schin
17374887Schin if (!(f = classfun(type)))
17384887Schin {
17394887Schin env->error = REG_BADESC;
17404887Schin return 0;
17414887Schin }
17424887Schin if (!mbcoll())
17434887Schin {
17444887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
17454887Schin return 0;
17464887Schin for (i = 0; i <= UCHAR_MAX; i++)
17474887Schin if ((*f)(i))
17484887Schin setadd(e->re.charclass, i);
17494887Schin if (env->explicit >= 0)
17504887Schin setclr(e->re.charclass, env->explicit);
17514887Schin }
17524887Schin else
17534887Schin {
17544887Schin if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t))))
17554887Schin return 0;
17564887Schin ce = (Celt_t*)e->re.data;
17574887Schin e->re.collate.invert = 0;
17584887Schin e->re.collate.elements = ce;
17598462SApril.Chin@Sun.COM ce->fun = f;
17604887Schin ce->typ = COLL_call;
17614887Schin ce++;
17624887Schin ce->typ = COLL_end;
17634887Schin }
17644887Schin return e;
17654887Schin }
17664887Schin
17674887Schin static Rex_t*
rep(Cenv_t * env,Rex_t * e,int number,int last)17684887Schin rep(Cenv_t* env, Rex_t* e, int number, int last)
17694887Schin {
17704887Schin Rex_t* f;
17714887Schin unsigned long m = 0;
17724887Schin unsigned long n = RE_DUP_INF;
17734887Schin int minimal = -1;
17744887Schin
17754887Schin if (!e)
17764887Schin return 0;
17774887Schin switch (token(env))
17784887Schin {
17794887Schin case T_BANG:
17804887Schin eat(env);
17814887Schin if (!(f = node(env, REX_NEG, m, n, 0)))
17824887Schin {
17834887Schin drop(env->disc, e);
17844887Schin return 0;
17854887Schin }
17864887Schin f->re.group.expr.rex = e;
17874887Schin return f;
17884887Schin case T_QUES:
17894887Schin eat(env);
17904887Schin n = 1;
17914887Schin break;
17924887Schin case T_STAR:
17934887Schin eat(env);
17944887Schin break;
17954887Schin case T_PLUS:
17964887Schin eat(env);
17974887Schin m = 1;
17984887Schin break;
17994887Schin case T_LEFT:
18004887Schin eat(env);
18014887Schin m = env->token.min;
18024887Schin n = env->token.max;
18034887Schin break;
18044887Schin default:
18054887Schin return e;
18064887Schin }
18074887Schin if (env->token.att)
18084887Schin minimal = 1;
18094887Schin else if (env->type < SRE)
18104887Schin switch (token(env))
18114887Schin {
18124887Schin case T_QUES:
18134887Schin eat(env);
18144887Schin minimal = !(env->flags & REG_MINIMAL);
18154887Schin break;
18164887Schin case T_STAR: /*AST*/
18174887Schin eat(env);
18184887Schin minimal = !!(env->flags & REG_MINIMAL);
18194887Schin break;
18204887Schin }
18214887Schin switch (e->type)
18224887Schin {
18234887Schin case REX_DOT:
18244887Schin case REX_CLASS:
18254887Schin case REX_COLL_CLASS:
18264887Schin case REX_ONECHAR:
18274887Schin e->lo = m;
18284887Schin e->hi = n;
18294887Schin if (minimal >= 0)
18304887Schin mark(e, minimal);
18314887Schin return e;
18324887Schin #if HUH_2002_08_07
18334887Schin case REX_BEG:
18344887Schin #endif
18354887Schin case REX_BEG_STR:
18364887Schin case REX_END_STR:
18374887Schin case REX_FIN_STR:
18384887Schin case REX_WBEG:
18394887Schin case REX_WEND:
18404887Schin case REX_WORD:
18414887Schin case REX_WORD_NOT:
18424887Schin env->error = REG_BADRPT;
18434887Schin drop(env->disc, e);
18444887Schin return 0;
18454887Schin }
18464887Schin if (m == 1 && n == 1)
18474887Schin {
18484887Schin if (minimal >= 0)
18494887Schin mark(e, minimal);
18504887Schin return e;
18514887Schin }
18524887Schin if (!(f = node(env, REX_REP, m, n, 0)))
18534887Schin {
18544887Schin drop(env->disc, e);
18554887Schin return 0;
18564887Schin }
18574887Schin f->re.group.expr.rex = e;
18584887Schin f->re.group.number = number;
18594887Schin f->re.group.last = last;
18604887Schin if (minimal >= 0)
18614887Schin mark(f, minimal);
18624887Schin if (m <= n && n)
18634887Schin {
18644887Schin for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex);
18654887Schin if (e && e->type == REX_NEG)
18664887Schin f->type = REX_GROUP;
18674887Schin }
18684887Schin return f;
18694887Schin }
18704887Schin
18714887Schin static int
isstring(Rex_t * e)18724887Schin isstring(Rex_t* e)
18734887Schin {
18744887Schin switch (e->type)
18754887Schin {
18764887Schin case REX_ONECHAR:
18774887Schin return e->lo == 1 && e->hi == 1;
18784887Schin case REX_STRING:
18794887Schin return 1;
18804887Schin }
18814887Schin return 0;
18824887Schin }
18834887Schin
18844887Schin static Trie_node_t*
trienode(Cenv_t * env,int c)18854887Schin trienode(Cenv_t* env, int c)
18864887Schin {
18874887Schin Trie_node_t* t;
18884887Schin
18894887Schin if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t)))
18904887Schin {
18914887Schin memset(t, 0, sizeof(Trie_node_t));
18924887Schin t->c = c;
18934887Schin }
18944887Schin return t;
18954887Schin }
18964887Schin
18974887Schin static int
insert(Cenv_t * env,Rex_t * f,Rex_t * g)18984887Schin insert(Cenv_t* env, Rex_t* f, Rex_t* g)
18994887Schin {
19004887Schin unsigned char* s;
19014887Schin unsigned char* e;
19024887Schin Trie_node_t* t;
19034887Schin int len;
19044887Schin unsigned char tmp[2];
19054887Schin
19064887Schin switch (f->type)
19074887Schin {
19084887Schin case REX_ONECHAR:
19094887Schin *(s = tmp) = f->re.onechar;
19104887Schin e = s + 1;
19114887Schin break;
19124887Schin case REX_STRING:
19134887Schin s = f->re.string.base;
19144887Schin e = s + f->re.string.size;
19154887Schin break;
19164887Schin default:
19174887Schin return 1;
19184887Schin }
19194887Schin if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s)))
19204887Schin return 1;
19214887Schin for (len = 1;;)
19224887Schin {
19234887Schin if (t->c == *s)
19244887Schin {
19254887Schin if (++s >= e)
19264887Schin break;
19274887Schin if (!t->son && !(t->son = trienode(env, *s)))
19284887Schin return 1;
19294887Schin t = t->son;
19304887Schin len++;
19314887Schin }
19324887Schin else
19334887Schin {
19344887Schin if (!t->sib && !(t->sib = trienode(env, *s)))
19354887Schin return 1;
19364887Schin t = t->sib;
19374887Schin }
19384887Schin }
19394887Schin if (g->re.trie.min > len)
19404887Schin g->re.trie.min = len;
19414887Schin if (g->re.trie.max < len)
19424887Schin g->re.trie.max = len;
19434887Schin t->end = 1;
19444887Schin return 0;
19454887Schin }
19464887Schin
19474887Schin /*
19484887Schin * trie() tries to combine nontrivial e and f into a REX_TRIE
19494887Schin * unless 0 is returned, e and f are deleted as far as possible
19504887Schin * also called by regcomb
19514887Schin */
19524887Schin
19534887Schin static Rex_t*
trie(Cenv_t * env,Rex_t * e,Rex_t * f)19544887Schin trie(Cenv_t* env, Rex_t* e, Rex_t* f)
19554887Schin {
19564887Schin Rex_t* g;
19574887Schin
19584887Schin if (e->next || f->next || !isstring(e) || e->flags != f->flags)
19594887Schin return 0;
19604887Schin if (isstring(f))
19614887Schin {
19624887Schin if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*))))
19634887Schin return 0;
19644887Schin g->re.trie.min = INT_MAX;
19654887Schin if (insert(env, f, g))
19664887Schin goto nospace;
19674887Schin drop(env->disc, f);
19684887Schin }
19694887Schin else if (f->type != REX_TRIE)
19704887Schin return 0;
19714887Schin else
19724887Schin g = f;
19734887Schin if (insert(env, e, g))
19744887Schin goto nospace;
19754887Schin drop(env->disc, e);
19764887Schin return g;
19774887Schin nospace:
19784887Schin if (g != f)
19794887Schin drop(env->disc, g);
19804887Schin return 0;
19814887Schin }
19824887Schin
19834887Schin static Rex_t* alt(Cenv_t*, int, int);
19844887Schin
19854887Schin static int
chr(register Cenv_t * env,int * escaped)19864887Schin chr(register Cenv_t* env, int* escaped)
19874887Schin {
19884887Schin unsigned char* p;
19894887Schin int c;
19904887Schin
19914887Schin *escaped = 0;
19924887Schin if (!(c = *env->cursor))
19934887Schin return -1;
19944887Schin env->cursor++;
19954887Schin if (c == '\\')
19964887Schin {
19974887Schin if (env->flags & REG_SHELL_ESCAPED)
19984887Schin return c;
19994887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator)
20004887Schin {
20014887Schin if (env->flags & REG_LENIENT)
20024887Schin return c ? c : '\\';
20034887Schin env->error = REG_EESCAPE;
20044887Schin return -1;
20054887Schin }
20064887Schin p = env->cursor;
20074887Schin c = chresc((char*)env->cursor - 1, (char**)&env->cursor);
20084887Schin *escaped = env->cursor - p;
20094887Schin }
20104887Schin return c;
20114887Schin }
20124887Schin
20134887Schin /*
20144887Schin * open the perly gates
20154887Schin */
20164887Schin
20174887Schin static Rex_t*
grp(Cenv_t * env,int parno)20184887Schin grp(Cenv_t* env, int parno)
20194887Schin {
20204887Schin Rex_t* e;
20214887Schin Rex_t* f;
20224887Schin int c;
20234887Schin int i;
20244887Schin int n;
20254887Schin int x;
20264887Schin int esc;
20274887Schin int typ;
20284887Schin int beg;
20294887Schin unsigned char* p;
20304887Schin
20314887Schin beg = env->pattern == env->cursor - env->token.len;
20324887Schin if (!(c = env->token.lex) && (c = *env->cursor))
20334887Schin env->cursor++;
20344887Schin env->token.len = 0;
20354887Schin env->parnest++;
20364887Schin typ = -1;
20374887Schin switch (c)
20384887Schin {
20394887Schin case '-':
20404887Schin case '+':
20414887Schin case 'a':
20424887Schin case 'g':
20434887Schin case 'i':
20444887Schin case 'l':
20454887Schin case 'm':
20464887Schin case 'p':
20474887Schin case 'r':
20484887Schin case 's':
20494887Schin case 'x':
20504887Schin case 'A':
20514887Schin case 'B':
20524887Schin case 'E':
20534887Schin case 'F':
20544887Schin case 'G':
20554887Schin case 'I':
20564887Schin case 'K':
20578462SApril.Chin@Sun.COM case 'L':
20588462SApril.Chin@Sun.COM case 'M': /* glob(3) */
20598462SApril.Chin@Sun.COM case 'N': /* glob(3) */
20608462SApril.Chin@Sun.COM case 'O': /* glob(3) */
2061*12068SRoger.Faulkner@Oracle.COM case 'P':
20628462SApril.Chin@Sun.COM case 'R': /* pcre */
20634887Schin case 'S':
20644887Schin case 'U': /* pcre */
20654887Schin case 'X': /* pcre */
20664887Schin x = REX_GROUP;
20674887Schin i = 1;
20684887Schin env->token.push = 1;
20694887Schin for (;;)
20704887Schin {
20714887Schin switch (c)
20724887Schin {
20738462SApril.Chin@Sun.COM case ')':
20748462SApril.Chin@Sun.COM if (!(env->flags & REG_LITERAL))
20758462SApril.Chin@Sun.COM {
20768462SApril.Chin@Sun.COM env->error = REG_BADRPT;
20778462SApril.Chin@Sun.COM return 0;
20788462SApril.Chin@Sun.COM }
20798462SApril.Chin@Sun.COM /*FALLTHROUGH*/
20804887Schin case 0:
20814887Schin case T_CLOSE:
20824887Schin x = 0;
20834887Schin goto done;
20844887Schin case ':':
20854887Schin eat(env);
20864887Schin if (token(env) == T_CLOSE)
20874887Schin x = 0;
20884887Schin goto done;
20894887Schin case '-':
20904887Schin i = 0;
20914887Schin break;
20924887Schin case '+':
20934887Schin i = 1;
20944887Schin break;
20954887Schin case 'a':
20964887Schin if (i)
20974887Schin env->flags |= (REG_LEFT|REG_RIGHT);
20984887Schin else
20994887Schin env->flags &= ~(REG_LEFT|REG_RIGHT);
21004887Schin break;
21014887Schin case 'g':
21024887Schin if (i)
21034887Schin env->flags &= ~REG_MINIMAL;
21044887Schin else
21054887Schin env->flags |= REG_MINIMAL;
21064887Schin break;
21074887Schin case 'i':
21084887Schin if (i)
21094887Schin env->flags |= REG_ICASE;
21104887Schin else
21114887Schin env->flags &= ~REG_ICASE;
21124887Schin break;
21134887Schin case 'l':
21144887Schin if (i)
21154887Schin env->flags |= REG_LEFT;
21164887Schin else
21174887Schin env->flags &= ~REG_LEFT;
21184887Schin break;
21194887Schin case 'm':
21204887Schin if (i)
21214887Schin env->flags |= REG_NEWLINE;
21224887Schin else
21234887Schin env->flags &= ~REG_NEWLINE;
21244887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
21254887Schin break;
21264887Schin case 'p':
21274887Schin if (i)
21284887Schin env->flags &= ~REG_LENIENT;
21294887Schin else
21304887Schin env->flags |= REG_LENIENT;
21314887Schin break;
21324887Schin case 'r':
21334887Schin if (i)
21344887Schin env->flags |= REG_RIGHT;
21354887Schin else
21364887Schin env->flags &= ~REG_RIGHT;
21374887Schin break;
21384887Schin case 's':
21394887Schin if (i)
21404887Schin env->flags |= REG_SPAN;
21414887Schin else
21424887Schin env->flags &= ~REG_SPAN;
21434887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
21444887Schin break;
21454887Schin case 'x':
21464887Schin if (i)
21474887Schin env->flags |= REG_COMMENT;
21484887Schin else
21494887Schin env->flags &= ~REG_COMMENT;
21504887Schin break;
21514887Schin case 'A':
2152*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
21534887Schin env->flags |= REG_AUGMENTED|REG_EXTENDED;
21544887Schin typ = ARE;
21554887Schin break;
21564887Schin case 'B':
21574887Schin case 'G':
2158*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
21594887Schin typ = BRE;
21604887Schin break;
21614887Schin case 'E':
2162*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
21634887Schin env->flags |= REG_EXTENDED;
21644887Schin typ = ERE;
21654887Schin break;
21664887Schin case 'F':
21674887Schin case 'L':
2168*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
21694887Schin env->flags |= REG_LITERAL;
21704887Schin typ = ERE;
21714887Schin break;
21724887Schin case 'K':
2173*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
21744887Schin env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT;
21754887Schin typ = KRE;
21764887Schin break;
21774887Schin case 'M':
21784887Schin /* used by caller to disable glob(3) GLOB_BRACE */
21794887Schin break;
21804887Schin case 'N':
21814887Schin /* used by caller to disable glob(3) GLOB_NOCHECK */
21824887Schin break;
21838462SApril.Chin@Sun.COM case 'O':
21844887Schin /* used by caller to disable glob(3) GLOB_STARSTAR */
21854887Schin break;
2186*12068SRoger.Faulkner@Oracle.COM case 'P':
2187*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2188*12068SRoger.Faulkner@Oracle.COM env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE;
2189*12068SRoger.Faulkner@Oracle.COM typ = ERE;
2190*12068SRoger.Faulkner@Oracle.COM break;
21914887Schin case 'S':
2192*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
21934887Schin env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT;
21944887Schin typ = SRE;
21954887Schin break;
21964887Schin case 'U': /* PCRE_UNGREEDY */
21974887Schin if (i)
21984887Schin env->flags |= REG_MINIMAL;
21994887Schin else
22004887Schin env->flags &= ~REG_MINIMAL;
22014887Schin break;
22024887Schin case 'X': /* PCRE_EXTRA */
22034887Schin break;
22044887Schin default:
22054887Schin env->error = REG_BADRPT;
22064887Schin return 0;
22074887Schin }
22084887Schin eat(env);
22094887Schin c = token(env);
22104887Schin }
22114887Schin done:
22124887Schin break;
22134887Schin case ':':
22144887Schin x = REX_GROUP;
22154887Schin break;
22164887Schin case '=':
22174887Schin x = REX_GROUP_AHEAD;
22184887Schin break;
22194887Schin case '!':
22204887Schin x = REX_GROUP_AHEAD_NOT;
22214887Schin break;
22224887Schin case '<':
22234887Schin switch (token(env))
22244887Schin {
22254887Schin case '=':
22264887Schin x = REX_GROUP_BEHIND;
22274887Schin break;
22284887Schin case '!':
22294887Schin case T_BANG:
22304887Schin x = REX_GROUP_BEHIND_NOT;
22314887Schin break;
22324887Schin default:
22334887Schin env->error = REG_BADRPT;
22344887Schin return 0;
22354887Schin }
22364887Schin eat(env);
22374887Schin break;
22384887Schin case '>':
22394887Schin x = REX_GROUP_CUT;
22404887Schin break;
22414887Schin case '%':
22424887Schin case T_PERCENT:
22434887Schin e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short));
22444887Schin e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor;
22454887Schin n = 1;
22464887Schin for (;;)
22474887Schin {
22484887Schin switch (i = chr(env, &esc))
22494887Schin {
22504887Schin case -1:
22514887Schin case 0:
22524887Schin invalid:
22534887Schin env->cursor -= esc + 1;
22544887Schin env->error = REG_EPAREN;
22554887Schin return 0;
22564887Schin case 'D':
22574887Schin x = REX_NEST_delimiter;
22584887Schin /*FALLTHROUGH*/
22594887Schin delimiter:
22604887Schin if ((i = chr(env, &esc)) < 0)
22614887Schin goto invalid;
22624887Schin if (e->re.nest.type[i] & ~x)
22634887Schin goto invalid;
22644887Schin e->re.nest.type[i] = x;
22654887Schin continue;
22664887Schin case 'E':
22674887Schin x = REX_NEST_escape;
22684887Schin goto delimiter;
22694887Schin case 'L':
22704887Schin x = REX_NEST_literal;
22714887Schin goto quote;
22724887Schin case 'O':
22734887Schin switch (i = chr(env, &esc))
22744887Schin {
22754887Schin case 'T':
22764887Schin e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator;
22774887Schin break;
22784887Schin default:
22794887Schin goto invalid;
22804887Schin }
22814887Schin continue;
22824887Schin case 'Q':
22834887Schin x = REX_NEST_quote;
22844887Schin /*FALLTHROUGH*/
22854887Schin quote:
22864887Schin if ((i = chr(env, &esc)) < 0)
22874887Schin goto invalid;
22884887Schin if (e->re.nest.type[i] & ~x)
22894887Schin goto invalid;
22904887Schin e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT);
22914887Schin continue;
22924887Schin case 'S':
22934887Schin x = REX_NEST_separator;
22944887Schin goto delimiter;
22954887Schin case 'T':
22964887Schin x = REX_NEST_terminator;
22974887Schin goto delimiter;
22984887Schin case '|':
22994887Schin case '&':
23004887Schin if (!esc)
23014887Schin goto invalid;
23024887Schin goto nesting;
23034887Schin case '(':
23044887Schin if (!esc)
23054887Schin n++;
23064887Schin goto nesting;
23074887Schin case ')':
23084887Schin if (!esc && !--n)
23094887Schin break;
23104887Schin goto nesting;
23114887Schin default:
23124887Schin nesting:
23134887Schin if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
23144887Schin goto invalid;
23154887Schin e->re.nest.type[i] = REX_NEST_open;
23164887Schin if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
23174887Schin goto invalid;
23184887Schin if (!esc)
23194887Schin {
23204887Schin if (x == ')' && !--n)
23214887Schin goto invalid;
23224887Schin else if (x == '(')
23234887Schin n++;
23244887Schin }
23254887Schin e->re.nest.type[x] |= REX_NEST_close;
23264887Schin e->re.nest.type[i] |= x << REX_NEST_SHIFT;
23274887Schin continue;
23284887Schin }
23294887Schin break;
23304887Schin }
23314887Schin env->parnest--;
23324887Schin if (c == T_PERCENT)
23334887Schin for (n = 0; n < 2; n++)
23344887Schin {
23354887Schin parno = ++env->parno;
23364887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0)))
23374887Schin {
23384887Schin drop(env->disc, e);
23394887Schin return 0;
23404887Schin }
23414887Schin if (parno < elementsof(env->paren))
23424887Schin env->paren[parno] = f;
23434887Schin f->re.group.back = 0;
23444887Schin f->re.group.number = parno;
23454887Schin f->re.group.expr.rex = e;
23464887Schin e = f;
23474887Schin }
23484887Schin return e;
23494887Schin case '(':
23504887Schin c = 0;
23514887Schin if (isdigit(*env->cursor))
23524887Schin {
23534887Schin f = 0;
23544887Schin do
23554887Schin {
23564887Schin if (c > (INT_MAX / 10))
23574887Schin {
23584887Schin env->error = REG_BADRPT;
23594887Schin return 0;
23604887Schin }
23614887Schin c = c * 10 + (*env->cursor++ - '0');
23624887Schin } while (isdigit(*env->cursor));
23634887Schin if (*env->cursor++ != ')')
23644887Schin {
23654887Schin env->error = REG_BADRPT;
23664887Schin return 0;
23674887Schin }
23684887Schin if (c && env->type >= SRE)
23694887Schin c = c * 2 - 1;
23704887Schin if (!c || c > env->parno || !env->paren[c])
23714887Schin {
23724887Schin if (!(env->flags & REG_LENIENT))
23734887Schin {
23744887Schin env->error = REG_ESUBREG;
23754887Schin return 0;
23764887Schin }
23774887Schin if (c)
23784887Schin c = -1;
23794887Schin }
23804887Schin }
23814887Schin else
23824887Schin {
23834887Schin if (env->type < SRE && *env->cursor++ != '?')
23844887Schin {
23854887Schin env->error = REG_BADRPT;
23864887Schin return 0;
23874887Schin }
23884887Schin if (!(f = grp(env, parno + 1)) && env->error)
23894887Schin return 0;
23904887Schin }
23914887Schin if (!(e = node(env, REX_GROUP_COND, 0, 0, 0)))
23924887Schin {
23934887Schin drop(env->disc, f);
23944887Schin return 0;
23954887Schin }
23964887Schin e->re.group.size = c;
23974887Schin e->re.group.expr.binary.left = f;
23984887Schin if (!(e->re.group.expr.binary.right = alt(env, parno, 1)))
23994887Schin {
24004887Schin drop(env->disc, e);
24014887Schin return 0;
24024887Schin }
24034887Schin if (token(env) != T_CLOSE)
24044887Schin {
24054887Schin env->error = REG_EPAREN;
24064887Schin return 0;
24074887Schin }
24084887Schin eat(env);
24094887Schin env->parnest--;
24104887Schin return rep(env, e, parno, parno);
24114887Schin case '{':
24124887Schin p = env->cursor;
24134887Schin n = 1;
24144887Schin while (c = *env->cursor)
24154887Schin {
24164887Schin if (c == '\\' && *(env->cursor + 1))
24174887Schin env->cursor++;
24184887Schin else if (c == '{')
24194887Schin n++;
24204887Schin else if (c == '}' && !--n)
24214887Schin break;
24224887Schin else if (c == env->delimiter || c == env->terminator)
24234887Schin break;
24244887Schin env->cursor++;
24254887Schin }
24264887Schin if (c != '}')
24274887Schin {
24284887Schin env->error = REG_EBRACE;
24294887Schin return 0;
24304887Schin }
24314887Schin if (*++env->cursor != ')')
24324887Schin {
24334887Schin env->error = REG_EPAREN;
24344887Schin return 0;
24354887Schin }
24364887Schin env->cursor++;
24374887Schin env->parnest--;
24384887Schin if (env->disc->re_version < REG_VERSION_EXEC)
24394887Schin {
24404887Schin env->error = REG_BADRPT;
24414887Schin return 0;
24424887Schin }
24434887Schin if (!env->disc->re_execf)
24444887Schin return 0;
24454887Schin if (!(e = node(env, REX_EXEC, 0, 0, 0)))
24464887Schin return 0;
24474887Schin e->re.exec.text = (const char*)p;
24484887Schin e->re.exec.size = env->cursor - p - 2;
24494887Schin if (!env->disc->re_compf)
24504887Schin e->re.exec.data = 0;
24514887Schin else
24524887Schin e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc);
24534887Schin return e;
24544887Schin case '0': case '1': case '2': case '3': case '4':
24554887Schin case '5': case '6': case '7': case '8': case '9':
24564887Schin c -= '0';
24574887Schin while (isdigit(*env->cursor))
24584887Schin {
24594887Schin if (c > (INT_MAX / 10))
24604887Schin {
24614887Schin env->error = REG_ESUBREG;
24624887Schin return 0;
24634887Schin }
24644887Schin c = c * 10 + *env->cursor++ - '0';
24654887Schin }
24664887Schin if (*env->cursor == ')')
24674887Schin {
24684887Schin env->cursor++;
24694887Schin env->parnest--;
24704887Schin env->token.len = 1;
24714887Schin if (c > env->parno || !env->paren[c])
24724887Schin {
24734887Schin env->error = REG_ESUBREG;
24744887Schin return 0;
24754887Schin }
24764887Schin env->paren[c]->re.group.back = 1;
24774887Schin return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
24784887Schin }
24794887Schin /*FALLTHROUGH*/
24804887Schin default:
24814887Schin env->error = REG_BADRPT;
24824887Schin return 0;
24834887Schin }
24844887Schin if (x && !(e = alt(env, parno, 0)))
24854887Schin return 0;
24864887Schin c = token(env);
24874887Schin env->parnest--;
24888462SApril.Chin@Sun.COM if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')'))
24894887Schin {
24904887Schin env->error = REG_EPAREN;
24914887Schin return 0;
24924887Schin }
24934887Schin eat(env);
24944887Schin if (typ >= 0)
24954887Schin {
24964887Schin if (beg)
24974887Schin env->pattern = env->cursor;
24984887Schin env->type = typ;
24994887Schin }
25004887Schin if (!x)
25014887Schin return 0;
25024887Schin if (!(f = node(env, x, 0, 0, 0)))
25034887Schin {
25044887Schin drop(env->disc, e);
25054887Schin return 0;
25064887Schin }
25074887Schin f->re.group.expr.rex = e;
25084887Schin if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT)
25094887Schin {
25104887Schin if (stats(env, e))
25114887Schin {
25124887Schin drop(env->disc, f);
25134887Schin if (!env->error)
25144887Schin env->error = REG_ECOUNT;
25154887Schin return 0;
25164887Schin }
25174887Schin f->re.group.size = env->stats.m;
25184887Schin memset(&env->stats, 0, sizeof(env->stats));
25194887Schin }
25204887Schin switch (x)
25214887Schin {
25224887Schin case REX_GROUP:
25234887Schin case REX_GROUP_CUT:
25244887Schin f = rep(env, f, parno, env->parno);
25254887Schin break;
25264887Schin }
25274887Schin return f;
25284887Schin }
25294887Schin
25304887Schin static Rex_t*
seq(Cenv_t * env)25314887Schin seq(Cenv_t* env)
25324887Schin {
25334887Schin Rex_t* e;
25344887Schin Rex_t* f;
25354887Schin Token_t tok;
25364887Schin int c;
25374887Schin int i;
25384887Schin int n;
25394887Schin int x;
25404887Schin int parno;
25414887Schin int type;
25424887Schin regflags_t flags;
25434887Schin unsigned char* s;
25444887Schin unsigned char* p;
25454887Schin unsigned char* t;
25464887Schin unsigned char* u;
25474887Schin unsigned char buf[256];
25484887Schin
25494887Schin for (;;)
25504887Schin {
25514887Schin s = buf;
25524887Schin while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len])
25534887Schin {
25544887Schin x = c;
25554887Schin p = env->cursor;
25564887Schin if (c >= 0)
25574887Schin {
25584887Schin n = 1;
25594887Schin *s++ = (env->flags & REG_ICASE) ? toupper(c) : c;
25604887Schin }
25618462SApril.Chin@Sun.COM else if (c == C_ESC || (env->flags & REG_ICASE))
25624887Schin {
25638462SApril.Chin@Sun.COM c = (c == C_ESC) ? env->token.lex : mbchar(p);
25648462SApril.Chin@Sun.COM if (env->flags & REG_ICASE)
25658462SApril.Chin@Sun.COM c = towupper(c);
25668462SApril.Chin@Sun.COM if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX)
25678462SApril.Chin@Sun.COM break;
2568*12068SRoger.Faulkner@Oracle.COM if ((n = mbconv((char*)s, c)) < 0)
25698462SApril.Chin@Sun.COM *s++ = c;
25708462SApril.Chin@Sun.COM else if (n)
25718462SApril.Chin@Sun.COM s += n;
25728462SApril.Chin@Sun.COM else
25734887Schin {
25744887Schin n = 1;
25754887Schin *s++ = 0;
25764887Schin }
25774887Schin }
25784887Schin else
25794887Schin {
25804887Schin n = env->token.len - env->token.esc;
25814887Schin for (t = p, u = s + n; s < u; *s++ = *t++);
25824887Schin }
25834887Schin eat(env);
25844887Schin }
25854887Schin if (c == T_BAD)
25864887Schin return 0;
25874887Schin if (s > buf)
25884887Schin switch (c)
25894887Schin {
25904887Schin case T_STAR:
25914887Schin case T_PLUS:
25924887Schin case T_LEFT:
25934887Schin case T_QUES:
25944887Schin case T_BANG:
25954887Schin if ((s -= n) == buf)
25964887Schin e = 0;
25974887Schin else
25984887Schin {
25994887Schin i = s - buf;
26004887Schin if (!(e = node(env, REX_STRING, 0, 0, i)))
26014887Schin return 0;
26024887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i);
26034887Schin e->re.string.size = i;
26044887Schin }
26054887Schin if (x >= 0)
26064887Schin {
26074887Schin if (!(f = node(env, REX_ONECHAR, 1, 1, 0)))
26084887Schin {
26094887Schin drop(env->disc, e);
26104887Schin return 0;
26114887Schin }
26124887Schin f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x;
26134887Schin }
26144887Schin else
26154887Schin {
26168462SApril.Chin@Sun.COM if (!(f = node(env, REX_STRING, 0, 0, n)))
26174887Schin return 0;
26188462SApril.Chin@Sun.COM memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n);
26198462SApril.Chin@Sun.COM f->re.string.size = n;
26204887Schin }
26214887Schin if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env))))
26224887Schin {
26234887Schin drop(env->disc, e);
26244887Schin return 0;
26254887Schin }
26264887Schin if (e)
26274887Schin f = cat(env, e, f);
26284887Schin return f;
26294887Schin default:
26304887Schin c = s - buf;
26314887Schin if (!(e = node(env, REX_STRING, 0, 0, c)))
26324887Schin return 0;
26334887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c);
26344887Schin e->re.string.size = c;
26354887Schin return cat(env, e, seq(env));
26364887Schin }
26374887Schin else if (c > T_BACK)
26384887Schin {
26394887Schin eat(env);
26404887Schin c -= T_BACK;
26414887Schin if (c > env->parno || !env->paren[c])
26424887Schin {
26434887Schin env->error = REG_ESUBREG;
26444887Schin return 0;
26454887Schin }
26464887Schin env->paren[c]->re.group.back = 1;
26474887Schin e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
26484887Schin }
26494887Schin else
26504887Schin switch (c)
26514887Schin {
26524887Schin case T_AND:
26534887Schin case T_CLOSE:
26544887Schin case T_BAR:
26554887Schin case T_END:
26564887Schin return node(env, REX_NULL, 0, 0, 0);
26574887Schin case T_DOLL:
26584887Schin eat(env);
26594887Schin e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0);
26604887Schin break;
26614887Schin case T_CFLX:
26624887Schin eat(env);
26634887Schin if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED))
26644887Schin e = rep(env, e, 0, 0);
26654887Schin break;
26664887Schin case T_OPEN:
26674887Schin tok = env->token;
26684887Schin eat(env);
26694887Schin flags = env->flags;
26704887Schin type = env->type;
26714887Schin if (env->token.att)
26724887Schin env->flags |= REG_MINIMAL;
26734887Schin env->parnest++;
26744887Schin if (env->type == KRE)
26754887Schin ++env->parno;
26764887Schin parno = ++env->parno;
26774887Schin if (!(e = alt(env, parno + 1, 0)))
26784887Schin break;
26794887Schin if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL))
26804887Schin {
26814887Schin drop(env->disc, e);
26824887Schin env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL;
26834887Schin return 0;
26844887Schin }
26854887Schin if (token(env) != T_CLOSE)
26864887Schin {
26874887Schin drop(env->disc, e);
26884887Schin env->error = REG_EPAREN;
26894887Schin return 0;
26904887Schin }
26914887Schin env->parnest--;
26924887Schin eat(env);
26934887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0)))
26944887Schin {
26954887Schin drop(env->disc, e);
26964887Schin return 0;
26974887Schin }
26984887Schin if (parno < elementsof(env->paren))
26994887Schin env->paren[parno] = f;
27004887Schin f->re.group.back = 0;
27014887Schin f->re.group.number = parno;
27024887Schin f->re.group.expr.rex = e;
27034887Schin if (tok.lex)
27044887Schin {
27054887Schin tok.push = 1;
27064887Schin env->token = tok;
27074887Schin }
27084887Schin if (!(e = rep(env, f, parno, env->parno)))
27094887Schin return 0;
27104887Schin if (env->type == KRE)
27114887Schin {
27124887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0)))
27134887Schin {
27144887Schin drop(env->disc, e);
27154887Schin return 0;
27164887Schin }
27174887Schin if (--parno < elementsof(env->paren))
27184887Schin env->paren[parno] = f;
27194887Schin f->re.group.back = 0;
27204887Schin f->re.group.number = parno;
27214887Schin f->re.group.expr.rex = e;
27224887Schin e = f;
27234887Schin }
27244887Schin env->flags = flags;
27254887Schin env->type = type;
27264887Schin break;
27274887Schin case T_GROUP:
27284887Schin p = env->cursor;
27294887Schin eat(env);
27304887Schin flags = env->flags;
27314887Schin type = env->type;
27324887Schin if (!(e = grp(env, env->parno + 1)))
27334887Schin {
27344887Schin if (env->error)
27354887Schin return 0;
27364887Schin if (env->literal == env->pattern && env->literal == p)
27374887Schin env->literal = env->cursor;
27384887Schin continue;
27394887Schin }
27404887Schin env->flags = flags;
27414887Schin env->type = type;
27424887Schin break;
27434887Schin case T_BRA:
27444887Schin eat(env);
27454887Schin if (e = bra(env))
27464887Schin e = rep(env, e, 0, 0);
27474887Schin break;
27484887Schin case T_ALNUM:
27494887Schin case T_ALNUM_NOT:
27504887Schin case T_DIGIT:
27514887Schin case T_DIGIT_NOT:
27524887Schin case T_SPACE:
27534887Schin case T_SPACE_NOT:
27544887Schin eat(env);
27554887Schin if (e = ccl(env, c))
27564887Schin e = rep(env, e, 0, 0);
27574887Schin break;
27584887Schin case T_LT:
27594887Schin eat(env);
27604887Schin e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0);
27614887Schin break;
27624887Schin case T_GT:
27634887Schin eat(env);
27644887Schin e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0);
27654887Schin break;
27664887Schin case T_DOT:
27674887Schin eat(env);
27684887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
27694887Schin break;
27704887Schin case T_DOTSTAR:
27714887Schin eat(env);
27724887Schin env->token.lex = T_STAR;
27734887Schin env->token.push = 1;
27744887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
27754887Schin break;
27764887Schin case T_SLASHPLUS:
27774887Schin eat(env);
27784887Schin env->token.lex = T_PLUS;
27794887Schin env->token.push = 1;
27804887Schin if (e = node(env, REX_ONECHAR, 1, 1, 0))
27814887Schin {
27824887Schin e->re.onechar = '/';
27834887Schin e = rep(env, e, 0, 0);
27844887Schin }
27854887Schin break;
27864887Schin case T_WORD:
27874887Schin eat(env);
27884887Schin e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0);
27894887Schin break;
27904887Schin case T_WORD_NOT:
27914887Schin eat(env);
27924887Schin e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0);
27934887Schin break;
27944887Schin case T_BEG_STR:
27954887Schin eat(env);
27964887Schin e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0);
27974887Schin break;
27984887Schin case T_END_STR:
27994887Schin eat(env);
28004887Schin e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0);
28014887Schin break;
28024887Schin case T_FIN_STR:
28034887Schin eat(env);
28044887Schin e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0);
28054887Schin break;
28064887Schin default:
28074887Schin env->error = REG_BADRPT;
28084887Schin return 0;
28094887Schin }
28104887Schin if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator)
28114887Schin e = cat(env, e, seq(env));
28124887Schin return e;
28134887Schin }
28144887Schin }
28154887Schin
28164887Schin static Rex_t*
con(Cenv_t * env)28174887Schin con(Cenv_t* env)
28184887Schin {
28194887Schin Rex_t* e;
28204887Schin Rex_t* f;
28214887Schin Rex_t* g;
28224887Schin
28234887Schin if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND)
28244887Schin return e;
28254887Schin eat(env);
28264887Schin if (!(f = con(env)))
28274887Schin {
28284887Schin drop(env->disc, e);
28294887Schin return 0;
28304887Schin }
28314887Schin if (!(g = node(env, REX_CONJ, 0, 0, 0)))
28324887Schin {
28334887Schin drop(env->disc, e);
28344887Schin drop(env->disc, f);
28354887Schin return 0;
28364887Schin }
28374887Schin g->re.group.expr.binary.left = e;
28384887Schin g->re.group.expr.binary.right = f;
28394887Schin return g;
28404887Schin }
28414887Schin
28424887Schin static Rex_t*
alt(Cenv_t * env,int number,int cond)28434887Schin alt(Cenv_t* env, int number, int cond)
28444887Schin {
28454887Schin Rex_t* e;
28464887Schin Rex_t* f;
28474887Schin Rex_t* g;
28484887Schin
28494887Schin if (!(e = con(env)))
28504887Schin return 0;
28514887Schin else if (token(env) != T_BAR)
28524887Schin {
28534887Schin if (!cond)
28544887Schin return e;
28554887Schin f = 0;
28564887Schin if (e->type == REX_NULL)
28574887Schin goto bad;
28584887Schin }
28594887Schin else
28604887Schin {
28614887Schin eat(env);
28624887Schin if (!(f = alt(env, number, 0)))
28634887Schin {
28644887Schin drop(env->disc, e);
28654887Schin return 0;
28664887Schin }
28674887Schin if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL))
28684887Schin goto bad;
28694887Schin if (!cond && (g = trie(env, e, f)))
28704887Schin return g;
28714887Schin }
28724887Schin if (!(g = node(env, REX_ALT, 0, 0, 0)))
28734887Schin {
28744887Schin env->error = REG_ESPACE;
28754887Schin goto bad;
28764887Schin }
28774887Schin g->re.group.number = number;
28784887Schin g->re.group.last = env->parno;
28794887Schin g->re.group.expr.binary.left = e;
28804887Schin g->re.group.expr.binary.right = f;
28814887Schin return g;
28824887Schin bad:
28834887Schin drop(env->disc, e);
28844887Schin drop(env->disc, f);
28854887Schin if (!env->error)
28864887Schin env->error = REG_ENULL;
28874887Schin return 0;
28884887Schin }
28894887Schin
28904887Schin /*
28914887Schin * add v to REX_BM tables
28924887Schin */
28934887Schin
28944887Schin static void
bmstr(Cenv_t * env,register Rex_t * a,unsigned char * v,int n,Bm_mask_t b)28954887Schin bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b)
28964887Schin {
28974887Schin int c;
28984887Schin int m;
28994887Schin size_t z;
29004887Schin
29014887Schin for (m = 0; m < n; m++)
29024887Schin {
29034887Schin if (!(z = n - m - 1))
29044887Schin z = HIT;
29054887Schin c = v[m];
29064887Schin a->re.bm.mask[m][c] |= b;
29074887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
29084887Schin a->re.bm.skip[c] = z;
29094887Schin if (a->flags & REG_ICASE)
29104887Schin {
29114887Schin if (isupper(c))
29124887Schin c = tolower(c);
29134887Schin else if (islower(c))
29144887Schin c = toupper(c);
29154887Schin else
29164887Schin continue;
29174887Schin a->re.bm.mask[m][c] |= b;
29184887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
29194887Schin a->re.bm.skip[c] = z;
29204887Schin }
29214887Schin }
29224887Schin }
29234887Schin
29244887Schin /*
29254887Schin * set up BM table from trie
29264887Schin */
29274887Schin
29284887Schin static int
bmtrie(Cenv_t * env,Rex_t * a,unsigned char * v,Trie_node_t * x,int n,int m,Bm_mask_t b)29294887Schin bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b)
29304887Schin {
29314887Schin do
29324887Schin {
29334887Schin v[m] = x->c;
29344887Schin if (m >= (n - 1))
29354887Schin {
29364887Schin bmstr(env, a, v, n, b);
29374887Schin if (!(b <<= 1))
29384887Schin {
29394887Schin b = 1;
29404887Schin a->re.bm.complete = 0;
29414887Schin }
29424887Schin else if (x->son)
29434887Schin a->re.bm.complete = 0;
29444887Schin }
29454887Schin else if (x->son)
29464887Schin b = bmtrie(env, a, v, x->son, n, m + 1, b);
29474887Schin } while (x = x->sib);
29484887Schin return b;
29494887Schin }
29504887Schin
29514887Schin /*
29524887Schin * rewrite the expression tree for some special cases
29534887Schin * 1. it is a null expression - illegal
29544887Schin * 2. max length fixed string found -- use BM algorithm
29554887Schin * 3. it begins with an unanchored string - use KMP algorithm
29564887Schin * 0 returned on success
29574887Schin */
29584887Schin
29594887Schin static int
special(Cenv_t * env,regex_t * p)29604887Schin special(Cenv_t* env, regex_t* p)
29614887Schin {
29624887Schin Rex_t* a;
29634887Schin Rex_t* e;
29644887Schin Rex_t* t;
29654887Schin Rex_t* x;
29664887Schin Rex_t* y;
29674887Schin unsigned char* s;
29684887Schin int* f;
29694887Schin int n;
29704887Schin int m;
29714887Schin int k;
29724887Schin
29734887Schin DEBUG_INIT();
29744887Schin if (e = p->env->rex)
29754887Schin {
29764887Schin if ((x = env->stats.x) && x->re.string.size < 3)
29774887Schin x = 0;
29784887Schin if ((t = env->stats.y) && t->re.trie.min < 3)
29794887Schin t = 0;
29804887Schin if (x && t)
29814887Schin {
29824887Schin if (x->re.string.size >= t->re.trie.min)
29834887Schin t = 0;
29844887Schin else
29854887Schin x = 0;
29864887Schin }
29878462SApril.Chin@Sun.COM if (x || t)
29884887Schin {
29894887Schin Bm_mask_t** mask;
29904887Schin Bm_mask_t* h;
29914887Schin unsigned char* v;
29924887Schin size_t* q;
29934887Schin unsigned long l;
29944887Schin int i;
29954887Schin int j;
29964887Schin
29974887Schin if (x)
29984887Schin {
29994887Schin y = x;
30004887Schin n = m = x->re.string.size;
30014887Schin l = env->stats.l;
30024887Schin }
30034887Schin else
30044887Schin {
30054887Schin y = t;
30064887Schin n = t->re.trie.min;
30074887Schin m = t->re.trie.max;
30084887Schin l = env->stats.k;
30094887Schin }
30104887Schin if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t))))
30114887Schin return 1;
30124887Schin if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t))))
30134887Schin {
30144887Schin alloc(env->disc, q, 0);
30154887Schin return 1;
30164887Schin }
30174887Schin a->flags = y->flags;
30184887Schin a->map = y->map;
30194887Schin a->re.bm.size = n;
30204887Schin a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1;
30214887Schin a->re.bm.left = l - 1;
30224887Schin a->re.bm.right = env->stats.m - l - n;
3023*12068SRoger.Faulkner@Oracle.COM a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n;
30244887Schin h = (Bm_mask_t*)&a->re.bm.mask[n];
30254887Schin a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1));
30264887Schin a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1];
30274887Schin for (m = 0; m <= UCHAR_MAX; m++)
30284887Schin a->re.bm.skip[m] = n;
30294887Schin a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left);
30304887Schin for (i = 1; i <= n; i++)
30314887Schin a->re.bm.fail[i] = 2 * n - i;
30324887Schin mask = a->re.bm.mask;
30334887Schin for (m = 0; m < n; m++)
30344887Schin {
30354887Schin mask[m] = h;
30364887Schin h += UCHAR_MAX + 1;
30374887Schin }
30384887Schin if (x)
30394887Schin bmstr(env, a, x->re.string.base, n, 1);
30404887Schin else
30414887Schin {
30424887Schin v = (unsigned char*)q;
30434887Schin memset(v, 0, n);
30444887Schin m = 1;
30454887Schin for (i = 0; i <= UCHAR_MAX; i++)
30464887Schin if (t->re.trie.root[i])
30474887Schin m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m);
30484887Schin }
30494887Schin mask--;
30504887Schin memset(q, 0, n * sizeof(*q));
30514887Schin for (k = (j = n) + 1; j > 0; j--, k--)
30524887Schin {
30534887Schin DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0));
30544887Schin for (q[j] = k; k <= n; k = q[k])
30554887Schin {
30564887Schin for (m = 0; m <= UCHAR_MAX; m++)
30574887Schin if (mask[k][m] == mask[j][m])
30584887Schin {
30594887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0));
30604887Schin goto cut;
30614887Schin }
30624887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0));
30634887Schin if (a->re.bm.fail[k] > n - j)
30644887Schin a->re.bm.fail[k] = n - j;
30654887Schin }
30664887Schin cut: ;
30674887Schin }
30684887Schin for (i = 1; i <= n; i++)
30694887Schin if (a->re.bm.fail[i] > n + k - i)
30704887Schin {
30714887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0));
30724887Schin a->re.bm.fail[i] = n + k - i;
30734887Schin }
30744887Schin #if _AST_REGEX_DEBUG
30754887Schin if (DEBUG_TEST(0x0020,(1),(0)))
30764887Schin {
30774887Schin sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0);
30784887Schin for (m = 0; m < n; m++)
30794887Schin for (i = 1; i <= UCHAR_MAX; i++)
30804887Schin if (a->re.bm.mask[m][i])
30814887Schin sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]);
30824887Schin for (i = ' '; i <= UCHAR_MAX; i++)
30834887Schin if (a->re.bm.skip[i] >= HIT)
30844887Schin sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i);
30854887Schin else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n)
30864887Schin sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]);
30874887Schin for (j = 31; j >= 0; j--)
30884887Schin {
30894887Schin sfprintf(sfstderr, " ");
30904887Schin next:
30914887Schin for (m = 0; m < n; m++)
30924887Schin {
30934887Schin for (i = 0040; i < 0177; i++)
30944887Schin if (a->re.bm.mask[m][i] & (1 << j))
30954887Schin {
30964887Schin sfprintf(sfstderr, " %c", i);
30974887Schin break;
30984887Schin }
30994887Schin if (i >= 0177)
31004887Schin {
31014887Schin if (j > 0)
31024887Schin {
31034887Schin j--;
31044887Schin goto next;
31054887Schin }
31064887Schin sfprintf(sfstderr, " ?");
31074887Schin }
31084887Schin }
31094887Schin sfprintf(sfstderr, "\n");
31104887Schin }
31114887Schin sfprintf(sfstderr, "FAIL: ");
31124887Schin for (m = 1; m <= n; m++)
31134887Schin sfprintf(sfstderr, "%3d", a->re.bm.fail[m]);
31144887Schin sfprintf(sfstderr, "\n");
31154887Schin }
31164887Schin #endif
31174887Schin alloc(env->disc, q, 0);
31184887Schin a->next = e;
31194887Schin p->env->rex = a;
31204887Schin return 0;
31214887Schin }
31224887Schin switch (e->type)
31234887Schin {
31244887Schin case REX_BEG:
31254887Schin if (env->flags & REG_NEWLINE)
31264887Schin return 0;
31274887Schin break;
31284887Schin case REX_GROUP:
31294887Schin if (env->stats.b)
31304887Schin return 0;
31314887Schin e = e->re.group.expr.rex;
31324887Schin if (e->type != REX_DOT)
31334887Schin return 0;
31344887Schin /*FALLTHROUGH*/
31354887Schin case REX_DOT:
31364887Schin if (e->lo == 0 && e->hi == RE_DUP_INF)
31374887Schin break;
31384887Schin return 0;
31394887Schin case REX_NULL:
31404887Schin if (env->flags & REG_NULL)
31414887Schin break;
31424887Schin env->error = REG_ENULL;
31434887Schin return 1;
31444887Schin case REX_STRING:
31458462SApril.Chin@Sun.COM if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map)
31464887Schin return 0;
31474887Schin s = e->re.string.base;
31484887Schin n = e->re.string.size;
31494887Schin if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1))))
31504887Schin return 1;
31514887Schin a->flags = e->flags;
31524887Schin a->map = e->map;
31534887Schin f = a->re.string.fail;
31544887Schin memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n);
31554887Schin s = a->re.string.base;
31564887Schin a->re.string.size = n;
31574887Schin f[0] = m = -1;
31584887Schin for (k = 1; k < n; k++)
31594887Schin {
31604887Schin while (m >= 0 && s[m+1] != s[k])
31614887Schin m = f[m];
31624887Schin if (s[m+1] == s[k])
31634887Schin m++;
31644887Schin f[k] = m;
31654887Schin }
31664887Schin a->next = e->next;
31674887Schin p->env->rex = a;
31684887Schin e->next = 0;
31694887Schin drop(env->disc, e);
31704887Schin break;
31714887Schin default:
31724887Schin return 0;
31734887Schin }
31744887Schin }
31754887Schin p->env->once = 1;
31764887Schin return 0;
31774887Schin }
31784887Schin
31794887Schin int
regcomp(regex_t * p,const char * pattern,regflags_t flags)31804887Schin regcomp(regex_t* p, const char* pattern, regflags_t flags)
31814887Schin {
31824887Schin Rex_t* e;
31834887Schin Rex_t* f;
31844887Schin regdisc_t* disc;
31854887Schin unsigned char* fold;
31864887Schin int i;
31874887Schin Cenv_t env;
31884887Schin
31894887Schin if (!p)
31904887Schin return REG_BADPAT;
31914887Schin if (flags & REG_DISCIPLINE)
31924887Schin {
31934887Schin flags &= ~REG_DISCIPLINE;
31944887Schin disc = p->re_disc;
31954887Schin }
31964887Schin else
31974887Schin disc = &state.disc;
31984887Schin if (!disc->re_errorlevel)
31994887Schin disc->re_errorlevel = 2;
32004887Schin p->env = 0;
32014887Schin if (!pattern)
32024887Schin return fatal(disc, REG_BADPAT, pattern);
32034887Schin if (!state.initialized)
32044887Schin {
32054887Schin state.initialized = 1;
32064887Schin for (i = 0; i < elementsof(state.escape); i++)
32074887Schin state.magic[state.escape[i].key] = state.escape[i].val;
32084887Schin }
32094887Schin if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data))
32104887Schin {
32114887Schin if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1)))
32124887Schin return fatal(disc, REG_ESPACE, pattern);
32134887Schin for (i = 0; i <= UCHAR_MAX; i++)
32144887Schin fold[i] = toupper(i);
32154887Schin LCINFO(AST_LC_CTYPE)->data = (void*)fold;
32164887Schin }
32174887Schin again:
32184887Schin if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t))))
32194887Schin return fatal(disc, REG_ESPACE, pattern);
32204887Schin memset(p->env, 0, sizeof(*p->env));
32214887Schin memset(&env, 0, sizeof(env));
32224887Schin env.regex = p;
32234887Schin env.flags = flags;
32244887Schin env.disc = p->env->disc = disc;
32254887Schin if (env.flags & REG_AUGMENTED)
32264887Schin env.flags |= REG_EXTENDED;
32274887Schin env.mappeddot = '.';
32284887Schin env.mappednewline = '\n';
32294887Schin env.mappedslash = '/';
32304887Schin if (disc->re_version >= REG_VERSION_MAP && disc->re_map)
32314887Schin {
32324887Schin env.map = disc->re_map;
32334887Schin env.MAP = p->env->fold;
32344887Schin for (i = 0; i <= UCHAR_MAX; i++)
32354887Schin {
32364887Schin env.MAP[i] = fold[env.map[i]];
32374887Schin if (env.map[i] == '.')
32384887Schin env.mappeddot = i;
32394887Schin if (env.map[i] == '\n')
32404887Schin env.mappednewline = i;
32414887Schin if (env.map[i] == '/')
32424887Schin env.mappedslash = i;
32434887Schin }
32444887Schin }
32454887Schin else
32464887Schin env.MAP = fold;
32474887Schin env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE;
32484887Schin env.explicit = -1;
32494887Schin if (env.flags & REG_SHELL)
32504887Schin {
32514887Schin if (env.flags & REG_SHELL_PATH)
32524887Schin env.explicit = env.mappedslash;
32534887Schin env.flags |= REG_LENIENT|REG_NULL;
32544887Schin env.type = env.type == BRE ? SRE : KRE;
32554887Schin }
32564887Schin if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE)
32574887Schin env.explicit = env.mappednewline;
32584887Schin p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1;
32594887Schin env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL));
326010898Sroland.mainz@nrubsig.org env.regexp = !!(env.flags & REG_REGEXP);
32614887Schin env.token.lex = 0;
32624887Schin env.token.push = 0;
32634887Schin if (env.flags & REG_DELIMITED)
32644887Schin {
32654887Schin switch (env.delimiter = *pattern++)
32664887Schin {
32674887Schin case 0:
32684887Schin case '\\':
32694887Schin case '\n':
32704887Schin case '\r':
32714887Schin env.error = REG_EDELIM;
32724887Schin goto bad;
32734887Schin }
32744887Schin env.terminator = '\n';
32754887Schin }
32764887Schin env.literal = env.pattern = env.cursor = (unsigned char*)pattern;
32774887Schin if (!(p->env->rex = alt(&env, 1, 0)))
32784887Schin goto bad;
32794887Schin if (env.parnest)
32804887Schin {
32814887Schin env.error = REG_EPAREN;
32824887Schin goto bad;
32834887Schin }
32844887Schin p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL);
32854887Schin if (env.flags & REG_LEFT)
32864887Schin {
32874887Schin if (p->env->rex->type != REX_BEG)
32884887Schin {
32894887Schin if (p->env->rex->type == REX_ALT)
32904887Schin env.flags &= ~REG_FIRST;
32914887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0)))
32924887Schin {
32934887Schin regfree(p);
32944887Schin return fatal(disc, REG_ESPACE, pattern);
32954887Schin }
32964887Schin e->next = p->env->rex;
32974887Schin p->env->rex = e;
32984887Schin p->env->once = 1;
32994887Schin }
33004887Schin p->env->stats.re_flags |= REG_LEFT;
33014887Schin }
33024887Schin for (e = p->env->rex; e->next; e = e->next);
33034887Schin p->env->done.type = REX_DONE;
33044887Schin p->env->done.flags = e->flags;
33054887Schin if (env.flags & REG_RIGHT)
33064887Schin {
33074887Schin if (e->type != REX_END)
33084887Schin {
33094887Schin if (p->env->rex->type == REX_ALT)
33104887Schin env.flags &= ~REG_FIRST;
33114887Schin if (!(f = node(&env, REX_END, 0, 0, 0)))
33124887Schin {
33134887Schin regfree(p);
33144887Schin return fatal(disc, REG_ESPACE, pattern);
33154887Schin }
33164887Schin f->flags = e->flags;
33174887Schin f->map = e->map;
33184887Schin e->next = f;
33194887Schin }
33204887Schin p->env->stats.re_flags |= REG_RIGHT;
33214887Schin }
33224887Schin if (stats(&env, p->env->rex))
33234887Schin {
33244887Schin if (!env.error)
33254887Schin env.error = REG_ECOUNT;
33264887Schin goto bad;
33274887Schin }
33284887Schin if (env.stats.b)
33294887Schin p->env->hard = p->env->separate = 1;
33304887Schin else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c)))
33314887Schin p->env->hard = 1;
33324887Schin if (p->env->hard || env.stats.c || env.stats.i)
33334887Schin p->env->stats.re_min = p->env->stats.re_max = -1;
33344887Schin else
33354887Schin {
33364887Schin if (!(p->env->stats.re_min = env.stats.m))
33374887Schin p->env->stats.re_min = -1;
33384887Schin if (!(p->env->stats.re_max = env.stats.n))
33394887Schin p->env->stats.re_max = -1;
33404887Schin }
33414887Schin if (special(&env, p))
33424887Schin goto bad;
33434887Schin serialize(&env, p->env->rex, 1);
33444887Schin p->re_nsub = env.stats.p;
33454887Schin if (env.type == KRE)
33464887Schin p->re_nsub /= 2;
33474887Schin if (env.flags & REG_DELIMITED)
33484887Schin {
33494887Schin p->re_npat = env.cursor - env.pattern + 1;
33504887Schin if (*env.cursor == env.delimiter)
33514887Schin p->re_npat++;
33524887Schin else if (env.flags & REG_MUSTDELIM)
33534887Schin {
33544887Schin env.error = REG_EDELIM;
33554887Schin goto bad;
33564887Schin }
33574887Schin else
33584887Schin env.flags &= ~REG_DELIMITED;
33594887Schin }
33604887Schin p->env->explicit = env.explicit;
33614887Schin p->env->flags = env.flags & REG_COMP;
33624887Schin p->env->min = env.stats.m;
33634887Schin p->env->nsub = env.stats.p + env.stats.u;
33644887Schin p->env->refs = 1;
33654887Schin return 0;
33664887Schin bad:
33674887Schin regfree(p);
33684887Schin if (!env.error)
33694887Schin env.error = REG_ESPACE;
33704887Schin if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL))
33714887Schin {
33724887Schin flags |= REG_LITERAL;
33734887Schin pattern = (const char*)env.literal;
33744887Schin goto again;
33754887Schin }
33764887Schin return fatal(disc, env.error, pattern);
33774887Schin }
33784887Schin
33794887Schin /*
33804887Schin * regcomp() on sized pattern
33814887Schin * the lazy way around adding and checking env.end
33824887Schin */
33834887Schin
33844887Schin int
regncomp(regex_t * p,const char * pattern,size_t size,regflags_t flags)33854887Schin regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags)
33864887Schin {
33874887Schin char* s;
33884887Schin int r;
33894887Schin
33904887Schin if (!(s = malloc(size + 1)))
33914887Schin return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern);
33924887Schin memcpy(s, pattern, size);
33934887Schin s[size] = 0;
33944887Schin r = regcomp(p, s, flags);
33954887Schin free(s);
33964887Schin return r;
33974887Schin }
33984887Schin
33994887Schin /*
34004887Schin * combine two compiled regular expressions if possible,
34014887Schin * replacing first with the combination and freeing second.
34024887Schin * return 0 on success.
34034887Schin * the only combinations handled are building a Trie
34044887Schin * from String|Kmp|Trie and String|Kmp
34054887Schin */
34064887Schin
34074887Schin int
regcomb(regex_t * p,regex_t * q)34084887Schin regcomb(regex_t* p, regex_t* q)
34094887Schin {
34104887Schin Rex_t* e = p->env->rex;
34114887Schin Rex_t* f = q->env->rex;
34124887Schin Rex_t* g;
3413*12068SRoger.Faulkner@Oracle.COM Rex_t* h;
34144887Schin Cenv_t env;
34154887Schin
34164887Schin if (!e || !f)
34174887Schin return fatal(p->env->disc, REG_BADPAT, NiL);
34184887Schin if (p->env->separate || q->env->separate)
34194887Schin return REG_ESUBREG;
34204887Schin memset(&env, 0, sizeof(env));
34214887Schin env.disc = p->env->disc;
34224887Schin if (e->type == REX_BM)
34234887Schin {
34244887Schin p->env->rex = e->next;
34254887Schin e->next = 0;
34264887Schin drop(env.disc, e);
34274887Schin e = p->env->rex;
34284887Schin }
34294887Schin if (f->type == REX_BM)
34304887Schin {
34314887Schin q->env->rex = f->next;
34324887Schin f->next = 0;
34334887Schin drop(env.disc, f);
34344887Schin f = q->env->rex;
34354887Schin }
34364887Schin if (e->type == REX_BEG && f->type == REX_BEG)
34374887Schin {
34384887Schin p->env->flags |= REG_LEFT;
34394887Schin p->env->rex = e->next;
34404887Schin e->next = 0;
34414887Schin drop(env.disc, e);
34424887Schin e = p->env->rex;
34434887Schin q->env->rex = f->next;
34444887Schin f->next = 0;
34454887Schin drop(env.disc, f);
34464887Schin f = q->env->rex;
34474887Schin }
3448*12068SRoger.Faulkner@Oracle.COM for (g = e; g->next; g = g->next);
3449*12068SRoger.Faulkner@Oracle.COM for (h = f; h->next; h = h->next);
3450*12068SRoger.Faulkner@Oracle.COM if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END)
34514887Schin {
34524887Schin p->env->flags |= REG_RIGHT;
3453*12068SRoger.Faulkner@Oracle.COM drop(env.disc, g->next);
3454*12068SRoger.Faulkner@Oracle.COM g->next = 0;
3455*12068SRoger.Faulkner@Oracle.COM drop(env.disc, h->next);
3456*12068SRoger.Faulkner@Oracle.COM h->next = 0;
34574887Schin }
34584887Schin if (!(g = trie(&env, f, e)))
34594887Schin return fatal(p->env->disc, REG_BADPAT, NiL);
34604887Schin p->env->rex = g;
34614887Schin if (!q->env->once)
34624887Schin p->env->once = 0;
34634887Schin q->env->rex = 0;
34644887Schin if (p->env->flags & REG_LEFT)
34654887Schin {
34664887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0)))
34674887Schin {
34684887Schin regfree(p);
34694887Schin return fatal(p->env->disc, REG_ESPACE, NiL);
34704887Schin }
34714887Schin e->next = p->env->rex;
34724887Schin p->env->rex = e;
34734887Schin p->env->once = 1;
34744887Schin }
34754887Schin if (p->env->flags & REG_RIGHT)
34764887Schin {
34774887Schin for (f = p->env->rex; f->next; f = f->next);
34784887Schin if (f->type != REX_END)
34794887Schin {
34804887Schin if (!(e = node(&env, REX_END, 0, 0, 0)))
34814887Schin {
34824887Schin regfree(p);
34834887Schin return fatal(p->env->disc, REG_ESPACE, NiL);
34844887Schin }
34854887Schin f->next = e;
34864887Schin }
34874887Schin }
34884887Schin env.explicit = p->env->explicit;
34894887Schin env.flags = p->env->flags;
34904887Schin env.disc = p->env->disc;
34914887Schin if (stats(&env, p->env->rex))
34924887Schin {
34934887Schin regfree(p);
34944887Schin return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL);
34954887Schin }
34964887Schin if (special(&env, p))
34974887Schin {
34984887Schin regfree(p);
34994887Schin return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL);
35004887Schin }
35014887Schin p->env->min = g->re.trie.min;
35024887Schin return 0;
35034887Schin }
35044887Schin
35054887Schin /*
35064887Schin * copy a reference of p into q
35074887Schin * p and q may then have separate regsubcomp() instantiations
35084887Schin */
35094887Schin
35104887Schin int
regdup(regex_t * p,regex_t * q)35114887Schin regdup(regex_t* p, regex_t* q)
35124887Schin {
35134887Schin if (!p || !q)
35144887Schin return REG_BADPAT;
35154887Schin *q = *p;
35164887Schin p->env->refs++;
35174887Schin q->re_sub = 0;
35184887Schin return 0;
35194887Schin }
3520