14887Schin /*********************************************************************** 24887Schin * * 34887Schin * This software is part of the ast package * 4*10898Sroland.mainz@nrubsig.org * Copyright (c) 1985-2009 AT&T Intellectual Property * 54887Schin * and is licensed under the * 64887Schin * Common Public License, Version 1.0 * 78462SApril.Chin@Sun.COM * by AT&T Intellectual Property * 84887Schin * * 94887Schin * A copy of the License is available at * 104887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 124887Schin * * 134887Schin * Information and Software Systems Research * 144887Schin * AT&T Research * 154887Schin * Florham Park NJ * 164887Schin * * 174887Schin * Glenn Fowler <gsf@research.att.com> * 184887Schin * David Korn <dgk@research.att.com> * 194887Schin * Phong Vo <kpv@research.att.com> * 204887Schin * * 214887Schin ***********************************************************************/ 224887Schin #pragma prototyped 234887Schin 244887Schin /* 254887Schin * posix regex compiler 264887Schin */ 274887Schin 284887Schin #include "reglib.h" 294887Schin 304887Schin #if _PACKAGE_ast 314887Schin #include "lclib.h" 324887Schin #endif 334887Schin 344887Schin #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 354887Schin 364887Schin #define C_ESC (-1) 374887Schin #define C_MB (-2) 384887Schin 394887Schin #if _AST_REGEX_DEBUG 404887Schin 414887Schin #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 424887Schin #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 434887Schin #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 444887Schin 454887Schin static unsigned long debug; 464887Schin static unsigned long debug_flag; 474887Schin 484887Schin #else 494887Schin 504887Schin #define DEBUG_INIT() 514887Schin #define DEBUG_TEST(f,y,n) (n) 524887Schin #define DEBUG_CODE(f,y,n) do {n} while(0) 534887Schin 544887Schin #endif 554887Schin 564887Schin #if _PACKAGE_ast 574887Schin 584887Schin typedef struct Cchr_s 594887Schin { 604887Schin Dtlink_t lnk; 614887Schin unsigned char nam[2]; 624887Schin Ckey_t key; 634887Schin } Cchr_t; 644887Schin 654887Schin #endif 664887Schin 674887Schin #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 684887Schin 694887Schin /* 704887Schin * determine whether greedy matching will work, i.e. produce 714887Schin * the best match first. such expressions are "easy", and 724887Schin * need no backtracking once a complete match is found. 734887Schin * if an expression has backreferences or alts it's hard 744887Schin * else if it has only one closure it's easy 754887Schin * else if all closures are simple (i.e. one-character) it's easy 764887Schin * else it's hard. 774887Schin */ 784887Schin 794887Schin typedef struct Stats_s 804887Schin { 814887Schin unsigned long l; /* min length to left of x */ 824887Schin unsigned long k; /* min length to left of y */ 834887Schin unsigned long m; /* min length */ 844887Schin unsigned long n; /* max length */ 854887Schin unsigned short a; /* number of alternations */ 864887Schin unsigned short b; /* number of backrefs */ 874887Schin unsigned short c; /* number of closures */ 884887Schin unsigned short i; /* number of negations */ 894887Schin unsigned short p; /* number of named subexpressions */ 904887Schin unsigned short s; /* number of simple closures */ 914887Schin unsigned short t; /* number of tries */ 924887Schin unsigned short u; /* number of unnamed subexpressions */ 934887Schin Rex_t* x; /* max length REX_STRING */ 944887Schin Rex_t* y; /* max length REX_TRIE */ 954887Schin } Stats_t; 964887Schin 974887Schin typedef struct Token_s 984887Schin { 994887Schin unsigned long min; 1004887Schin unsigned long max; 1014887Schin short lex; 1024887Schin short len; 1034887Schin short esc; 1044887Schin short att; 1054887Schin short push; 1064887Schin } Token_t; 1074887Schin 1084887Schin typedef struct Cenv_s 1094887Schin { 1104887Schin int delimiter; /* pattern delimiter */ 1114887Schin int error; /* last error */ 1124887Schin int explicit; /* explicit match on this char */ 1134887Schin int mappeddot; /* inverse mapped '.' */ 1144887Schin int mappednewline; /* inverse mapped '\n' */ 1154887Schin int mappedslash; /* inverse mapped '/' */ 1164887Schin regflags_t flags; /* flags arg to regcomp */ 1174887Schin int type; /* BRE,ERE,ARE,SRE,KRE */ 1184887Schin unsigned char* cursor; /* curent point in re */ 1194887Schin unsigned char* pattern; /* the original pattern */ 1204887Schin unsigned char* literal; /* literal restart pattern */ 1214887Schin int parno; /* number of last open paren */ 1224887Schin int parnest; /* paren nest count */ 1234887Schin int posixkludge; /* to make * nonspecial */ 124*10898Sroland.mainz@nrubsig.org int regexp; /* <regexp.h> compatibility */ 1254887Schin Token_t token; /* token lookahead */ 1264887Schin Stats_t stats; /* RE statistics */ 1274887Schin int terminator; /* pattern terminator */ 1284887Schin Rex_t* paren[2*(BACK_REF_MAX+2)]; 1294887Schin /* paren[i]!=0 if \i defined */ 1304887Schin regex_t* regex; /* user handle */ 1314887Schin regdisc_t* disc; /* user discipline */ 1324887Schin unsigned char* map; /* external to native ccode map */ 1334887Schin unsigned char* MAP; /* fold and/or map */ 1344887Schin } Cenv_t; 1354887Schin 1364887Schin /* 1374887Schin * allocate a new Rex_t node 1384887Schin */ 1394887Schin 1404887Schin static Rex_t* 1414887Schin node(Cenv_t* env, int type, int lo, int hi, size_t extra) 1424887Schin { 1434887Schin register Rex_t* e; 1444887Schin 1454887Schin if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 1464887Schin { 1474887Schin memset(e, 0, sizeof(Rex_t) + extra); 1484887Schin e->type = type; 1494887Schin e->marked = 0; 1504887Schin e->lo = lo; 1514887Schin e->hi = hi; 1524887Schin e->flags = env->flags; 1534887Schin e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 1544887Schin e->explicit = env->explicit; 1554887Schin if (extra) 1564887Schin e->re.data = (char*)e + sizeof(Rex_t); 1574887Schin } 1584887Schin return e; 1594887Schin } 1604887Schin 1614887Schin /* 1624887Schin * free a Trie_node_t node 1634887Schin */ 1644887Schin 1654887Schin static void 1664887Schin triedrop(regdisc_t* disc, Trie_node_t* e) 1674887Schin { 1684887Schin if (e) 1694887Schin { 1704887Schin triedrop(disc, e->son); 1714887Schin triedrop(disc, e->sib); 1724887Schin alloc(disc, e, 0); 1734887Schin } 1744887Schin } 1754887Schin 1764887Schin /* 1774887Schin * free a Rex_t node 1784887Schin */ 1794887Schin 1804887Schin void 1814887Schin drop(regdisc_t* disc, Rex_t* e) 1824887Schin { 1834887Schin int i; 1844887Schin Rex_t* x; 1854887Schin 1864887Schin if (e && !(disc->re_flags & REG_NOFREE)) 1874887Schin do 1884887Schin { 1894887Schin switch (e->type) 1904887Schin { 1914887Schin case REX_ALT: 1924887Schin case REX_CONJ: 1934887Schin drop(disc, e->re.group.expr.binary.left); 1944887Schin drop(disc, e->re.group.expr.binary.right); 1954887Schin break; 1964887Schin case REX_GROUP: 1974887Schin case REX_GROUP_AHEAD: 1984887Schin case REX_GROUP_AHEAD_NOT: 1994887Schin case REX_GROUP_BEHIND: 2004887Schin case REX_GROUP_BEHIND_NOT: 2014887Schin case REX_GROUP_CUT: 2024887Schin case REX_NEG: 2034887Schin case REX_REP: 2044887Schin drop(disc, e->re.group.expr.rex); 2054887Schin break; 2064887Schin case REX_TRIE: 2074887Schin for (i = 0; i <= UCHAR_MAX; i++) 2084887Schin triedrop(disc, e->re.trie.root[i]); 2094887Schin break; 2104887Schin } 2114887Schin x = e->next; 2124887Schin alloc(disc, e, 0); 2134887Schin } while (e = x); 2144887Schin } 2154887Schin 2164887Schin /* 2174887Schin * mark e and descendants minimal 2184887Schin */ 2194887Schin 2204887Schin static void 2214887Schin mark(register Rex_t* e, int set) 2224887Schin { 2234887Schin if (e && !e->marked) 2244887Schin do 2254887Schin { 2264887Schin e->marked = 1; 2274887Schin if (set) 2284887Schin e->flags |= REG_MINIMAL; 2294887Schin else 2304887Schin e->flags &= ~REG_MINIMAL; 2314887Schin switch (e->type) 2324887Schin { 2334887Schin case REX_ALT: 2344887Schin case REX_CONJ: 2354887Schin case REX_GROUP_COND: 2364887Schin if (e->re.group.expr.binary.left) 2374887Schin mark(e->re.group.expr.binary.left, set); 2384887Schin if (e->re.group.expr.binary.right) 2394887Schin mark(e->re.group.expr.binary.right, set); 2404887Schin break; 2414887Schin case REX_GROUP: 2424887Schin case REX_GROUP_AHEAD: 2434887Schin case REX_GROUP_AHEAD_NOT: 2444887Schin case REX_GROUP_BEHIND: 2454887Schin case REX_GROUP_BEHIND_NOT: 2464887Schin case REX_GROUP_CUT: 2474887Schin case REX_NEG: 2484887Schin case REX_REP: 2494887Schin case REX_TRIE: 2504887Schin mark(e->re.group.expr.rex, set); 2514887Schin break; 2524887Schin } 2534887Schin } while (e = e->next); 2544887Schin } 2554887Schin 2564887Schin /* 2574887Schin * assign subexpression numbers by a preorder tree walk 2584887Schin */ 2594887Schin 2604887Schin static int 2614887Schin serialize(Cenv_t* env, Rex_t* e, int n) 2624887Schin { 2634887Schin do 2644887Schin { 2654887Schin e->serial = n++; 2664887Schin switch (e->type) 2674887Schin { 2684887Schin case REX_ALT: 2694887Schin case REX_GROUP_COND: 2704887Schin if (e->re.group.expr.binary.left) 2714887Schin n = serialize(env, e->re.group.expr.binary.left, n); 2724887Schin e->re.group.expr.binary.serial = n++; 2734887Schin if (e->re.group.expr.binary.right) 2744887Schin n = serialize(env, e->re.group.expr.binary.right, n); 2754887Schin break; 2764887Schin case REX_CONJ: 2774887Schin n = serialize(env, e->re.group.expr.binary.left, n); 2784887Schin n = serialize(env, e->re.group.expr.binary.right, n); 2794887Schin break; 2804887Schin case REX_GROUP: 2814887Schin case REX_GROUP_AHEAD: 2824887Schin case REX_GROUP_AHEAD_NOT: 2834887Schin case REX_GROUP_BEHIND: 2844887Schin case REX_GROUP_BEHIND_NOT: 2854887Schin case REX_GROUP_CUT: 2864887Schin case REX_NEG: 2874887Schin case REX_REP: 2884887Schin n = serialize(env, e->re.group.expr.rex, n); 2894887Schin break; 2904887Schin } 2914887Schin } while (e = e->next); 2924887Schin return n; 2934887Schin } 2944887Schin 2954887Schin /* 2964887Schin * catenate e and f into a sequence, collapsing them if possible 2974887Schin */ 2984887Schin 2994887Schin static Rex_t* 3004887Schin cat(Cenv_t* env, Rex_t* e, Rex_t* f) 3014887Schin { 3024887Schin Rex_t* g; 3034887Schin 3044887Schin if (!f) 3054887Schin { 3064887Schin drop(env->disc, e); 3074887Schin return 0; 3084887Schin } 3094887Schin if (e->type == REX_NULL) 3104887Schin { 3114887Schin drop(env->disc, e); 3124887Schin return f; 3134887Schin } 3144887Schin if (f->type == REX_NULL) 3154887Schin { 3164887Schin g = f->next; 3174887Schin f->next = 0; 3184887Schin drop(env->disc, f); 3194887Schin f = g; 3204887Schin } 3214887Schin else if (e->type == REX_DOT && f->type == REX_DOT) 3224887Schin { 3234887Schin unsigned int m = e->lo + f->lo; 3244887Schin unsigned int n = e->hi + f->hi; 3254887Schin 3264887Schin if (m <= RE_DUP_MAX) 3274887Schin { 3284887Schin if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 3294887Schin { 3304887Schin n = RE_DUP_INF; 3314887Schin goto combine; 3324887Schin } 3334887Schin else if (n <= RE_DUP_MAX) 3344887Schin { 3354887Schin combine: 3364887Schin e->lo = m; 3374887Schin e->hi = n; 3384887Schin g = f->next; 3394887Schin f->next = 0; 3404887Schin drop(env->disc, f); 3414887Schin f = g; 3424887Schin } 3434887Schin } 3444887Schin } 3454887Schin e->next = f; 3464887Schin return e; 3474887Schin } 3484887Schin 3494887Schin /* 3504887Schin * collect re statistics 3514887Schin */ 3524887Schin 3534887Schin static int 3544887Schin stats(register Cenv_t* env, register Rex_t* e) 3554887Schin { 3564887Schin register unsigned long n; 3574887Schin register unsigned long m; 3584887Schin unsigned long cm; 3594887Schin unsigned long nm; 3604887Schin unsigned long cn; 3614887Schin unsigned long nn; 3624887Schin unsigned long l; 3634887Schin unsigned long k; 3644887Schin unsigned long t; 3654887Schin Rex_t* q; 3664887Schin Rex_t* x; 3674887Schin Rex_t* y; 3684887Schin unsigned char c; 3694887Schin unsigned char b; 3704887Schin 3714887Schin do 3724887Schin { 3734887Schin switch (e->type) 3744887Schin { 3754887Schin case REX_ALT: 3764887Schin x = env->stats.x; 3774887Schin l = env->stats.l; 3784887Schin y = env->stats.y; 3794887Schin k = env->stats.k; 3804887Schin t = env->stats.t; 3814887Schin if (++env->stats.a <= 0) 3824887Schin return 1; 3834887Schin cm = env->stats.m; 3844887Schin env->stats.m = 0; 3854887Schin cn = env->stats.n; 3864887Schin env->stats.n = 0; 3874887Schin if (stats(env, e->re.group.expr.binary.left)) 3884887Schin return 1; 3894887Schin m = env->stats.m; 3904887Schin env->stats.m = 0; 3914887Schin n = env->stats.n; 3924887Schin env->stats.n = 0; 3934887Schin if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 3944887Schin return 1; 3954887Schin if (env->stats.m > m) 3964887Schin env->stats.m = m; 3974887Schin else 3984887Schin m = env->stats.m; 3994887Schin if ((env->stats.m += cm) < m) 4004887Schin return 1; 4014887Schin if (env->stats.n < n) 4024887Schin env->stats.n = n; 4034887Schin else 4044887Schin n = env->stats.n; 4054887Schin if ((env->stats.n += cn) < n) 4064887Schin return 1; 4074887Schin env->stats.x = x; 4084887Schin env->stats.l = l; 4094887Schin env->stats.y = y; 4104887Schin env->stats.k = k; 4114887Schin env->stats.t = t; 4124887Schin break; 4134887Schin case REX_BACK: 4144887Schin if (++env->stats.b <= 0) 4154887Schin return 1; 4164887Schin break; 4174887Schin case REX_CLASS: 4184887Schin case REX_COLL_CLASS: 4194887Schin case REX_DOT: 4204887Schin case REX_ONECHAR: 4214887Schin n = env->stats.m; 4224887Schin if ((env->stats.m += e->lo) < n) 4234887Schin return 1; 4244887Schin if (e->hi != RE_DUP_INF) 4254887Schin { 4264887Schin n = env->stats.n; 4274887Schin if ((env->stats.n += e->hi) < n) 4284887Schin return 1; 4294887Schin } 4304887Schin if (e->lo != e->hi) 4314887Schin { 4324887Schin if (++env->stats.c <= 0) 4334887Schin return 1; 4344887Schin if (++env->stats.s <= 0) 4354887Schin return 1; 4364887Schin } 4374887Schin break; 4384887Schin case REX_CONJ: 4394887Schin cm = env->stats.m; 4404887Schin env->stats.m = 0; 4414887Schin cn = env->stats.n; 4424887Schin env->stats.n = 0; 4434887Schin if (stats(env, e->re.group.expr.binary.left)) 4444887Schin return 1; 4454887Schin nm = env->stats.m; 4464887Schin env->stats.m = 0; 4474887Schin nn = env->stats.n; 4484887Schin env->stats.n = 0; 4494887Schin if (stats(env, e->re.group.expr.binary.right)) 4504887Schin return 1; 4514887Schin if (env->stats.m < nm) 4524887Schin env->stats.m = nm; 4534887Schin else 4544887Schin nm = env->stats.m; 4554887Schin if ((env->stats.m += cm) < nm) 4564887Schin return 1; 4574887Schin if (env->stats.n < nn) 4584887Schin env->stats.n = nn; 4594887Schin else 4604887Schin nn = env->stats.n; 4614887Schin if ((env->stats.n += cn) < nn) 4624887Schin return 1; 4634887Schin break; 4644887Schin case REX_GROUP: 4654887Schin if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 4664887Schin return 1; 4674887Schin if (stats(env, e->re.group.expr.rex)) 4684887Schin return 1; 4694887Schin break; 4704887Schin case REX_GROUP_AHEAD: 4714887Schin case REX_GROUP_AHEAD_NOT: 4724887Schin case REX_GROUP_BEHIND: 4734887Schin case REX_GROUP_BEHIND_NOT: 4744887Schin m = env->stats.m; 4754887Schin n = env->stats.n; 4764887Schin x = env->stats.x; 4774887Schin y = env->stats.y; 4784887Schin if (stats(env, e->re.group.expr.rex)) 4794887Schin return 1; 4804887Schin env->stats.m = m; 4814887Schin env->stats.n = n; 4824887Schin env->stats.x = x; 4834887Schin env->stats.y = y; 4844887Schin switch (e->type) 4854887Schin { 4864887Schin case REX_GROUP_AHEAD: 4874887Schin case REX_GROUP_BEHIND: 4884887Schin if (++env->stats.u <= 0) 4894887Schin return 1; 4904887Schin break; 4914887Schin } 4924887Schin break; 4934887Schin case REX_GROUP_COND: 4944887Schin if (++env->stats.u <= 0) 4954887Schin return 1; 4964887Schin m = env->stats.m; 4974887Schin n = env->stats.n; 4984887Schin x = env->stats.x; 4994887Schin y = env->stats.y; 5004887Schin if (e->re.group.size > 0 && ++env->stats.b <= 0) 5014887Schin return 1; 5024887Schin if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 5034887Schin return 1; 5044887Schin if (q = e->re.group.expr.binary.right) 5054887Schin { 5064887Schin if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 5074887Schin return 1; 5084887Schin if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 5094887Schin return 1; 5104887Schin } 5114887Schin env->stats.m = m; 5124887Schin env->stats.n = n; 5134887Schin env->stats.x = x; 5144887Schin env->stats.y = y; 5154887Schin break; 5164887Schin case REX_GROUP_CUT: 5174887Schin if (++env->stats.u <= 0) 5184887Schin return 1; 5194887Schin m = env->stats.m; 5204887Schin n = env->stats.n; 5214887Schin x = env->stats.x; 5224887Schin y = env->stats.y; 5234887Schin if (stats(env, e->re.group.expr.rex)) 5244887Schin return 1; 5254887Schin env->stats.m = m; 5264887Schin env->stats.n = n; 5274887Schin env->stats.x = x; 5284887Schin env->stats.y = y; 5294887Schin break; 5304887Schin case REX_NEG: 5314887Schin env->stats.i++; 5324887Schin x = env->stats.x; 5334887Schin l = env->stats.l; 5344887Schin y = env->stats.y; 5354887Schin k = env->stats.k; 5364887Schin t = env->stats.t; 5374887Schin cm = env->stats.m; 5384887Schin env->stats.m = 0; 5394887Schin if (stats(env, e->re.group.expr.rex)) 5404887Schin return 1; 5414887Schin env->stats.m = !env->stats.m; 5424887Schin if ((env->stats.m += cm) < cm) 5434887Schin return 1; 5444887Schin env->stats.x = x; 5454887Schin env->stats.l = l; 5464887Schin env->stats.y = y; 5474887Schin env->stats.k = k; 5484887Schin env->stats.t = t; 5494887Schin break; 5504887Schin case REX_REP: 5514887Schin x = env->stats.x; 5524887Schin l = env->stats.l; 5534887Schin y = env->stats.y; 5544887Schin k = env->stats.k; 5554887Schin t = env->stats.t; 5564887Schin if (++env->stats.c <= 0) 5574887Schin return 1; 5584887Schin b = env->stats.b; 5594887Schin c = env->stats.c; 5604887Schin cm = env->stats.m; 5614887Schin env->stats.m = 0; 5624887Schin if (stats(env, e->re.group.expr.rex)) 5634887Schin return 1; 5644887Schin if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 5654887Schin return 1; 5664887Schin if (e->lo < 1) 5674887Schin { 5684887Schin env->stats.x = x; 5694887Schin env->stats.l = l; 5704887Schin env->stats.y = y; 5714887Schin env->stats.k = k; 5724887Schin env->stats.t = t; 5734887Schin env->stats.m = cm; 5744887Schin } 5754887Schin else 5764887Schin { 5774887Schin m = env->stats.m; 5784887Schin if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 5794887Schin return 1; 5804887Schin m = env->stats.m; 5814887Schin if ((env->stats.m += cm) < m) 5824887Schin return 1; 5834887Schin if (env->stats.x != x) 5844887Schin env->stats.l = cm; 5854887Schin if (env->stats.y != y) 5864887Schin env->stats.k = cm; 5874887Schin } 5884887Schin break; 5894887Schin case REX_STRING: 5908462SApril.Chin@Sun.COM if (!e->map) 5914887Schin { 5928462SApril.Chin@Sun.COM cm = env->stats.m; 5938462SApril.Chin@Sun.COM if ((env->stats.m += e->re.string.size) < cm) 5948462SApril.Chin@Sun.COM return 1; 5958462SApril.Chin@Sun.COM cn = env->stats.n; 5968462SApril.Chin@Sun.COM if ((env->stats.n += e->re.string.size) < cn) 5978462SApril.Chin@Sun.COM return 1; 5988462SApril.Chin@Sun.COM if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 5998462SApril.Chin@Sun.COM { 6008462SApril.Chin@Sun.COM env->stats.x = e; 6018462SApril.Chin@Sun.COM env->stats.l = cm; 6028462SApril.Chin@Sun.COM } 6034887Schin } 6044887Schin break; 6054887Schin case REX_TRIE: 6064887Schin if (++env->stats.s <= 0) 6074887Schin return 1; 6084887Schin cm = env->stats.m; 6094887Schin if ((env->stats.m += e->re.trie.min) < cm) 6104887Schin return 1; 6114887Schin cn = env->stats.n; 6124887Schin if ((env->stats.n += e->re.trie.max) < cn) 6134887Schin return 1; 6144887Schin env->stats.t++; 6154887Schin if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 6164887Schin { 6174887Schin env->stats.y = e; 6184887Schin env->stats.k = cm; 6194887Schin } 6204887Schin break; 6214887Schin } 6224887Schin } while (e = e->next); 6234887Schin return 0; 6244887Schin } 6254887Schin 6264887Schin static int token(Cenv_t*); 6274887Schin 6284887Schin static int 6294887Schin magic(register Cenv_t* env, register int c, int escaped) 6304887Schin { 6314887Schin register char* sp; 6324887Schin register int n; 6334887Schin int o = c; 6344887Schin int e = env->error; 6354887Schin int l = env->token.len; 6364887Schin short* mp; 6374887Schin char* ep; 6384887Schin 6394887Schin if (mp = state.magic[c]) 6404887Schin { 6414887Schin c = mp[env->type+escaped]; 6424887Schin if (c >= T_META) 6434887Schin { 6444887Schin sp = (char*)env->cursor + env->token.len; 6454887Schin switch (c) 6464887Schin { 6474887Schin case T_LEFT: 6484887Schin n = 0; 6494887Schin ep = sp; 6504887Schin while (*sp >= '0' && *sp <= '9') 6514887Schin { 6524887Schin if (n > (INT_MAX / 10)) 6534887Schin { 6544887Schin env->error = REG_BADBR; 6554887Schin goto bad; 6564887Schin } 6574887Schin n = n * 10 + *sp++ - '0'; 6584887Schin } 6594887Schin if (sp == ep) 6604887Schin { 6618462SApril.Chin@Sun.COM if (env->type < SRE || *sp != ',') 6628462SApril.Chin@Sun.COM { 6638462SApril.Chin@Sun.COM env->error = *sp ? REG_BADBR : REG_EBRACE; 6648462SApril.Chin@Sun.COM goto bad; 6658462SApril.Chin@Sun.COM } 6664887Schin } 6674887Schin else if (n > RE_DUP_MAX) 6684887Schin { 6694887Schin env->error = REG_BADBR; 6704887Schin goto bad; 6714887Schin } 6724887Schin env->token.min = n; 6734887Schin if (*sp == ',') 6744887Schin { 6754887Schin n = 0; 6764887Schin ep = ++sp; 6774887Schin while (*sp >= '0' && *sp <= '9') 6784887Schin { 6794887Schin if (n > (INT_MAX / 10)) 6804887Schin { 6814887Schin env->error = REG_BADBR; 6824887Schin goto bad; 6834887Schin } 6844887Schin n = n * 10 + *sp++ - '0'; 6854887Schin } 6864887Schin if (sp == ep) 6874887Schin n = RE_DUP_INF; 6884887Schin else if (n < env->token.min) 6894887Schin { 6904887Schin env->error = REG_BADBR; 6914887Schin goto bad; 6924887Schin } 6934887Schin } 6944887Schin env->token.max = n; 6954887Schin switch (*sp) 6964887Schin { 6974887Schin case 0: 6984887Schin env->error = REG_EBRACE; 6994887Schin goto bad; 7004887Schin case '\\': 7014887Schin if (!escaped) 7024887Schin { 7034887Schin env->error = REG_BADBR; 7044887Schin goto bad; 7054887Schin } 7064887Schin sp++; 7074887Schin break; 7084887Schin default: 7094887Schin if (escaped) 7104887Schin { 7114887Schin env->error = REG_BADBR; 7124887Schin goto bad; 7134887Schin } 7144887Schin break; 7154887Schin } 7164887Schin switch (*sp++) 7174887Schin { 7184887Schin case 0: 7194887Schin env->error = REG_EBRACE; 7204887Schin goto bad; 7214887Schin case '}': 7224887Schin break; 7234887Schin default: 7244887Schin env->error = REG_BADBR; 7254887Schin goto bad; 7264887Schin } 7274887Schin env->token.len = sp - (char*)env->cursor; 7284887Schin break; 7294887Schin case T_RIGHT: 7304887Schin env->error = REG_EBRACE; 7314887Schin goto bad; 7324887Schin case T_OPEN: 7334887Schin if (env->type < SRE && *sp == '?') 7344887Schin { 7354887Schin env->token.len++; 7364887Schin env->token.lex = 0; 7374887Schin goto group; 7384887Schin } 7394887Schin break; 7404887Schin case T_ESCAPE: 7414887Schin c = chresc(sp - 2, &ep); 7424887Schin if (ep < sp) 7434887Schin goto bad; 7444887Schin env->token.len += ep - sp; 7454887Schin if (c >= T_META) 7464887Schin { 7474887Schin env->token.lex = c; 7484887Schin c = C_ESC; 7494887Schin } 7504887Schin return c; 7514887Schin case T_BACK+0: 7524887Schin case T_BACK+1: 7534887Schin case T_BACK+2: 7544887Schin case T_BACK+3: 7554887Schin case T_BACK+4: 7564887Schin case T_BACK+5: 7574887Schin case T_BACK+6: 7584887Schin case T_BACK+7: 7594887Schin n = chresc(sp - 2, &ep); 7604887Schin if (ep > sp + 1) 7614887Schin { 7624887Schin env->token.len += ep - sp; 7634887Schin return n; 7644887Schin } 7654887Schin /*FALLTHROUGH*/ 7664887Schin case T_BACK+8: 7674887Schin case T_BACK+9: 7684887Schin if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) 7694887Schin { 7704887Schin env->error = REG_BADESC; 7714887Schin goto bad; 7724887Schin } 7734887Schin if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 7744887Schin { 7754887Schin c = (c - T_BACK) * 10 + (*sp - '0'); 7764887Schin if (c > 0 && c <= env->parno && env->paren[c]) 7774887Schin c += T_BACK; 7784887Schin else 7794887Schin c = chresc(sp - 2, &ep); 7804887Schin env->token.len++; 7814887Schin } 7824887Schin if (c == T_BACK) 7834887Schin c = 0; 7844887Schin break; 7854887Schin case T_BAD: 7864887Schin if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) 7874887Schin return c; 7884887Schin goto bad; 7894887Schin } 7904887Schin if (env->type >= SRE) 7914887Schin { 7924887Schin if (c == T_DOT) 7934887Schin c = '.'; 7944887Schin else if (c < T_OPEN) 7954887Schin { 7964887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 7974887Schin { 7984887Schin env->token.len++; 7994887Schin env->token.att = 1; 8004887Schin } 8014887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '(') 8024887Schin { 8034887Schin env->token.len++; 8044887Schin switch (c) 8054887Schin { 8064887Schin case T_AT: 8074887Schin break; 8084887Schin case T_PERCENT: 8094887Schin env->token.lex = c; 8104887Schin goto group; 8114887Schin case T_TILDE: 8124887Schin env->token.lex = 0; 8134887Schin goto group; 8144887Schin default: 8154887Schin env->token.lex = c; 8164887Schin break; 8174887Schin } 8184887Schin c = T_OPEN; 8194887Schin } 8204887Schin else if (c == T_STAR) 8214887Schin c = T_DOTSTAR; 8224887Schin else if (c == T_QUES) 8234887Schin c = T_DOT; 8244887Schin else 8254887Schin { 8264887Schin c = o; 8274887Schin env->token.len = l; 8284887Schin } 8294887Schin } 8304887Schin else if (c > T_BACK) 8314887Schin { 8324887Schin c = (c - T_BACK) * 2 - 1; 8334887Schin c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 8344887Schin } 8354887Schin else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 8364887Schin { 8374887Schin if (c == T_AND) 8384887Schin c = '&'; 8394887Schin else if (c == T_BAR) 8404887Schin c = '|'; 8414887Schin else if (c == T_OPEN) 8424887Schin c = '('; 8434887Schin } 8444887Schin } 8454887Schin } 8464887Schin } 8474887Schin else if (escaped == 2) 8484887Schin { 8494887Schin if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 8504887Schin /*ok*/; 8514887Schin else 8524887Schin { 8534887Schin env->error = REG_BADESC; 8544887Schin goto bad; 8554887Schin } 8564887Schin } 8574887Schin else if (escaped && !(env->flags & REG_LENIENT) && c != ']') 8584887Schin { 8594887Schin env->error = REG_BADESC; 8604887Schin goto bad; 8614887Schin } 8624887Schin return c; 8634887Schin group: 8644887Schin sp = (char*)env->cursor + env->token.len; 8654887Schin switch (*sp++) 8664887Schin { 8674887Schin case ')': 8684887Schin break; 8694887Schin case '#': 8704887Schin for (;;) 8714887Schin { 8724887Schin switch (*sp++) 8734887Schin { 8744887Schin case 0: 8754887Schin env->error = REG_EPAREN; 8764887Schin return T_BAD; 8774887Schin case ')': 8784887Schin break; 8794887Schin default: 8804887Schin continue; 8814887Schin } 8824887Schin break; 8834887Schin } 8844887Schin break; 8854887Schin default: 8864887Schin return T_GROUP; 8874887Schin } 8884887Schin env->cursor = (unsigned char*)sp; 8894887Schin return token(env); 8904887Schin bad: 8914887Schin if (escaped == 2) 8924887Schin env->error = e; 8934887Schin else if (env->flags & REG_LENIENT) 8944887Schin return o; 8954887Schin else if (escaped == 1 && !env->error) 8964887Schin { 8974887Schin if (mp || o == ']') 8984887Schin return o; 8994887Schin env->error = REG_BADESC; 9004887Schin } 9014887Schin return T_BAD; 9024887Schin } 9034887Schin 9044887Schin static int 9054887Schin token(register Cenv_t* env) 9064887Schin { 9074887Schin int c; 9084887Schin int posixkludge; 9094887Schin 9104887Schin if (env->token.push) 9114887Schin return env->token.lex; 9124887Schin env->token.att = env->token.esc = 0; 9134887Schin if ((env->token.len = MBSIZE(env->cursor)) > 1) 9144887Schin return env->token.lex = C_MB; 9154887Schin env->token.lex = 0; 9164887Schin for (;;) 9174887Schin { 9184887Schin c = *env->cursor; 9194887Schin if (c == 0 || c == env->delimiter || c == env->terminator) 9204887Schin return T_END; 9214887Schin if (!(env->flags & REG_COMMENT)) 9224887Schin break; 9234887Schin if (c == '#') 9244887Schin { 9254887Schin do 9264887Schin { 9274887Schin c = *++env->cursor; 9284887Schin if (c == 0 || c == env->delimiter) 9294887Schin return T_END; 9304887Schin } while (c != '\n'); 9314887Schin } 9324887Schin else if (!isspace(c)) 9334887Schin break; 9344887Schin env->cursor++; 9354887Schin } 9364887Schin if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 9374887Schin { 9384887Schin if (env->parnest) 9394887Schin { 9404887Schin env->error = REG_EPAREN; 9414887Schin return T_BAD; 9424887Schin } 9434887Schin env->parno = 0; 9444887Schin env->pattern = env->cursor + 1; 9454887Schin return T_BAR; 9464887Schin } 9474887Schin if (env->flags & REG_LITERAL) 9484887Schin return c; 9494887Schin if (posixkludge = env->posixkludge) 9504887Schin { 9514887Schin env->posixkludge = 0; 9524887Schin if (c == '*') 9534887Schin return c; 9544887Schin } 9554887Schin if (c == '\\') 9564887Schin { 9574887Schin if (env->flags & REG_SHELL_ESCAPED) 9584887Schin return c; 9594887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator) 9604887Schin { 9614887Schin if (env->flags & REG_LENIENT) 9624887Schin { 9634887Schin if (c) 9644887Schin { 9654887Schin env->token.esc = env->token.len; 9664887Schin env->token.len += MBSIZE(env->cursor + 1); 9674887Schin return c; 9684887Schin } 9694887Schin return '\\'; 9704887Schin } 9714887Schin env->error = REG_EESCAPE; 9724887Schin return T_BAD; 9734887Schin } 9744887Schin env->token.esc = env->token.len; 9754887Schin env->token.len += MBSIZE(env->cursor + 1); 9764887Schin if (env->delimiter && c == 'n') 9774887Schin return '\n'; 9784887Schin else if (c == env->delimiter) 9794887Schin return magic(env, c, 0); 9804887Schin else if (c == '(' && env->type == BRE) 9814887Schin env->posixkludge = 1; 9824887Schin else if (c == ')' && env->type == BRE && env->parnest <= 0) 9834887Schin { 9844887Schin env->error = REG_EPAREN; 9854887Schin return T_BAD; 9864887Schin } 9874887Schin else if (isspace(c) && (env->flags & REG_COMMENT)) 9884887Schin return c; 9894887Schin return magic(env, c, 1); 9904887Schin } 9914887Schin else if (c == '$') 9924887Schin { 9934887Schin if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 9944887Schin return T_DOLL; 9954887Schin } 9964887Schin else if (c == '^') 9974887Schin { 9984887Schin if (env->type == BRE && (env->cursor == env->pattern || posixkludge)) 9994887Schin { 10004887Schin env->posixkludge = 1; 10014887Schin return T_CFLX; 10024887Schin } 10034887Schin } 10044887Schin else if (c == ')') 10054887Schin { 10064887Schin if (env->type != BRE && env->parnest <= 0) 10074887Schin return c; 10084887Schin } 10094887Schin else if (c == '/' && env->explicit == env->mappedslash) 10104887Schin { 10114887Schin while (*(env->cursor + env->token.len) == c) 10124887Schin env->token.len++; 10134887Schin return T_SLASHPLUS; 10144887Schin } 10154887Schin return magic(env, c, 0); 10164887Schin } 10174887Schin 10184887Schin static Celt_t* 10194887Schin col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 10204887Schin { 10218462SApril.Chin@Sun.COM register char* s; 10224887Schin register unsigned char* k; 10234887Schin register unsigned char* e; 10244887Schin register int c; 10254887Schin register int cc; 10264887Schin int bt; 10274887Schin int et; 10284887Schin Ckey_t key; 10294887Schin 10304887Schin cc = 0; 10314887Schin for (;;) 10324887Schin { 10334887Schin k = key; 10344887Schin if (bw == 1) 10354887Schin { 10364887Schin c = bc; 10374887Schin if (ic) 10384887Schin { 10398462SApril.Chin@Sun.COM if (isupper(c)) 10408462SApril.Chin@Sun.COM { 10418462SApril.Chin@Sun.COM c = tolower(c); 10428462SApril.Chin@Sun.COM cc = -1; 10438462SApril.Chin@Sun.COM } 10448462SApril.Chin@Sun.COM else if (islower(c)) 10458462SApril.Chin@Sun.COM { 10468462SApril.Chin@Sun.COM c = toupper(c); 10478462SApril.Chin@Sun.COM cc = -1; 10488462SApril.Chin@Sun.COM } 10498462SApril.Chin@Sun.COM } 10508462SApril.Chin@Sun.COM *k++ = c; 10518462SApril.Chin@Sun.COM } 10528462SApril.Chin@Sun.COM else if (bw < COLL_KEY_MAX) 10538462SApril.Chin@Sun.COM { 10548462SApril.Chin@Sun.COM s = (char*)bp; 10558462SApril.Chin@Sun.COM if (ic) 10568462SApril.Chin@Sun.COM { 10578462SApril.Chin@Sun.COM c = mbchar(s); 10584887Schin if (iswupper(c)) 10594887Schin { 10604887Schin c = towlower(c); 10614887Schin cc = 1; 10624887Schin } 10634887Schin else if (iswlower(c)) 10644887Schin { 10654887Schin c = towupper(c); 10664887Schin cc = 1; 10674887Schin } 10684887Schin } 10698462SApril.Chin@Sun.COM if (cc > 0) 10704887Schin { 10718462SApril.Chin@Sun.COM cc = -1; 10728462SApril.Chin@Sun.COM k += wctomb((char*)k, c); 10734887Schin } 10748462SApril.Chin@Sun.COM else 10758462SApril.Chin@Sun.COM for (e = k + bw; k < e; *k++ = *s++); 10764887Schin } 10774887Schin *k = 0; 10784887Schin mbxfrm(ce->beg, key, COLL_KEY_MAX); 10794887Schin if (ep) 10804887Schin { 10814887Schin k = key; 10824887Schin c = mbchar(k); 10834887Schin if (iswupper(c)) 10844887Schin bt = COLL_range_uc; 10854887Schin else if (iswlower(c)) 10864887Schin bt = COLL_range_lc; 10874887Schin else 10884887Schin bt = COLL_range; 10894887Schin k = key; 10904887Schin if (ew == 1) 10914887Schin { 10924887Schin c = ec; 10934887Schin if (ic) 10944887Schin { 10958462SApril.Chin@Sun.COM if (isupper(c)) 10968462SApril.Chin@Sun.COM { 10978462SApril.Chin@Sun.COM c = tolower(c); 10988462SApril.Chin@Sun.COM cc = -1; 10998462SApril.Chin@Sun.COM } 11008462SApril.Chin@Sun.COM else if (islower(c)) 11018462SApril.Chin@Sun.COM { 11028462SApril.Chin@Sun.COM c = toupper(c); 11038462SApril.Chin@Sun.COM cc = -1; 11048462SApril.Chin@Sun.COM } 11058462SApril.Chin@Sun.COM } 11068462SApril.Chin@Sun.COM *k++ = c; 11078462SApril.Chin@Sun.COM } 11088462SApril.Chin@Sun.COM else if (ew < COLL_KEY_MAX) 11098462SApril.Chin@Sun.COM { 11108462SApril.Chin@Sun.COM s = (char*)ep; 11118462SApril.Chin@Sun.COM if (ic) 11128462SApril.Chin@Sun.COM { 11138462SApril.Chin@Sun.COM c = mbchar(s); 11144887Schin if (iswupper(c)) 11154887Schin { 11164887Schin c = towlower(c); 11174887Schin cc = 1; 11184887Schin } 11194887Schin else if (iswlower(c)) 11204887Schin { 11214887Schin c = towupper(c); 11224887Schin cc = 1; 11234887Schin } 11244887Schin } 11258462SApril.Chin@Sun.COM if (cc > 0) 11264887Schin { 11278462SApril.Chin@Sun.COM cc = -1; 11288462SApril.Chin@Sun.COM k += wctomb((char*)k, c); 11294887Schin } 11308462SApril.Chin@Sun.COM else 11318462SApril.Chin@Sun.COM for (e = k + ew; k < e; *k++ = *s++); 11324887Schin } 11334887Schin *k = 0; 11344887Schin mbxfrm(ce->end, key, COLL_KEY_MAX); 11354887Schin k = key; 11364887Schin c = mbchar(k); 11374887Schin if (iswupper(c)) 11384887Schin et = COLL_range_uc; 11394887Schin else if (iswlower(c)) 11404887Schin et = COLL_range_lc; 11414887Schin else 11424887Schin et = COLL_range; 11434887Schin ce->typ = bt == et ? bt : COLL_range; 11444887Schin } 11454887Schin else 11464887Schin ce->typ = COLL_char; 11474887Schin ce++; 11484887Schin if (!ic || !cc) 11494887Schin break; 11504887Schin ic = 0; 11514887Schin } 11524887Schin return ce; 11534887Schin } 11544887Schin 11554887Schin static Rex_t* 11564887Schin bra(Cenv_t* env) 11574887Schin { 11584887Schin Rex_t* e; 11594887Schin int c; 11604887Schin int i; 11614887Schin int w; 11624887Schin int neg; 11634887Schin int last; 11644887Schin int inrange; 11654887Schin int complicated; 11664887Schin int collate; 11674887Schin int elements; 11684887Schin unsigned char* first; 11694887Schin unsigned char* start; 11704887Schin unsigned char* begin; 11714887Schin unsigned char* s; 11724887Schin regclass_t f; 11734887Schin unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 11744887Schin #if _PACKAGE_ast 11754887Schin int ic; 11764887Schin char mbc[COLL_KEY_MAX + 1]; 11774887Schin #endif 11784887Schin 11794887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 11804887Schin return 0; 11814887Schin collate = complicated = elements = 0; 11824887Schin if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 11834887Schin { 11844887Schin env->cursor++; 11854887Schin neg = 1; 11864887Schin } 11874887Schin else 11884887Schin neg = 0; 11898462SApril.Chin@Sun.COM first = env->cursor; 11908462SApril.Chin@Sun.COM start = first + MBSIZE(first); 11914887Schin if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 11924887Schin goto error; 11934887Schin begin = env->cursor + MBSIZE(env->cursor); 11944887Schin 11954887Schin /* 11964887Schin * inrange: 0=no, 1=possibly, 2=definitely 11974887Schin */ 11984887Schin 11994887Schin inrange = 0; 12004887Schin for (;;) 12014887Schin { 12024887Schin if (!(c = *env->cursor) || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 12034887Schin goto error; 12044887Schin env->cursor += (w = MBSIZE(env->cursor)); 12054887Schin if (c == '\\') 12064887Schin { 12074887Schin if (*env->cursor) 12084887Schin { 12094887Schin if (*env->cursor == 'n') 12104887Schin { 12114887Schin env->cursor++; 12124887Schin c = '\n'; 12134887Schin } 12144887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 12154887Schin { 12164887Schin env->token.len = 1; 12174887Schin w = magic(env, *env->cursor, 2); 12184887Schin if (env->token.len > 1 || w != T_BAD) 12194887Schin { 12204887Schin if (env->token.len == 1 && (f = classfun(w))) 12214887Schin { 12224887Schin if (inrange > 1) 12234887Schin { 12244887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 12254887Schin goto erange; 12264887Schin inrange = 0; 12274887Schin } 12284887Schin env->cursor++; 12294887Schin for (c = 0; c <= UCHAR_MAX; c++) 12304887Schin if ((*f)(c)) 12314887Schin setadd(e->re.charclass, c); 12324887Schin complicated++; 12334887Schin elements++; 12344887Schin continue; 12354887Schin } 12364887Schin if (env->token.len > 1 || w >= 0 && w < T_META) 12374887Schin { 12384887Schin c = w; 12394887Schin if (c > UCHAR_MAX) 12404887Schin { 12414887Schin if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) 12424887Schin goto erange; 12434887Schin c = UCHAR_MAX; 12444887Schin } 12454887Schin env->cursor += env->token.len; 12464887Schin } 12474887Schin } 12484887Schin } 12494887Schin } 12504887Schin } 12514887Schin else if (c == ']') 12524887Schin { 12534887Schin if (env->cursor == begin) 12544887Schin { 12554887Schin last = c; 12564887Schin inrange = 1; 12574887Schin continue; 12584887Schin } 12594887Schin if (inrange != 0) 12604887Schin { 12614887Schin setadd(e->re.charclass, last); 12624887Schin elements++; 12634887Schin if (inrange == 2) 12644887Schin { 12654887Schin setadd(e->re.charclass, '-'); 12664887Schin elements++; 12674887Schin } 12684887Schin } 12694887Schin break; 12704887Schin } 12714887Schin else if (c == '-') 12724887Schin { 12734887Schin if (!inrange && env->cursor != begin && *env->cursor != ']') 12744887Schin { 12754887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 12764887Schin goto erange; 12774887Schin continue; 12784887Schin } 12794887Schin else if (inrange == 1) 12804887Schin { 12814887Schin inrange = 2; 12824887Schin complicated++; 12834887Schin continue; 12844887Schin } 12854887Schin } 12864887Schin else if (c == '[') 12874887Schin { 12884887Schin switch (*env->cursor) 12894887Schin { 12904887Schin case 0: 12914887Schin goto error; 12924887Schin case ':': 1293*10898Sroland.mainz@nrubsig.org if (env->regexp) 1294*10898Sroland.mainz@nrubsig.org goto normal; 12954887Schin if (inrange == 1) 12964887Schin { 12974887Schin setadd(e->re.charclass, last); 12984887Schin elements++; 12994887Schin } 13004887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 13014887Schin { 13024887Schin if (env->cursor == start && (c = *(env->cursor + 1))) 13034887Schin { 13044887Schin s = start = env->cursor + 1; 13054887Schin while (*++s && *s != ':'); 13064887Schin if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 13074887Schin { 13084887Schin if ((i = (s - start)) == 1) 13094887Schin { 13104887Schin switch (c) 13114887Schin { 13124887Schin case '<': 13134887Schin i = REX_WBEG; 13144887Schin break; 13154887Schin case '>': 13164887Schin i = REX_WEND; 13174887Schin break; 13184887Schin default: 13194887Schin i = 0; 13204887Schin break; 13214887Schin } 13224887Schin if (i) 13234887Schin { 13244887Schin env->cursor = s + 3; 13254887Schin drop(env->disc, e); 13264887Schin return node(env, i, 0, 0, 0); 13274887Schin } 13284887Schin } 13294887Schin } 13304887Schin } 13314887Schin env->error = REG_ECTYPE; 13324887Schin goto error; 13334887Schin } 13344887Schin for (c = 0; c <= UCHAR_MAX; c++) 13354887Schin if ((*f)(c)) 13364887Schin setadd(e->re.charclass, c); 13374887Schin inrange = 0; 13384887Schin complicated++; 13394887Schin elements++; 13404887Schin continue; 13414887Schin case '=': 1342*10898Sroland.mainz@nrubsig.org if (env->regexp) 1343*10898Sroland.mainz@nrubsig.org goto normal; 13444887Schin if (inrange == 2) 13454887Schin goto erange; 13464887Schin if (inrange == 1) 13474887Schin { 13484887Schin setadd(e->re.charclass, last); 13494887Schin elements++; 13504887Schin } 13514887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 13524887Schin goto ecollate; 13534887Schin if (c > 1) 13544887Schin collate++; 13554887Schin else 13564887Schin setadd(e->re.charclass, buf[0]); 13574887Schin c = buf[0]; 13584887Schin inrange = 0; 13594887Schin complicated++; 13604887Schin elements++; 13614887Schin continue; 13624887Schin case '.': 1363*10898Sroland.mainz@nrubsig.org if (env->regexp) 1364*10898Sroland.mainz@nrubsig.org goto normal; 13654887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 13664887Schin goto ecollate; 13674887Schin if (c > 1) 13684887Schin collate++; 13694887Schin c = buf[0]; 13704887Schin complicated++; 13714887Schin break; 13724887Schin default: 1373*10898Sroland.mainz@nrubsig.org normal: 13744887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 13754887Schin goto error; 13764887Schin break; 13774887Schin } 13784887Schin } 13794887Schin else if (w > 1) 13804887Schin complicated++; 13814887Schin if (inrange == 2) 13824887Schin { 13834887Schin if (last > c) 13844887Schin { 13854887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 13864887Schin goto erange; 13874887Schin setadd(e->re.charclass, last); 13884887Schin setadd(e->re.charclass, c); 13894887Schin } 13904887Schin else 13914887Schin for (i = last; i <= c; i++) 13924887Schin setadd(e->re.charclass, i); 13934887Schin inrange = env->type >= SRE || (env->flags & REG_LENIENT); 13944887Schin elements += 2; 13954887Schin } 13964887Schin else if (inrange == 1) 13974887Schin { 13984887Schin setadd(e->re.charclass, last); 13994887Schin elements++; 14004887Schin } 14014887Schin else 14024887Schin inrange = 1; 14034887Schin last = c; 14044887Schin } 14054887Schin #if _PACKAGE_ast 14064887Schin if (complicated && mbcoll()) 14074887Schin { 14084887Schin Dt_t* dt; 14094887Schin Cchr_t* cc; 14104887Schin Cchr_t* tc; 14114887Schin Cchr_t* xc; 14124887Schin Celt_t* ce; 14134887Schin Cchr_t key; 14144887Schin int rw; 14154887Schin int rc; 14168462SApril.Chin@Sun.COM int wc; 14174887Schin unsigned char* rp; 14184887Schin unsigned char* pp; 14198462SApril.Chin@Sun.COM char* wp; 14208462SApril.Chin@Sun.COM char cb[2][COLL_KEY_MAX+1]; 14214887Schin 14224887Schin static Dtdisc_t disc; 14234887Schin 14244887Schin static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 14254887Schin 14264887Schin if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 14274887Schin { 14284887Schin disc.key = offsetof(Cchr_t, key); 14294887Schin if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) 14304887Schin { 14314887Schin for (i = 0; i < elementsof(primary) - 1; i++, cc++) 14324887Schin { 14334887Schin cc->nam[0] = primary[i]; 14344887Schin mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 14354887Schin dtinsert(dt, cc); 14364887Schin } 14374887Schin for (i = 0; i < elementsof(cc->key); i++) 14384887Schin cc->key[i] = ~0; 14394887Schin dtinsert(dt, cc); 14404887Schin LCINFO(AST_LC_COLLATE)->data = (void*)dt; 14414887Schin } 14424887Schin else 14434887Schin { 14444887Schin if (cc) 14454887Schin free(cc); 14464887Schin drop(env->disc, e); 14474887Schin return 0; 14484887Schin } 14494887Schin } 14504887Schin if (dt) 14514887Schin { 14524887Schin drop(env->disc, e); 14534887Schin if (ic = env->flags & REG_ICASE) 14544887Schin elements *= 2; 14558462SApril.Chin@Sun.COM if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t)))) 14564887Schin return 0; 14574887Schin ce = (Celt_t*)e->re.data; 14584887Schin e->re.collate.invert = neg; 14594887Schin e->re.collate.elements = ce; 14604887Schin env->cursor = first; 14614887Schin inrange = 0; 14624887Schin for (;;) 14634887Schin { 14644887Schin if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 14654887Schin goto error; 14664887Schin pp = env->cursor; 14674887Schin env->cursor += (w = MBSIZE(env->cursor)); 14684887Schin if (c == '\\') 14694887Schin { 14704887Schin if (*env->cursor) 14714887Schin { 14724887Schin if (*env->cursor == 'n') 14734887Schin { 14744887Schin pp = env->cursor++; 14754887Schin c = '\n'; 14764887Schin } 14774887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 14784887Schin { 14794887Schin env->token.len = 1; 14804887Schin w = magic(env, *env->cursor, 2); 14814887Schin if (env->token.len > 1 || w != T_BAD) 14824887Schin { 14834887Schin if (env->token.len == 1 && (f = classfun(w))) 14844887Schin { 14854887Schin if (inrange > 1) 14864887Schin { 14874887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 14884887Schin goto erange; 14894887Schin inrange = 0; 14904887Schin } 14914887Schin env->cursor++; 14924887Schin ce->fun = f; 14934887Schin ce->typ = COLL_call; 14944887Schin ce++; 14954887Schin continue; 14964887Schin } 14974887Schin if (env->token.len > 1 || w >= 0 && w < T_META) 14984887Schin { 14994887Schin c = w; 15004887Schin w = wctomb(mbc, c); 15014887Schin pp = (unsigned char*)mbc; 15024887Schin env->cursor += env->token.len; 15034887Schin } 15044887Schin } 15054887Schin } 15064887Schin } 15074887Schin } 15084887Schin else if (c == ']') 15094887Schin { 15104887Schin if (env->cursor == begin) 15114887Schin { 15124887Schin rp = pp; 15134887Schin rw = w; 15144887Schin inrange = 1; 15154887Schin continue; 15164887Schin } 15174887Schin if (inrange != 0) 15184887Schin { 15194887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15204887Schin if (inrange == 2) 15214887Schin ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 15224887Schin } 15234887Schin break; 15244887Schin } 15254887Schin else if (c == '-') 15264887Schin { 15274887Schin if (!inrange && env->cursor != begin && *env->cursor != ']') 15284887Schin { 15294887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 15304887Schin goto erange; 15314887Schin continue; 15324887Schin } 15334887Schin else if (inrange == 1) 15344887Schin { 15354887Schin inrange = 2; 15364887Schin continue; 15374887Schin } 15384887Schin } 15394887Schin else if (c == '[') 15404887Schin { 15414887Schin switch (*env->cursor) 15424887Schin { 15434887Schin case 0: 15444887Schin goto error; 15454887Schin case ':': 1546*10898Sroland.mainz@nrubsig.org if (env->regexp) 1547*10898Sroland.mainz@nrubsig.org goto complicated_normal; 15484887Schin if (inrange == 1) 15494887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15504887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 15514887Schin { 15524887Schin if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 15534887Schin { 15544887Schin switch (c) 15554887Schin { 15564887Schin case '<': 15574887Schin i = REX_WBEG; 15584887Schin break; 15594887Schin case '>': 15604887Schin i = REX_WEND; 15614887Schin break; 15624887Schin default: 15634887Schin i = 0; 15644887Schin break; 15654887Schin } 15664887Schin if (i) 15674887Schin { 15684887Schin env->cursor += 5; 15694887Schin drop(env->disc, e); 15704887Schin return node(env, i, 0, 0, 0); 15714887Schin } 15724887Schin } 15734887Schin env->error = REG_ECTYPE; 15744887Schin goto error; 15754887Schin } 15764887Schin ce->fun = f; 15774887Schin ce->typ = COLL_call; 15784887Schin ce++; 15794887Schin inrange = 0; 15804887Schin continue; 15814887Schin case '=': 1582*10898Sroland.mainz@nrubsig.org if (env->regexp) 1583*10898Sroland.mainz@nrubsig.org goto complicated_normal; 15844887Schin if (inrange == 2) 15854887Schin goto erange; 15864887Schin if (inrange == 1) 15874887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15888462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange]; 15894887Schin rp = env->cursor + 1; 15908462SApril.Chin@Sun.COM if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 15914887Schin goto ecollate; 15928462SApril.Chin@Sun.COM wp = (char*)pp; 15938462SApril.Chin@Sun.COM wc = mbchar(wp); 15944887Schin c = 0; 15954887Schin if (ic) 15968462SApril.Chin@Sun.COM { 15978462SApril.Chin@Sun.COM if (iswupper(wc)) 15988462SApril.Chin@Sun.COM { 15998462SApril.Chin@Sun.COM wc = towlower(wc); 16008462SApril.Chin@Sun.COM rw = wctomb((char*)pp, wc); 16018462SApril.Chin@Sun.COM c = 'u'; 16028462SApril.Chin@Sun.COM } 16038462SApril.Chin@Sun.COM else if (iswlower(wc)) 16048462SApril.Chin@Sun.COM c = 'l'; 16058462SApril.Chin@Sun.COM } 16064887Schin for (;;) 16074887Schin { 16088462SApril.Chin@Sun.COM mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); 16098462SApril.Chin@Sun.COM if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) 16108462SApril.Chin@Sun.COM goto ecollate; 16118462SApril.Chin@Sun.COM xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; 16128462SApril.Chin@Sun.COM if (c == 'l' || c == 'L' && !(c = 0)) 16134887Schin ce->typ = COLL_range_lc; 16148462SApril.Chin@Sun.COM else if (c == 'u' || c == 'U' && !(c = 0)) 16154887Schin ce->typ = COLL_range_uc; 16164887Schin else 16174887Schin ce->typ = COLL_range; 16184887Schin strcpy((char*)ce->beg, (char*)xc->key); 16194887Schin if (!(cc = (Cchr_t*)dtnext(dt, cc))) 16204887Schin goto ecollate; 16214887Schin if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 16224887Schin cc = tc; 16234887Schin strcpy((char*)ce->end, (char*)cc->key); 16244887Schin ce->max = -1; 16254887Schin ce++; 16264887Schin if (!c) 16274887Schin break; 16288462SApril.Chin@Sun.COM if (c == 'u') 16298462SApril.Chin@Sun.COM { 16308462SApril.Chin@Sun.COM wc = towlower(wc); 16318462SApril.Chin@Sun.COM c = 'L'; 16328462SApril.Chin@Sun.COM } 16338462SApril.Chin@Sun.COM else 16348462SApril.Chin@Sun.COM { 16358462SApril.Chin@Sun.COM wc = towupper(wc); 16368462SApril.Chin@Sun.COM c = 'U'; 16378462SApril.Chin@Sun.COM } 16388462SApril.Chin@Sun.COM rw = wctomb((char*)pp, wc); 16394887Schin } 16404887Schin inrange = 0; 16418462SApril.Chin@Sun.COM c = *pp; 16424887Schin continue; 16434887Schin case '.': 1644*10898Sroland.mainz@nrubsig.org if (env->regexp) 1645*10898Sroland.mainz@nrubsig.org goto complicated_normal; 16468462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange]; 16478462SApril.Chin@Sun.COM if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 16484887Schin goto ecollate; 16494887Schin c = buf[0]; 16504887Schin break; 16514887Schin default: 1652*10898Sroland.mainz@nrubsig.org complicated_normal: 16534887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 16544887Schin goto error; 16554887Schin break; 16564887Schin } 16574887Schin } 16584887Schin if (inrange == 2) 16594887Schin { 16604887Schin ce = col(ce, ic, rp, rw, rc, pp, w, c); 16614887Schin if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 16624887Schin { 16634887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 16644887Schin goto erange; 16654887Schin (ce-1)->typ = COLL_char; 16664887Schin strcpy((char*)ce->beg, (char*)(ce-1)->end); 16674887Schin ce->typ = COLL_char; 16684887Schin ce++; 16694887Schin } 16704887Schin inrange = env->type >= SRE || (env->flags & REG_LENIENT); 16714887Schin } 16724887Schin else if (inrange == 1) 16734887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 16744887Schin else 16754887Schin inrange = 1; 16764887Schin rp = pp; 16774887Schin rw = w; 16784887Schin rc = c; 16794887Schin } 16804887Schin ce->typ = COLL_end; 16814887Schin return e; 16824887Schin } 16834887Schin } 16844887Schin #endif 16854887Schin if (collate) 16864887Schin goto ecollate; 16874887Schin if (env->flags & REG_ICASE) 16884887Schin for (i = 0; i <= UCHAR_MAX; i++) 16894887Schin if (settst(e->re.charclass, i)) 16904887Schin { 16914887Schin if (isupper(i)) 16924887Schin c = tolower(i); 16934887Schin else if (islower(i)) 16944887Schin c = toupper(i); 16954887Schin else 16964887Schin continue; 16974887Schin setadd(e->re.charclass, c); 16984887Schin } 16994887Schin if (neg) 17004887Schin { 17014887Schin for (i = 0; i < elementsof(e->re.charclass->bits); i++) 17024887Schin e->re.charclass->bits[i] ^= ~0; 17034887Schin if (env->explicit >= 0) 17044887Schin setclr(e->re.charclass, env->explicit); 17054887Schin } 17064887Schin return e; 17074887Schin ecollate: 17084887Schin env->error = REG_ECOLLATE; 17094887Schin goto error; 17104887Schin erange: 17114887Schin env->error = REG_ERANGE; 17124887Schin error: 17134887Schin drop(env->disc, e); 17144887Schin if (!env->error) 17154887Schin env->error = REG_EBRACK; 17164887Schin return 0; 17174887Schin } 17184887Schin 17194887Schin static Rex_t* 17204887Schin ccl(Cenv_t* env, int type) 17214887Schin { 17224887Schin int i; 17234887Schin Rex_t* e; 17244887Schin Celt_t* ce; 17254887Schin regclass_t f; 17264887Schin 17274887Schin if (!(f = classfun(type))) 17284887Schin { 17294887Schin env->error = REG_BADESC; 17304887Schin return 0; 17314887Schin } 17324887Schin if (!mbcoll()) 17334887Schin { 17344887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 17354887Schin return 0; 17364887Schin for (i = 0; i <= UCHAR_MAX; i++) 17374887Schin if ((*f)(i)) 17384887Schin setadd(e->re.charclass, i); 17394887Schin if (env->explicit >= 0) 17404887Schin setclr(e->re.charclass, env->explicit); 17414887Schin } 17424887Schin else 17434887Schin { 17444887Schin if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 17454887Schin return 0; 17464887Schin ce = (Celt_t*)e->re.data; 17474887Schin e->re.collate.invert = 0; 17484887Schin e->re.collate.elements = ce; 17498462SApril.Chin@Sun.COM ce->fun = f; 17504887Schin ce->typ = COLL_call; 17514887Schin ce++; 17524887Schin ce->typ = COLL_end; 17534887Schin } 17544887Schin return e; 17554887Schin } 17564887Schin 17574887Schin static Rex_t* 17584887Schin rep(Cenv_t* env, Rex_t* e, int number, int last) 17594887Schin { 17604887Schin Rex_t* f; 17614887Schin unsigned long m = 0; 17624887Schin unsigned long n = RE_DUP_INF; 17634887Schin int minimal = -1; 17644887Schin 17654887Schin if (!e) 17664887Schin return 0; 17674887Schin switch (token(env)) 17684887Schin { 17694887Schin case T_BANG: 17704887Schin eat(env); 17714887Schin if (!(f = node(env, REX_NEG, m, n, 0))) 17724887Schin { 17734887Schin drop(env->disc, e); 17744887Schin return 0; 17754887Schin } 17764887Schin f->re.group.expr.rex = e; 17774887Schin return f; 17784887Schin case T_QUES: 17794887Schin eat(env); 17804887Schin n = 1; 17814887Schin break; 17824887Schin case T_STAR: 17834887Schin eat(env); 17844887Schin break; 17854887Schin case T_PLUS: 17864887Schin eat(env); 17874887Schin m = 1; 17884887Schin break; 17894887Schin case T_LEFT: 17904887Schin eat(env); 17914887Schin m = env->token.min; 17924887Schin n = env->token.max; 17934887Schin break; 17944887Schin default: 17954887Schin return e; 17964887Schin } 17974887Schin if (env->token.att) 17984887Schin minimal = 1; 17994887Schin else if (env->type < SRE) 18004887Schin switch (token(env)) 18014887Schin { 18024887Schin case T_QUES: 18034887Schin eat(env); 18044887Schin minimal = !(env->flags & REG_MINIMAL); 18054887Schin break; 18064887Schin case T_STAR: /*AST*/ 18074887Schin eat(env); 18084887Schin minimal = !!(env->flags & REG_MINIMAL); 18094887Schin break; 18104887Schin } 18114887Schin switch (e->type) 18124887Schin { 18134887Schin case REX_DOT: 18144887Schin case REX_CLASS: 18154887Schin case REX_COLL_CLASS: 18164887Schin case REX_ONECHAR: 18174887Schin e->lo = m; 18184887Schin e->hi = n; 18194887Schin if (minimal >= 0) 18204887Schin mark(e, minimal); 18214887Schin return e; 18224887Schin #if HUH_2002_08_07 18234887Schin case REX_BEG: 18244887Schin #endif 18254887Schin case REX_BEG_STR: 18264887Schin case REX_END_STR: 18274887Schin case REX_FIN_STR: 18284887Schin case REX_WBEG: 18294887Schin case REX_WEND: 18304887Schin case REX_WORD: 18314887Schin case REX_WORD_NOT: 18324887Schin env->error = REG_BADRPT; 18334887Schin drop(env->disc, e); 18344887Schin return 0; 18354887Schin } 18364887Schin if (m == 1 && n == 1) 18374887Schin { 18384887Schin if (minimal >= 0) 18394887Schin mark(e, minimal); 18404887Schin return e; 18414887Schin } 18424887Schin if (!(f = node(env, REX_REP, m, n, 0))) 18434887Schin { 18444887Schin drop(env->disc, e); 18454887Schin return 0; 18464887Schin } 18474887Schin f->re.group.expr.rex = e; 18484887Schin f->re.group.number = number; 18494887Schin f->re.group.last = last; 18504887Schin if (minimal >= 0) 18514887Schin mark(f, minimal); 18524887Schin if (m <= n && n) 18534887Schin { 18544887Schin for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 18554887Schin if (e && e->type == REX_NEG) 18564887Schin f->type = REX_GROUP; 18574887Schin } 18584887Schin return f; 18594887Schin } 18604887Schin 18614887Schin static int 18624887Schin isstring(Rex_t* e) 18634887Schin { 18644887Schin switch (e->type) 18654887Schin { 18664887Schin case REX_ONECHAR: 18674887Schin return e->lo == 1 && e->hi == 1; 18684887Schin case REX_STRING: 18694887Schin return 1; 18704887Schin } 18714887Schin return 0; 18724887Schin } 18734887Schin 18744887Schin static Trie_node_t* 18754887Schin trienode(Cenv_t* env, int c) 18764887Schin { 18774887Schin Trie_node_t* t; 18784887Schin 18794887Schin if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 18804887Schin { 18814887Schin memset(t, 0, sizeof(Trie_node_t)); 18824887Schin t->c = c; 18834887Schin } 18844887Schin return t; 18854887Schin } 18864887Schin 18874887Schin static int 18884887Schin insert(Cenv_t* env, Rex_t* f, Rex_t* g) 18894887Schin { 18904887Schin unsigned char* s; 18914887Schin unsigned char* e; 18924887Schin Trie_node_t* t; 18934887Schin int len; 18944887Schin unsigned char tmp[2]; 18954887Schin 18964887Schin switch (f->type) 18974887Schin { 18984887Schin case REX_ONECHAR: 18994887Schin *(s = tmp) = f->re.onechar; 19004887Schin e = s + 1; 19014887Schin break; 19024887Schin case REX_STRING: 19034887Schin s = f->re.string.base; 19044887Schin e = s + f->re.string.size; 19054887Schin break; 19064887Schin default: 19074887Schin return 1; 19084887Schin } 19094887Schin if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 19104887Schin return 1; 19114887Schin for (len = 1;;) 19124887Schin { 19134887Schin if (t->c == *s) 19144887Schin { 19154887Schin if (++s >= e) 19164887Schin break; 19174887Schin if (!t->son && !(t->son = trienode(env, *s))) 19184887Schin return 1; 19194887Schin t = t->son; 19204887Schin len++; 19214887Schin } 19224887Schin else 19234887Schin { 19244887Schin if (!t->sib && !(t->sib = trienode(env, *s))) 19254887Schin return 1; 19264887Schin t = t->sib; 19274887Schin } 19284887Schin } 19294887Schin if (g->re.trie.min > len) 19304887Schin g->re.trie.min = len; 19314887Schin if (g->re.trie.max < len) 19324887Schin g->re.trie.max = len; 19334887Schin t->end = 1; 19344887Schin return 0; 19354887Schin } 19364887Schin 19374887Schin /* 19384887Schin * trie() tries to combine nontrivial e and f into a REX_TRIE 19394887Schin * unless 0 is returned, e and f are deleted as far as possible 19404887Schin * also called by regcomb 19414887Schin */ 19424887Schin 19434887Schin static Rex_t* 19444887Schin trie(Cenv_t* env, Rex_t* e, Rex_t* f) 19454887Schin { 19464887Schin Rex_t* g; 19474887Schin 19484887Schin if (e->next || f->next || !isstring(e) || e->flags != f->flags) 19494887Schin return 0; 19504887Schin if (isstring(f)) 19514887Schin { 19524887Schin if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 19534887Schin return 0; 19544887Schin g->re.trie.min = INT_MAX; 19554887Schin if (insert(env, f, g)) 19564887Schin goto nospace; 19574887Schin drop(env->disc, f); 19584887Schin } 19594887Schin else if (f->type != REX_TRIE) 19604887Schin return 0; 19614887Schin else 19624887Schin g = f; 19634887Schin if (insert(env, e, g)) 19644887Schin goto nospace; 19654887Schin drop(env->disc, e); 19664887Schin return g; 19674887Schin nospace: 19684887Schin if (g != f) 19694887Schin drop(env->disc, g); 19704887Schin return 0; 19714887Schin } 19724887Schin 19734887Schin static Rex_t* alt(Cenv_t*, int, int); 19744887Schin 19754887Schin static int 19764887Schin chr(register Cenv_t* env, int* escaped) 19774887Schin { 19784887Schin unsigned char* p; 19794887Schin int c; 19804887Schin 19814887Schin *escaped = 0; 19824887Schin if (!(c = *env->cursor)) 19834887Schin return -1; 19844887Schin env->cursor++; 19854887Schin if (c == '\\') 19864887Schin { 19874887Schin if (env->flags & REG_SHELL_ESCAPED) 19884887Schin return c; 19894887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator) 19904887Schin { 19914887Schin if (env->flags & REG_LENIENT) 19924887Schin return c ? c : '\\'; 19934887Schin env->error = REG_EESCAPE; 19944887Schin return -1; 19954887Schin } 19964887Schin p = env->cursor; 19974887Schin c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 19984887Schin *escaped = env->cursor - p; 19994887Schin } 20004887Schin return c; 20014887Schin } 20024887Schin 20034887Schin /* 20044887Schin * open the perly gates 20054887Schin */ 20064887Schin 20074887Schin static Rex_t* 20084887Schin grp(Cenv_t* env, int parno) 20094887Schin { 20104887Schin Rex_t* e; 20114887Schin Rex_t* f; 20124887Schin int c; 20134887Schin int i; 20144887Schin int n; 20154887Schin int x; 20164887Schin int esc; 20174887Schin int typ; 20184887Schin int beg; 20194887Schin unsigned char* p; 20204887Schin 20214887Schin beg = env->pattern == env->cursor - env->token.len; 20224887Schin if (!(c = env->token.lex) && (c = *env->cursor)) 20234887Schin env->cursor++; 20244887Schin env->token.len = 0; 20254887Schin env->parnest++; 20264887Schin typ = -1; 20274887Schin switch (c) 20284887Schin { 20294887Schin case '-': 20304887Schin case '+': 20314887Schin case 'a': 20324887Schin case 'g': 20334887Schin case 'i': 20344887Schin case 'l': 20354887Schin case 'm': 20364887Schin case 'p': 20374887Schin case 'r': 20384887Schin case 's': 20394887Schin case 'x': 20404887Schin case 'A': 20414887Schin case 'B': 20424887Schin case 'E': 20434887Schin case 'F': 20444887Schin case 'G': 20454887Schin case 'I': 20464887Schin case 'K': 20478462SApril.Chin@Sun.COM case 'L': 20488462SApril.Chin@Sun.COM case 'M': /* glob(3) */ 20498462SApril.Chin@Sun.COM case 'N': /* glob(3) */ 20508462SApril.Chin@Sun.COM case 'O': /* glob(3) */ 20518462SApril.Chin@Sun.COM case 'R': /* pcre */ 20524887Schin case 'S': 20534887Schin case 'U': /* pcre */ 20544887Schin case 'X': /* pcre */ 20554887Schin x = REX_GROUP; 20564887Schin i = 1; 20574887Schin env->token.push = 1; 20584887Schin for (;;) 20594887Schin { 20604887Schin switch (c) 20614887Schin { 20628462SApril.Chin@Sun.COM case ')': 20638462SApril.Chin@Sun.COM if (!(env->flags & REG_LITERAL)) 20648462SApril.Chin@Sun.COM { 20658462SApril.Chin@Sun.COM env->error = REG_BADRPT; 20668462SApril.Chin@Sun.COM return 0; 20678462SApril.Chin@Sun.COM } 20688462SApril.Chin@Sun.COM /*FALLTHROUGH*/ 20694887Schin case 0: 20704887Schin case T_CLOSE: 20714887Schin x = 0; 20724887Schin goto done; 20734887Schin case ':': 20744887Schin eat(env); 20754887Schin if (token(env) == T_CLOSE) 20764887Schin x = 0; 20774887Schin goto done; 20784887Schin case '-': 20794887Schin i = 0; 20804887Schin break; 20814887Schin case '+': 20824887Schin i = 1; 20834887Schin break; 20844887Schin case 'a': 20854887Schin if (i) 20864887Schin env->flags |= (REG_LEFT|REG_RIGHT); 20874887Schin else 20884887Schin env->flags &= ~(REG_LEFT|REG_RIGHT); 20894887Schin break; 20904887Schin case 'g': 20914887Schin if (i) 20924887Schin env->flags &= ~REG_MINIMAL; 20934887Schin else 20944887Schin env->flags |= REG_MINIMAL; 20954887Schin break; 20964887Schin case 'i': 20974887Schin if (i) 20984887Schin env->flags |= REG_ICASE; 20994887Schin else 21004887Schin env->flags &= ~REG_ICASE; 21014887Schin break; 21024887Schin case 'l': 21034887Schin if (i) 21044887Schin env->flags |= REG_LEFT; 21054887Schin else 21064887Schin env->flags &= ~REG_LEFT; 21074887Schin break; 21084887Schin case 'm': 21094887Schin if (i) 21104887Schin env->flags |= REG_NEWLINE; 21114887Schin else 21124887Schin env->flags &= ~REG_NEWLINE; 21134887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 21144887Schin break; 21154887Schin case 'p': 21164887Schin if (i) 21174887Schin env->flags &= ~REG_LENIENT; 21184887Schin else 21194887Schin env->flags |= REG_LENIENT; 21204887Schin break; 21214887Schin case 'r': 21224887Schin if (i) 21234887Schin env->flags |= REG_RIGHT; 21244887Schin else 21254887Schin env->flags &= ~REG_RIGHT; 21264887Schin break; 21274887Schin case 's': 21284887Schin if (i) 21294887Schin env->flags |= REG_SPAN; 21304887Schin else 21314887Schin env->flags &= ~REG_SPAN; 21324887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 21334887Schin break; 21344887Schin case 'x': 21354887Schin if (i) 21364887Schin env->flags |= REG_COMMENT; 21374887Schin else 21384887Schin env->flags &= ~REG_COMMENT; 21394887Schin break; 21404887Schin case 'A': 21414887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21424887Schin env->flags |= REG_AUGMENTED|REG_EXTENDED; 21434887Schin typ = ARE; 21444887Schin break; 21454887Schin case 'B': 21464887Schin case 'G': 21474887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21484887Schin typ = BRE; 21494887Schin break; 21504887Schin case 'E': 21514887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21524887Schin env->flags |= REG_EXTENDED; 21534887Schin typ = ERE; 21544887Schin break; 21554887Schin case 'F': 21564887Schin case 'L': 21574887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21584887Schin env->flags |= REG_LITERAL; 21594887Schin typ = ERE; 21604887Schin break; 21614887Schin case 'K': 21624887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21634887Schin env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 21644887Schin typ = KRE; 21654887Schin break; 21664887Schin case 'M': 21674887Schin /* used by caller to disable glob(3) GLOB_BRACE */ 21684887Schin break; 21694887Schin case 'N': 21704887Schin /* used by caller to disable glob(3) GLOB_NOCHECK */ 21714887Schin break; 21728462SApril.Chin@Sun.COM case 'O': 21734887Schin /* used by caller to disable glob(3) GLOB_STARSTAR */ 21744887Schin break; 21754887Schin case 'S': 21764887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21774887Schin env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 21784887Schin typ = SRE; 21794887Schin break; 21804887Schin case 'U': /* PCRE_UNGREEDY */ 21814887Schin if (i) 21824887Schin env->flags |= REG_MINIMAL; 21834887Schin else 21844887Schin env->flags &= ~REG_MINIMAL; 21854887Schin break; 21864887Schin case 'X': /* PCRE_EXTRA */ 21874887Schin break; 21884887Schin default: 21894887Schin env->error = REG_BADRPT; 21904887Schin return 0; 21914887Schin } 21924887Schin eat(env); 21934887Schin c = token(env); 21944887Schin } 21954887Schin done: 21964887Schin break; 21974887Schin case ':': 21984887Schin x = REX_GROUP; 21994887Schin break; 22004887Schin case '=': 22014887Schin x = REX_GROUP_AHEAD; 22024887Schin break; 22034887Schin case '!': 22044887Schin x = REX_GROUP_AHEAD_NOT; 22054887Schin break; 22064887Schin case '<': 22074887Schin switch (token(env)) 22084887Schin { 22094887Schin case '=': 22104887Schin x = REX_GROUP_BEHIND; 22114887Schin break; 22124887Schin case '!': 22134887Schin case T_BANG: 22144887Schin x = REX_GROUP_BEHIND_NOT; 22154887Schin break; 22164887Schin default: 22174887Schin env->error = REG_BADRPT; 22184887Schin return 0; 22194887Schin } 22204887Schin eat(env); 22214887Schin break; 22224887Schin case '>': 22234887Schin x = REX_GROUP_CUT; 22244887Schin break; 22254887Schin case '%': 22264887Schin case T_PERCENT: 22274887Schin e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 22284887Schin e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 22294887Schin n = 1; 22304887Schin for (;;) 22314887Schin { 22324887Schin switch (i = chr(env, &esc)) 22334887Schin { 22344887Schin case -1: 22354887Schin case 0: 22364887Schin invalid: 22374887Schin env->cursor -= esc + 1; 22384887Schin env->error = REG_EPAREN; 22394887Schin return 0; 22404887Schin case 'D': 22414887Schin x = REX_NEST_delimiter; 22424887Schin /*FALLTHROUGH*/ 22434887Schin delimiter: 22444887Schin if ((i = chr(env, &esc)) < 0) 22454887Schin goto invalid; 22464887Schin if (e->re.nest.type[i] & ~x) 22474887Schin goto invalid; 22484887Schin e->re.nest.type[i] = x; 22494887Schin continue; 22504887Schin case 'E': 22514887Schin x = REX_NEST_escape; 22524887Schin goto delimiter; 22534887Schin case 'L': 22544887Schin x = REX_NEST_literal; 22554887Schin goto quote; 22564887Schin case 'O': 22574887Schin switch (i = chr(env, &esc)) 22584887Schin { 22594887Schin case 'T': 22604887Schin e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 22614887Schin break; 22624887Schin default: 22634887Schin goto invalid; 22644887Schin } 22654887Schin continue; 22664887Schin case 'Q': 22674887Schin x = REX_NEST_quote; 22684887Schin /*FALLTHROUGH*/ 22694887Schin quote: 22704887Schin if ((i = chr(env, &esc)) < 0) 22714887Schin goto invalid; 22724887Schin if (e->re.nest.type[i] & ~x) 22734887Schin goto invalid; 22744887Schin e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 22754887Schin continue; 22764887Schin case 'S': 22774887Schin x = REX_NEST_separator; 22784887Schin goto delimiter; 22794887Schin case 'T': 22804887Schin x = REX_NEST_terminator; 22814887Schin goto delimiter; 22824887Schin case '|': 22834887Schin case '&': 22844887Schin if (!esc) 22854887Schin goto invalid; 22864887Schin goto nesting; 22874887Schin case '(': 22884887Schin if (!esc) 22894887Schin n++; 22904887Schin goto nesting; 22914887Schin case ')': 22924887Schin if (!esc && !--n) 22934887Schin break; 22944887Schin goto nesting; 22954887Schin default: 22964887Schin nesting: 22974887Schin if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 22984887Schin goto invalid; 22994887Schin e->re.nest.type[i] = REX_NEST_open; 23004887Schin if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 23014887Schin goto invalid; 23024887Schin if (!esc) 23034887Schin { 23044887Schin if (x == ')' && !--n) 23054887Schin goto invalid; 23064887Schin else if (x == '(') 23074887Schin n++; 23084887Schin } 23094887Schin e->re.nest.type[x] |= REX_NEST_close; 23104887Schin e->re.nest.type[i] |= x << REX_NEST_SHIFT; 23114887Schin continue; 23124887Schin } 23134887Schin break; 23144887Schin } 23154887Schin env->parnest--; 23164887Schin if (c == T_PERCENT) 23174887Schin for (n = 0; n < 2; n++) 23184887Schin { 23194887Schin parno = ++env->parno; 23204887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 23214887Schin { 23224887Schin drop(env->disc, e); 23234887Schin return 0; 23244887Schin } 23254887Schin if (parno < elementsof(env->paren)) 23264887Schin env->paren[parno] = f; 23274887Schin f->re.group.back = 0; 23284887Schin f->re.group.number = parno; 23294887Schin f->re.group.expr.rex = e; 23304887Schin e = f; 23314887Schin } 23324887Schin return e; 23334887Schin case '(': 23344887Schin c = 0; 23354887Schin if (isdigit(*env->cursor)) 23364887Schin { 23374887Schin f = 0; 23384887Schin do 23394887Schin { 23404887Schin if (c > (INT_MAX / 10)) 23414887Schin { 23424887Schin env->error = REG_BADRPT; 23434887Schin return 0; 23444887Schin } 23454887Schin c = c * 10 + (*env->cursor++ - '0'); 23464887Schin } while (isdigit(*env->cursor)); 23474887Schin if (*env->cursor++ != ')') 23484887Schin { 23494887Schin env->error = REG_BADRPT; 23504887Schin return 0; 23514887Schin } 23524887Schin if (c && env->type >= SRE) 23534887Schin c = c * 2 - 1; 23544887Schin if (!c || c > env->parno || !env->paren[c]) 23554887Schin { 23564887Schin if (!(env->flags & REG_LENIENT)) 23574887Schin { 23584887Schin env->error = REG_ESUBREG; 23594887Schin return 0; 23604887Schin } 23614887Schin if (c) 23624887Schin c = -1; 23634887Schin } 23644887Schin } 23654887Schin else 23664887Schin { 23674887Schin if (env->type < SRE && *env->cursor++ != '?') 23684887Schin { 23694887Schin env->error = REG_BADRPT; 23704887Schin return 0; 23714887Schin } 23724887Schin if (!(f = grp(env, parno + 1)) && env->error) 23734887Schin return 0; 23744887Schin } 23754887Schin if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 23764887Schin { 23774887Schin drop(env->disc, f); 23784887Schin return 0; 23794887Schin } 23804887Schin e->re.group.size = c; 23814887Schin e->re.group.expr.binary.left = f; 23824887Schin if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 23834887Schin { 23844887Schin drop(env->disc, e); 23854887Schin return 0; 23864887Schin } 23874887Schin if (token(env) != T_CLOSE) 23884887Schin { 23894887Schin env->error = REG_EPAREN; 23904887Schin return 0; 23914887Schin } 23924887Schin eat(env); 23934887Schin env->parnest--; 23944887Schin return rep(env, e, parno, parno); 23954887Schin case '{': 23964887Schin p = env->cursor; 23974887Schin n = 1; 23984887Schin while (c = *env->cursor) 23994887Schin { 24004887Schin if (c == '\\' && *(env->cursor + 1)) 24014887Schin env->cursor++; 24024887Schin else if (c == '{') 24034887Schin n++; 24044887Schin else if (c == '}' && !--n) 24054887Schin break; 24064887Schin else if (c == env->delimiter || c == env->terminator) 24074887Schin break; 24084887Schin env->cursor++; 24094887Schin } 24104887Schin if (c != '}') 24114887Schin { 24124887Schin env->error = REG_EBRACE; 24134887Schin return 0; 24144887Schin } 24154887Schin if (*++env->cursor != ')') 24164887Schin { 24174887Schin env->error = REG_EPAREN; 24184887Schin return 0; 24194887Schin } 24204887Schin env->cursor++; 24214887Schin env->parnest--; 24224887Schin if (env->disc->re_version < REG_VERSION_EXEC) 24234887Schin { 24244887Schin env->error = REG_BADRPT; 24254887Schin return 0; 24264887Schin } 24274887Schin if (!env->disc->re_execf) 24284887Schin return 0; 24294887Schin if (!(e = node(env, REX_EXEC, 0, 0, 0))) 24304887Schin return 0; 24314887Schin e->re.exec.text = (const char*)p; 24324887Schin e->re.exec.size = env->cursor - p - 2; 24334887Schin if (!env->disc->re_compf) 24344887Schin e->re.exec.data = 0; 24354887Schin else 24364887Schin e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 24374887Schin return e; 24384887Schin case '0': case '1': case '2': case '3': case '4': 24394887Schin case '5': case '6': case '7': case '8': case '9': 24404887Schin c -= '0'; 24414887Schin while (isdigit(*env->cursor)) 24424887Schin { 24434887Schin if (c > (INT_MAX / 10)) 24444887Schin { 24454887Schin env->error = REG_ESUBREG; 24464887Schin return 0; 24474887Schin } 24484887Schin c = c * 10 + *env->cursor++ - '0'; 24494887Schin } 24504887Schin if (*env->cursor == ')') 24514887Schin { 24524887Schin env->cursor++; 24534887Schin env->parnest--; 24544887Schin env->token.len = 1; 24554887Schin if (c > env->parno || !env->paren[c]) 24564887Schin { 24574887Schin env->error = REG_ESUBREG; 24584887Schin return 0; 24594887Schin } 24604887Schin env->paren[c]->re.group.back = 1; 24614887Schin return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 24624887Schin } 24634887Schin /*FALLTHROUGH*/ 24644887Schin default: 24654887Schin env->error = REG_BADRPT; 24664887Schin return 0; 24674887Schin } 24684887Schin if (x && !(e = alt(env, parno, 0))) 24694887Schin return 0; 24704887Schin c = token(env); 24714887Schin env->parnest--; 24728462SApril.Chin@Sun.COM if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) 24734887Schin { 24744887Schin env->error = REG_EPAREN; 24754887Schin return 0; 24764887Schin } 24774887Schin eat(env); 24784887Schin if (typ >= 0) 24794887Schin { 24804887Schin if (beg) 24814887Schin env->pattern = env->cursor; 24824887Schin env->type = typ; 24834887Schin } 24844887Schin if (!x) 24854887Schin return 0; 24864887Schin if (!(f = node(env, x, 0, 0, 0))) 24874887Schin { 24884887Schin drop(env->disc, e); 24894887Schin return 0; 24904887Schin } 24914887Schin f->re.group.expr.rex = e; 24924887Schin if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 24934887Schin { 24944887Schin if (stats(env, e)) 24954887Schin { 24964887Schin drop(env->disc, f); 24974887Schin if (!env->error) 24984887Schin env->error = REG_ECOUNT; 24994887Schin return 0; 25004887Schin } 25014887Schin f->re.group.size = env->stats.m; 25024887Schin memset(&env->stats, 0, sizeof(env->stats)); 25034887Schin } 25044887Schin switch (x) 25054887Schin { 25064887Schin case REX_GROUP: 25074887Schin case REX_GROUP_CUT: 25084887Schin f = rep(env, f, parno, env->parno); 25094887Schin break; 25104887Schin } 25114887Schin return f; 25124887Schin } 25134887Schin 25144887Schin static Rex_t* 25154887Schin seq(Cenv_t* env) 25164887Schin { 25174887Schin Rex_t* e; 25184887Schin Rex_t* f; 25194887Schin Token_t tok; 25204887Schin int c; 25214887Schin int i; 25224887Schin int n; 25234887Schin int x; 25244887Schin int parno; 25254887Schin int type; 25264887Schin regflags_t flags; 25274887Schin unsigned char* s; 25284887Schin unsigned char* p; 25294887Schin unsigned char* t; 25304887Schin unsigned char* u; 25314887Schin unsigned char buf[256]; 25324887Schin 25334887Schin for (;;) 25344887Schin { 25354887Schin s = buf; 25364887Schin while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 25374887Schin { 25384887Schin x = c; 25394887Schin p = env->cursor; 25404887Schin if (c >= 0) 25414887Schin { 25424887Schin n = 1; 25434887Schin *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 25444887Schin } 25458462SApril.Chin@Sun.COM else if (c == C_ESC || (env->flags & REG_ICASE)) 25464887Schin { 25478462SApril.Chin@Sun.COM c = (c == C_ESC) ? env->token.lex : mbchar(p); 25488462SApril.Chin@Sun.COM if (env->flags & REG_ICASE) 25498462SApril.Chin@Sun.COM c = towupper(c); 25508462SApril.Chin@Sun.COM if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) 25518462SApril.Chin@Sun.COM break; 25528462SApril.Chin@Sun.COM if ((n = wctomb((char*)s, c)) < 0) 25538462SApril.Chin@Sun.COM *s++ = c; 25548462SApril.Chin@Sun.COM else if (n) 25558462SApril.Chin@Sun.COM s += n; 25568462SApril.Chin@Sun.COM else 25574887Schin { 25584887Schin n = 1; 25594887Schin *s++ = 0; 25604887Schin } 25614887Schin } 25624887Schin else 25634887Schin { 25644887Schin n = env->token.len - env->token.esc; 25654887Schin for (t = p, u = s + n; s < u; *s++ = *t++); 25664887Schin } 25674887Schin eat(env); 25684887Schin } 25694887Schin if (c == T_BAD) 25704887Schin return 0; 25714887Schin if (s > buf) 25724887Schin switch (c) 25734887Schin { 25744887Schin case T_STAR: 25754887Schin case T_PLUS: 25764887Schin case T_LEFT: 25774887Schin case T_QUES: 25784887Schin case T_BANG: 25794887Schin if ((s -= n) == buf) 25804887Schin e = 0; 25814887Schin else 25824887Schin { 25834887Schin i = s - buf; 25844887Schin if (!(e = node(env, REX_STRING, 0, 0, i))) 25854887Schin return 0; 25864887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 25874887Schin e->re.string.size = i; 25884887Schin } 25894887Schin if (x >= 0) 25904887Schin { 25914887Schin if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 25924887Schin { 25934887Schin drop(env->disc, e); 25944887Schin return 0; 25954887Schin } 25964887Schin f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 25974887Schin } 25984887Schin else 25994887Schin { 26008462SApril.Chin@Sun.COM if (!(f = node(env, REX_STRING, 0, 0, n))) 26014887Schin return 0; 26028462SApril.Chin@Sun.COM memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); 26038462SApril.Chin@Sun.COM f->re.string.size = n; 26044887Schin } 26054887Schin if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 26064887Schin { 26074887Schin drop(env->disc, e); 26084887Schin return 0; 26094887Schin } 26104887Schin if (e) 26114887Schin f = cat(env, e, f); 26124887Schin return f; 26134887Schin default: 26144887Schin c = s - buf; 26154887Schin if (!(e = node(env, REX_STRING, 0, 0, c))) 26164887Schin return 0; 26174887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 26184887Schin e->re.string.size = c; 26194887Schin return cat(env, e, seq(env)); 26204887Schin } 26214887Schin else if (c > T_BACK) 26224887Schin { 26234887Schin eat(env); 26244887Schin c -= T_BACK; 26254887Schin if (c > env->parno || !env->paren[c]) 26264887Schin { 26274887Schin env->error = REG_ESUBREG; 26284887Schin return 0; 26294887Schin } 26304887Schin env->paren[c]->re.group.back = 1; 26314887Schin e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 26324887Schin } 26334887Schin else 26344887Schin switch (c) 26354887Schin { 26364887Schin case T_AND: 26374887Schin case T_CLOSE: 26384887Schin case T_BAR: 26394887Schin case T_END: 26404887Schin return node(env, REX_NULL, 0, 0, 0); 26414887Schin case T_DOLL: 26424887Schin eat(env); 26434887Schin e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 26444887Schin break; 26454887Schin case T_CFLX: 26464887Schin eat(env); 26474887Schin if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 26484887Schin e = rep(env, e, 0, 0); 26494887Schin break; 26504887Schin case T_OPEN: 26514887Schin tok = env->token; 26524887Schin eat(env); 26534887Schin flags = env->flags; 26544887Schin type = env->type; 26554887Schin if (env->token.att) 26564887Schin env->flags |= REG_MINIMAL; 26574887Schin env->parnest++; 26584887Schin if (env->type == KRE) 26594887Schin ++env->parno; 26604887Schin parno = ++env->parno; 26614887Schin if (!(e = alt(env, parno + 1, 0))) 26624887Schin break; 26634887Schin if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) 26644887Schin { 26654887Schin drop(env->disc, e); 26664887Schin env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 26674887Schin return 0; 26684887Schin } 26694887Schin if (token(env) != T_CLOSE) 26704887Schin { 26714887Schin drop(env->disc, e); 26724887Schin env->error = REG_EPAREN; 26734887Schin return 0; 26744887Schin } 26754887Schin env->parnest--; 26764887Schin eat(env); 26774887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 26784887Schin { 26794887Schin drop(env->disc, e); 26804887Schin return 0; 26814887Schin } 26824887Schin if (parno < elementsof(env->paren)) 26834887Schin env->paren[parno] = f; 26844887Schin f->re.group.back = 0; 26854887Schin f->re.group.number = parno; 26864887Schin f->re.group.expr.rex = e; 26874887Schin if (tok.lex) 26884887Schin { 26894887Schin tok.push = 1; 26904887Schin env->token = tok; 26914887Schin } 26924887Schin if (!(e = rep(env, f, parno, env->parno))) 26934887Schin return 0; 26944887Schin if (env->type == KRE) 26954887Schin { 26964887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 26974887Schin { 26984887Schin drop(env->disc, e); 26994887Schin return 0; 27004887Schin } 27014887Schin if (--parno < elementsof(env->paren)) 27024887Schin env->paren[parno] = f; 27034887Schin f->re.group.back = 0; 27044887Schin f->re.group.number = parno; 27054887Schin f->re.group.expr.rex = e; 27064887Schin e = f; 27074887Schin } 27084887Schin env->flags = flags; 27094887Schin env->type = type; 27104887Schin break; 27114887Schin case T_GROUP: 27124887Schin p = env->cursor; 27134887Schin eat(env); 27144887Schin flags = env->flags; 27154887Schin type = env->type; 27164887Schin if (!(e = grp(env, env->parno + 1))) 27174887Schin { 27184887Schin if (env->error) 27194887Schin return 0; 27204887Schin if (env->literal == env->pattern && env->literal == p) 27214887Schin env->literal = env->cursor; 27224887Schin continue; 27234887Schin } 27244887Schin env->flags = flags; 27254887Schin env->type = type; 27264887Schin break; 27274887Schin case T_BRA: 27284887Schin eat(env); 27294887Schin if (e = bra(env)) 27304887Schin e = rep(env, e, 0, 0); 27314887Schin break; 27324887Schin case T_ALNUM: 27334887Schin case T_ALNUM_NOT: 27344887Schin case T_DIGIT: 27354887Schin case T_DIGIT_NOT: 27364887Schin case T_SPACE: 27374887Schin case T_SPACE_NOT: 27384887Schin eat(env); 27394887Schin if (e = ccl(env, c)) 27404887Schin e = rep(env, e, 0, 0); 27414887Schin break; 27424887Schin case T_LT: 27434887Schin eat(env); 27444887Schin e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 27454887Schin break; 27464887Schin case T_GT: 27474887Schin eat(env); 27484887Schin e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 27494887Schin break; 27504887Schin case T_DOT: 27514887Schin eat(env); 27524887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 27534887Schin break; 27544887Schin case T_DOTSTAR: 27554887Schin eat(env); 27564887Schin env->token.lex = T_STAR; 27574887Schin env->token.push = 1; 27584887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 27594887Schin break; 27604887Schin case T_SLASHPLUS: 27614887Schin eat(env); 27624887Schin env->token.lex = T_PLUS; 27634887Schin env->token.push = 1; 27644887Schin if (e = node(env, REX_ONECHAR, 1, 1, 0)) 27654887Schin { 27664887Schin e->re.onechar = '/'; 27674887Schin e = rep(env, e, 0, 0); 27684887Schin } 27694887Schin break; 27704887Schin case T_WORD: 27714887Schin eat(env); 27724887Schin e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 27734887Schin break; 27744887Schin case T_WORD_NOT: 27754887Schin eat(env); 27764887Schin e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 27774887Schin break; 27784887Schin case T_BEG_STR: 27794887Schin eat(env); 27804887Schin e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 27814887Schin break; 27824887Schin case T_END_STR: 27834887Schin eat(env); 27844887Schin e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 27854887Schin break; 27864887Schin case T_FIN_STR: 27874887Schin eat(env); 27884887Schin e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 27894887Schin break; 27904887Schin default: 27914887Schin env->error = REG_BADRPT; 27924887Schin return 0; 27934887Schin } 27944887Schin if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 27954887Schin e = cat(env, e, seq(env)); 27964887Schin return e; 27974887Schin } 27984887Schin } 27994887Schin 28004887Schin static Rex_t* 28014887Schin con(Cenv_t* env) 28024887Schin { 28034887Schin Rex_t* e; 28044887Schin Rex_t* f; 28054887Schin Rex_t* g; 28064887Schin 28074887Schin if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 28084887Schin return e; 28094887Schin eat(env); 28104887Schin if (!(f = con(env))) 28114887Schin { 28124887Schin drop(env->disc, e); 28134887Schin return 0; 28144887Schin } 28154887Schin if (!(g = node(env, REX_CONJ, 0, 0, 0))) 28164887Schin { 28174887Schin drop(env->disc, e); 28184887Schin drop(env->disc, f); 28194887Schin return 0; 28204887Schin } 28214887Schin g->re.group.expr.binary.left = e; 28224887Schin g->re.group.expr.binary.right = f; 28234887Schin return g; 28244887Schin } 28254887Schin 28264887Schin static Rex_t* 28274887Schin alt(Cenv_t* env, int number, int cond) 28284887Schin { 28294887Schin Rex_t* e; 28304887Schin Rex_t* f; 28314887Schin Rex_t* g; 28324887Schin 28334887Schin if (!(e = con(env))) 28344887Schin return 0; 28354887Schin else if (token(env) != T_BAR) 28364887Schin { 28374887Schin if (!cond) 28384887Schin return e; 28394887Schin f = 0; 28404887Schin if (e->type == REX_NULL) 28414887Schin goto bad; 28424887Schin } 28434887Schin else 28444887Schin { 28454887Schin eat(env); 28464887Schin if (!(f = alt(env, number, 0))) 28474887Schin { 28484887Schin drop(env->disc, e); 28494887Schin return 0; 28504887Schin } 28514887Schin if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) 28524887Schin goto bad; 28534887Schin if (!cond && (g = trie(env, e, f))) 28544887Schin return g; 28554887Schin } 28564887Schin if (!(g = node(env, REX_ALT, 0, 0, 0))) 28574887Schin { 28584887Schin env->error = REG_ESPACE; 28594887Schin goto bad; 28604887Schin } 28614887Schin g->re.group.number = number; 28624887Schin g->re.group.last = env->parno; 28634887Schin g->re.group.expr.binary.left = e; 28644887Schin g->re.group.expr.binary.right = f; 28654887Schin return g; 28664887Schin bad: 28674887Schin drop(env->disc, e); 28684887Schin drop(env->disc, f); 28694887Schin if (!env->error) 28704887Schin env->error = REG_ENULL; 28714887Schin return 0; 28724887Schin } 28734887Schin 28744887Schin /* 28754887Schin * add v to REX_BM tables 28764887Schin */ 28774887Schin 28784887Schin static void 28794887Schin bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 28804887Schin { 28814887Schin int c; 28824887Schin int m; 28834887Schin size_t z; 28844887Schin 28854887Schin for (m = 0; m < n; m++) 28864887Schin { 28874887Schin if (!(z = n - m - 1)) 28884887Schin z = HIT; 28894887Schin c = v[m]; 28904887Schin a->re.bm.mask[m][c] |= b; 28914887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 28924887Schin a->re.bm.skip[c] = z; 28934887Schin if (a->flags & REG_ICASE) 28944887Schin { 28954887Schin if (isupper(c)) 28964887Schin c = tolower(c); 28974887Schin else if (islower(c)) 28984887Schin c = toupper(c); 28994887Schin else 29004887Schin continue; 29014887Schin a->re.bm.mask[m][c] |= b; 29024887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 29034887Schin a->re.bm.skip[c] = z; 29044887Schin } 29054887Schin } 29064887Schin } 29074887Schin 29084887Schin /* 29094887Schin * set up BM table from trie 29104887Schin */ 29114887Schin 29124887Schin static int 29134887Schin bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 29144887Schin { 29154887Schin do 29164887Schin { 29174887Schin v[m] = x->c; 29184887Schin if (m >= (n - 1)) 29194887Schin { 29204887Schin bmstr(env, a, v, n, b); 29214887Schin if (!(b <<= 1)) 29224887Schin { 29234887Schin b = 1; 29244887Schin a->re.bm.complete = 0; 29254887Schin } 29264887Schin else if (x->son) 29274887Schin a->re.bm.complete = 0; 29284887Schin } 29294887Schin else if (x->son) 29304887Schin b = bmtrie(env, a, v, x->son, n, m + 1, b); 29314887Schin } while (x = x->sib); 29324887Schin return b; 29334887Schin } 29344887Schin 29354887Schin /* 29364887Schin * rewrite the expression tree for some special cases 29374887Schin * 1. it is a null expression - illegal 29384887Schin * 2. max length fixed string found -- use BM algorithm 29394887Schin * 3. it begins with an unanchored string - use KMP algorithm 29404887Schin * 0 returned on success 29414887Schin */ 29424887Schin 29434887Schin static int 29444887Schin special(Cenv_t* env, regex_t* p) 29454887Schin { 29464887Schin Rex_t* a; 29474887Schin Rex_t* e; 29484887Schin Rex_t* t; 29494887Schin Rex_t* x; 29504887Schin Rex_t* y; 29514887Schin unsigned char* s; 29524887Schin int* f; 29534887Schin int n; 29544887Schin int m; 29554887Schin int k; 29564887Schin 29574887Schin DEBUG_INIT(); 29584887Schin if (e = p->env->rex) 29594887Schin { 29604887Schin if ((x = env->stats.x) && x->re.string.size < 3) 29614887Schin x = 0; 29624887Schin if ((t = env->stats.y) && t->re.trie.min < 3) 29634887Schin t = 0; 29644887Schin if (x && t) 29654887Schin { 29664887Schin if (x->re.string.size >= t->re.trie.min) 29674887Schin t = 0; 29684887Schin else 29694887Schin x = 0; 29704887Schin } 29718462SApril.Chin@Sun.COM if (x || t) 29724887Schin { 29734887Schin Bm_mask_t** mask; 29744887Schin Bm_mask_t* h; 29754887Schin unsigned char* v; 29764887Schin size_t* q; 29774887Schin unsigned long l; 29784887Schin int i; 29794887Schin int j; 29804887Schin 29814887Schin if (x) 29824887Schin { 29834887Schin y = x; 29844887Schin n = m = x->re.string.size; 29854887Schin l = env->stats.l; 29864887Schin } 29874887Schin else 29884887Schin { 29894887Schin y = t; 29904887Schin n = t->re.trie.min; 29914887Schin m = t->re.trie.max; 29924887Schin l = env->stats.k; 29934887Schin } 29944887Schin if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 29954887Schin return 1; 29964887Schin if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 29974887Schin { 29984887Schin alloc(env->disc, q, 0); 29994887Schin return 1; 30004887Schin } 30014887Schin a->flags = y->flags; 30024887Schin a->map = y->map; 30034887Schin a->re.bm.size = n; 30044887Schin a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 30054887Schin a->re.bm.left = l - 1; 30064887Schin a->re.bm.right = env->stats.m - l - n; 30074887Schin a->re.bm.complete = (y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 30084887Schin h = (Bm_mask_t*)&a->re.bm.mask[n]; 30094887Schin a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 30104887Schin a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 30114887Schin for (m = 0; m <= UCHAR_MAX; m++) 30124887Schin a->re.bm.skip[m] = n; 30134887Schin a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 30144887Schin for (i = 1; i <= n; i++) 30154887Schin a->re.bm.fail[i] = 2 * n - i; 30164887Schin mask = a->re.bm.mask; 30174887Schin for (m = 0; m < n; m++) 30184887Schin { 30194887Schin mask[m] = h; 30204887Schin h += UCHAR_MAX + 1; 30214887Schin } 30224887Schin if (x) 30234887Schin bmstr(env, a, x->re.string.base, n, 1); 30244887Schin else 30254887Schin { 30264887Schin v = (unsigned char*)q; 30274887Schin memset(v, 0, n); 30284887Schin m = 1; 30294887Schin for (i = 0; i <= UCHAR_MAX; i++) 30304887Schin if (t->re.trie.root[i]) 30314887Schin m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 30324887Schin } 30334887Schin mask--; 30344887Schin memset(q, 0, n * sizeof(*q)); 30354887Schin for (k = (j = n) + 1; j > 0; j--, k--) 30364887Schin { 30374887Schin DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 30384887Schin for (q[j] = k; k <= n; k = q[k]) 30394887Schin { 30404887Schin for (m = 0; m <= UCHAR_MAX; m++) 30414887Schin if (mask[k][m] == mask[j][m]) 30424887Schin { 30434887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 30444887Schin goto cut; 30454887Schin } 30464887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 30474887Schin if (a->re.bm.fail[k] > n - j) 30484887Schin a->re.bm.fail[k] = n - j; 30494887Schin } 30504887Schin cut: ; 30514887Schin } 30524887Schin for (i = 1; i <= n; i++) 30534887Schin if (a->re.bm.fail[i] > n + k - i) 30544887Schin { 30554887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 30564887Schin a->re.bm.fail[i] = n + k - i; 30574887Schin } 30584887Schin #if _AST_REGEX_DEBUG 30594887Schin if (DEBUG_TEST(0x0020,(1),(0))) 30604887Schin { 30614887Schin sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 30624887Schin for (m = 0; m < n; m++) 30634887Schin for (i = 1; i <= UCHAR_MAX; i++) 30644887Schin if (a->re.bm.mask[m][i]) 30654887Schin sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 30664887Schin for (i = ' '; i <= UCHAR_MAX; i++) 30674887Schin if (a->re.bm.skip[i] >= HIT) 30684887Schin sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 30694887Schin else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 30704887Schin sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 30714887Schin for (j = 31; j >= 0; j--) 30724887Schin { 30734887Schin sfprintf(sfstderr, " "); 30744887Schin next: 30754887Schin for (m = 0; m < n; m++) 30764887Schin { 30774887Schin for (i = 0040; i < 0177; i++) 30784887Schin if (a->re.bm.mask[m][i] & (1 << j)) 30794887Schin { 30804887Schin sfprintf(sfstderr, " %c", i); 30814887Schin break; 30824887Schin } 30834887Schin if (i >= 0177) 30844887Schin { 30854887Schin if (j > 0) 30864887Schin { 30874887Schin j--; 30884887Schin goto next; 30894887Schin } 30904887Schin sfprintf(sfstderr, " ?"); 30914887Schin } 30924887Schin } 30934887Schin sfprintf(sfstderr, "\n"); 30944887Schin } 30954887Schin sfprintf(sfstderr, "FAIL: "); 30964887Schin for (m = 1; m <= n; m++) 30974887Schin sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 30984887Schin sfprintf(sfstderr, "\n"); 30994887Schin } 31004887Schin #endif 31014887Schin alloc(env->disc, q, 0); 31024887Schin a->next = e; 31034887Schin p->env->rex = a; 31044887Schin return 0; 31054887Schin } 31064887Schin switch (e->type) 31074887Schin { 31084887Schin case REX_BEG: 31094887Schin if (env->flags & REG_NEWLINE) 31104887Schin return 0; 31114887Schin break; 31124887Schin case REX_GROUP: 31134887Schin if (env->stats.b) 31144887Schin return 0; 31154887Schin e = e->re.group.expr.rex; 31164887Schin if (e->type != REX_DOT) 31174887Schin return 0; 31184887Schin /*FALLTHROUGH*/ 31194887Schin case REX_DOT: 31204887Schin if (e->lo == 0 && e->hi == RE_DUP_INF) 31214887Schin break; 31224887Schin return 0; 31234887Schin case REX_NULL: 31244887Schin if (env->flags & REG_NULL) 31254887Schin break; 31264887Schin env->error = REG_ENULL; 31274887Schin return 1; 31284887Schin case REX_STRING: 31298462SApril.Chin@Sun.COM if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) 31304887Schin return 0; 31314887Schin s = e->re.string.base; 31324887Schin n = e->re.string.size; 31334887Schin if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 31344887Schin return 1; 31354887Schin a->flags = e->flags; 31364887Schin a->map = e->map; 31374887Schin f = a->re.string.fail; 31384887Schin memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 31394887Schin s = a->re.string.base; 31404887Schin a->re.string.size = n; 31414887Schin f[0] = m = -1; 31424887Schin for (k = 1; k < n; k++) 31434887Schin { 31444887Schin while (m >= 0 && s[m+1] != s[k]) 31454887Schin m = f[m]; 31464887Schin if (s[m+1] == s[k]) 31474887Schin m++; 31484887Schin f[k] = m; 31494887Schin } 31504887Schin a->next = e->next; 31514887Schin p->env->rex = a; 31524887Schin e->next = 0; 31534887Schin drop(env->disc, e); 31544887Schin break; 31554887Schin default: 31564887Schin return 0; 31574887Schin } 31584887Schin } 31594887Schin p->env->once = 1; 31604887Schin return 0; 31614887Schin } 31624887Schin 31634887Schin int 31644887Schin regcomp(regex_t* p, const char* pattern, regflags_t flags) 31654887Schin { 31664887Schin Rex_t* e; 31674887Schin Rex_t* f; 31684887Schin regdisc_t* disc; 31694887Schin unsigned char* fold; 31704887Schin int i; 31714887Schin Cenv_t env; 31724887Schin 31734887Schin if (!p) 31744887Schin return REG_BADPAT; 31754887Schin if (flags & REG_DISCIPLINE) 31764887Schin { 31774887Schin flags &= ~REG_DISCIPLINE; 31784887Schin disc = p->re_disc; 31794887Schin } 31804887Schin else 31814887Schin disc = &state.disc; 31824887Schin if (!disc->re_errorlevel) 31834887Schin disc->re_errorlevel = 2; 31844887Schin p->env = 0; 31854887Schin if (!pattern) 31864887Schin return fatal(disc, REG_BADPAT, pattern); 31874887Schin if (!state.initialized) 31884887Schin { 31894887Schin state.initialized = 1; 31904887Schin for (i = 0; i < elementsof(state.escape); i++) 31914887Schin state.magic[state.escape[i].key] = state.escape[i].val; 31924887Schin } 31934887Schin if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 31944887Schin { 31954887Schin if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 31964887Schin return fatal(disc, REG_ESPACE, pattern); 31974887Schin for (i = 0; i <= UCHAR_MAX; i++) 31984887Schin fold[i] = toupper(i); 31994887Schin LCINFO(AST_LC_CTYPE)->data = (void*)fold; 32004887Schin } 32014887Schin again: 32024887Schin if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 32034887Schin return fatal(disc, REG_ESPACE, pattern); 32044887Schin memset(p->env, 0, sizeof(*p->env)); 32054887Schin memset(&env, 0, sizeof(env)); 32064887Schin env.regex = p; 32074887Schin env.flags = flags; 32084887Schin env.disc = p->env->disc = disc; 32094887Schin if (env.flags & REG_AUGMENTED) 32104887Schin env.flags |= REG_EXTENDED; 32114887Schin env.mappeddot = '.'; 32124887Schin env.mappednewline = '\n'; 32134887Schin env.mappedslash = '/'; 32144887Schin if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 32154887Schin { 32164887Schin env.map = disc->re_map; 32174887Schin env.MAP = p->env->fold; 32184887Schin for (i = 0; i <= UCHAR_MAX; i++) 32194887Schin { 32204887Schin env.MAP[i] = fold[env.map[i]]; 32214887Schin if (env.map[i] == '.') 32224887Schin env.mappeddot = i; 32234887Schin if (env.map[i] == '\n') 32244887Schin env.mappednewline = i; 32254887Schin if (env.map[i] == '/') 32264887Schin env.mappedslash = i; 32274887Schin } 32284887Schin } 32294887Schin else 32304887Schin env.MAP = fold; 32314887Schin env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 32324887Schin env.explicit = -1; 32334887Schin if (env.flags & REG_SHELL) 32344887Schin { 32354887Schin if (env.flags & REG_SHELL_PATH) 32364887Schin env.explicit = env.mappedslash; 32374887Schin env.flags |= REG_LENIENT|REG_NULL; 32384887Schin env.type = env.type == BRE ? SRE : KRE; 32394887Schin } 32404887Schin if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 32414887Schin env.explicit = env.mappednewline; 32424887Schin p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 32434887Schin env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 3244*10898Sroland.mainz@nrubsig.org env.regexp = !!(env.flags & REG_REGEXP); 32454887Schin env.token.lex = 0; 32464887Schin env.token.push = 0; 32474887Schin if (env.flags & REG_DELIMITED) 32484887Schin { 32494887Schin switch (env.delimiter = *pattern++) 32504887Schin { 32514887Schin case 0: 32524887Schin case '\\': 32534887Schin case '\n': 32544887Schin case '\r': 32554887Schin env.error = REG_EDELIM; 32564887Schin goto bad; 32574887Schin } 32584887Schin env.terminator = '\n'; 32594887Schin } 32604887Schin env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 32614887Schin if (!(p->env->rex = alt(&env, 1, 0))) 32624887Schin goto bad; 32634887Schin if (env.parnest) 32644887Schin { 32654887Schin env.error = REG_EPAREN; 32664887Schin goto bad; 32674887Schin } 32684887Schin p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); 32694887Schin if (env.flags & REG_LEFT) 32704887Schin { 32714887Schin if (p->env->rex->type != REX_BEG) 32724887Schin { 32734887Schin if (p->env->rex->type == REX_ALT) 32744887Schin env.flags &= ~REG_FIRST; 32754887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0))) 32764887Schin { 32774887Schin regfree(p); 32784887Schin return fatal(disc, REG_ESPACE, pattern); 32794887Schin } 32804887Schin e->next = p->env->rex; 32814887Schin p->env->rex = e; 32824887Schin p->env->once = 1; 32834887Schin } 32844887Schin p->env->stats.re_flags |= REG_LEFT; 32854887Schin } 32864887Schin for (e = p->env->rex; e->next; e = e->next); 32874887Schin p->env->done.type = REX_DONE; 32884887Schin p->env->done.flags = e->flags; 32894887Schin if (env.flags & REG_RIGHT) 32904887Schin { 32914887Schin if (e->type != REX_END) 32924887Schin { 32934887Schin if (p->env->rex->type == REX_ALT) 32944887Schin env.flags &= ~REG_FIRST; 32954887Schin if (!(f = node(&env, REX_END, 0, 0, 0))) 32964887Schin { 32974887Schin regfree(p); 32984887Schin return fatal(disc, REG_ESPACE, pattern); 32994887Schin } 33004887Schin f->flags = e->flags; 33014887Schin f->map = e->map; 33024887Schin e->next = f; 33034887Schin } 33044887Schin p->env->stats.re_flags |= REG_RIGHT; 33054887Schin } 33064887Schin if (stats(&env, p->env->rex)) 33074887Schin { 33084887Schin if (!env.error) 33094887Schin env.error = REG_ECOUNT; 33104887Schin goto bad; 33114887Schin } 33124887Schin if (env.stats.b) 33134887Schin p->env->hard = p->env->separate = 1; 33144887Schin else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 33154887Schin p->env->hard = 1; 33164887Schin if (p->env->hard || env.stats.c || env.stats.i) 33174887Schin p->env->stats.re_min = p->env->stats.re_max = -1; 33184887Schin else 33194887Schin { 33204887Schin if (!(p->env->stats.re_min = env.stats.m)) 33214887Schin p->env->stats.re_min = -1; 33224887Schin if (!(p->env->stats.re_max = env.stats.n)) 33234887Schin p->env->stats.re_max = -1; 33244887Schin } 33254887Schin if (special(&env, p)) 33264887Schin goto bad; 33274887Schin serialize(&env, p->env->rex, 1); 33284887Schin p->re_nsub = env.stats.p; 33294887Schin if (env.type == KRE) 33304887Schin p->re_nsub /= 2; 33314887Schin if (env.flags & REG_DELIMITED) 33324887Schin { 33334887Schin p->re_npat = env.cursor - env.pattern + 1; 33344887Schin if (*env.cursor == env.delimiter) 33354887Schin p->re_npat++; 33364887Schin else if (env.flags & REG_MUSTDELIM) 33374887Schin { 33384887Schin env.error = REG_EDELIM; 33394887Schin goto bad; 33404887Schin } 33414887Schin else 33424887Schin env.flags &= ~REG_DELIMITED; 33434887Schin } 33444887Schin p->env->explicit = env.explicit; 33454887Schin p->env->flags = env.flags & REG_COMP; 33464887Schin p->env->min = env.stats.m; 33474887Schin p->env->nsub = env.stats.p + env.stats.u; 33484887Schin p->env->refs = 1; 33494887Schin return 0; 33504887Schin bad: 33514887Schin regfree(p); 33524887Schin if (!env.error) 33534887Schin env.error = REG_ESPACE; 33544887Schin if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 33554887Schin { 33564887Schin flags |= REG_LITERAL; 33574887Schin pattern = (const char*)env.literal; 33584887Schin goto again; 33594887Schin } 33604887Schin return fatal(disc, env.error, pattern); 33614887Schin } 33624887Schin 33634887Schin /* 33644887Schin * regcomp() on sized pattern 33654887Schin * the lazy way around adding and checking env.end 33664887Schin */ 33674887Schin 33684887Schin int 33694887Schin regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 33704887Schin { 33714887Schin char* s; 33724887Schin int r; 33734887Schin 33744887Schin if (!(s = malloc(size + 1))) 33754887Schin return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 33764887Schin memcpy(s, pattern, size); 33774887Schin s[size] = 0; 33784887Schin r = regcomp(p, s, flags); 33794887Schin free(s); 33804887Schin return r; 33814887Schin } 33824887Schin 33834887Schin /* 33844887Schin * combine two compiled regular expressions if possible, 33854887Schin * replacing first with the combination and freeing second. 33864887Schin * return 0 on success. 33874887Schin * the only combinations handled are building a Trie 33884887Schin * from String|Kmp|Trie and String|Kmp 33894887Schin */ 33904887Schin 33914887Schin int 33924887Schin regcomb(regex_t* p, regex_t* q) 33934887Schin { 33944887Schin Rex_t* e = p->env->rex; 33954887Schin Rex_t* f = q->env->rex; 33964887Schin Rex_t* g; 33974887Schin Cenv_t env; 33984887Schin 33994887Schin if (!e || !f) 34004887Schin return fatal(p->env->disc, REG_BADPAT, NiL); 34014887Schin if (p->env->separate || q->env->separate) 34024887Schin return REG_ESUBREG; 34034887Schin memset(&env, 0, sizeof(env)); 34044887Schin env.disc = p->env->disc; 34054887Schin if (e->type == REX_BM) 34064887Schin { 34074887Schin p->env->rex = e->next; 34084887Schin e->next = 0; 34094887Schin drop(env.disc, e); 34104887Schin e = p->env->rex; 34114887Schin } 34124887Schin if (f->type == REX_BM) 34134887Schin { 34144887Schin q->env->rex = f->next; 34154887Schin f->next = 0; 34164887Schin drop(env.disc, f); 34174887Schin f = q->env->rex; 34184887Schin } 34194887Schin if (e->type == REX_BEG && f->type == REX_BEG) 34204887Schin { 34214887Schin p->env->flags |= REG_LEFT; 34224887Schin p->env->rex = e->next; 34234887Schin e->next = 0; 34244887Schin drop(env.disc, e); 34254887Schin e = p->env->rex; 34264887Schin q->env->rex = f->next; 34274887Schin f->next = 0; 34284887Schin drop(env.disc, f); 34294887Schin f = q->env->rex; 34304887Schin } 34314887Schin if (e->next && e->next->type == REX_END && f->next && f->next->type == REX_END) 34324887Schin { 34334887Schin p->env->flags |= REG_RIGHT; 34344887Schin drop(env.disc, e->next); 34354887Schin e->next = 0; 34364887Schin drop(env.disc, f->next); 34374887Schin f->next = 0; 34384887Schin } 34394887Schin if (!(g = trie(&env, f, e))) 34404887Schin return fatal(p->env->disc, REG_BADPAT, NiL); 34414887Schin p->env->rex = g; 34424887Schin if (!q->env->once) 34434887Schin p->env->once = 0; 34444887Schin q->env->rex = 0; 34454887Schin if (p->env->flags & REG_LEFT) 34464887Schin { 34474887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0))) 34484887Schin { 34494887Schin regfree(p); 34504887Schin return fatal(p->env->disc, REG_ESPACE, NiL); 34514887Schin } 34524887Schin e->next = p->env->rex; 34534887Schin p->env->rex = e; 34544887Schin p->env->once = 1; 34554887Schin } 34564887Schin if (p->env->flags & REG_RIGHT) 34574887Schin { 34584887Schin for (f = p->env->rex; f->next; f = f->next); 34594887Schin if (f->type != REX_END) 34604887Schin { 34614887Schin if (!(e = node(&env, REX_END, 0, 0, 0))) 34624887Schin { 34634887Schin regfree(p); 34644887Schin return fatal(p->env->disc, REG_ESPACE, NiL); 34654887Schin } 34664887Schin f->next = e; 34674887Schin } 34684887Schin } 34694887Schin env.explicit = p->env->explicit; 34704887Schin env.flags = p->env->flags; 34714887Schin env.disc = p->env->disc; 34724887Schin if (stats(&env, p->env->rex)) 34734887Schin { 34744887Schin regfree(p); 34754887Schin return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 34764887Schin } 34774887Schin if (special(&env, p)) 34784887Schin { 34794887Schin regfree(p); 34804887Schin return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 34814887Schin } 34824887Schin p->env->min = g->re.trie.min; 34834887Schin return 0; 34844887Schin } 34854887Schin 34864887Schin /* 34874887Schin * copy a reference of p into q 34884887Schin * p and q may then have separate regsubcomp() instantiations 34894887Schin */ 34904887Schin 34914887Schin int 34924887Schin regdup(regex_t* p, regex_t* q) 34934887Schin { 34944887Schin if (!p || !q) 34954887Schin return REG_BADPAT; 34964887Schin *q = *p; 34974887Schin p->env->refs++; 34984887Schin q->re_sub = 0; 34994887Schin return 0; 35004887Schin } 3501