14887Schin /*********************************************************************** 24887Schin * * 34887Schin * This software is part of the ast package * 4*8462SApril.Chin@Sun.COM * Copyright (c) 1985-2008 AT&T Intellectual Property * 54887Schin * and is licensed under the * 64887Schin * Common Public License, Version 1.0 * 7*8462SApril.Chin@Sun.COM * by AT&T Intellectual Property * 84887Schin * * 94887Schin * A copy of the License is available at * 104887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 124887Schin * * 134887Schin * Information and Software Systems Research * 144887Schin * AT&T Research * 154887Schin * Florham Park NJ * 164887Schin * * 174887Schin * Glenn Fowler <gsf@research.att.com> * 184887Schin * David Korn <dgk@research.att.com> * 194887Schin * Phong Vo <kpv@research.att.com> * 204887Schin * * 214887Schin ***********************************************************************/ 224887Schin #pragma prototyped 234887Schin 244887Schin /* 254887Schin * posix regex compiler 264887Schin */ 274887Schin 284887Schin #include "reglib.h" 294887Schin 304887Schin #if _PACKAGE_ast 314887Schin #include "lclib.h" 324887Schin #endif 334887Schin 344887Schin #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 354887Schin 364887Schin #define C_ESC (-1) 374887Schin #define C_MB (-2) 384887Schin 394887Schin #if _AST_REGEX_DEBUG 404887Schin 414887Schin #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 424887Schin #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 434887Schin #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 444887Schin 454887Schin static unsigned long debug; 464887Schin static unsigned long debug_flag; 474887Schin 484887Schin #else 494887Schin 504887Schin #define DEBUG_INIT() 514887Schin #define DEBUG_TEST(f,y,n) (n) 524887Schin #define DEBUG_CODE(f,y,n) do {n} while(0) 534887Schin 544887Schin #endif 554887Schin 564887Schin #if _PACKAGE_ast 574887Schin 584887Schin typedef struct Cchr_s 594887Schin { 604887Schin Dtlink_t lnk; 614887Schin unsigned char nam[2]; 624887Schin Ckey_t key; 634887Schin } Cchr_t; 644887Schin 654887Schin #endif 664887Schin 674887Schin #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 684887Schin 694887Schin /* 704887Schin * determine whether greedy matching will work, i.e. produce 714887Schin * the best match first. such expressions are "easy", and 724887Schin * need no backtracking once a complete match is found. 734887Schin * if an expression has backreferences or alts it's hard 744887Schin * else if it has only one closure it's easy 754887Schin * else if all closures are simple (i.e. one-character) it's easy 764887Schin * else it's hard. 774887Schin */ 784887Schin 794887Schin typedef struct Stats_s 804887Schin { 814887Schin unsigned long l; /* min length to left of x */ 824887Schin unsigned long k; /* min length to left of y */ 834887Schin unsigned long m; /* min length */ 844887Schin unsigned long n; /* max length */ 854887Schin unsigned short a; /* number of alternations */ 864887Schin unsigned short b; /* number of backrefs */ 874887Schin unsigned short c; /* number of closures */ 884887Schin unsigned short i; /* number of negations */ 894887Schin unsigned short p; /* number of named subexpressions */ 904887Schin unsigned short s; /* number of simple closures */ 914887Schin unsigned short t; /* number of tries */ 924887Schin unsigned short u; /* number of unnamed subexpressions */ 934887Schin Rex_t* x; /* max length REX_STRING */ 944887Schin Rex_t* y; /* max length REX_TRIE */ 954887Schin } Stats_t; 964887Schin 974887Schin typedef struct Token_s 984887Schin { 994887Schin unsigned long min; 1004887Schin unsigned long max; 1014887Schin short lex; 1024887Schin short len; 1034887Schin short esc; 1044887Schin short att; 1054887Schin short push; 1064887Schin } Token_t; 1074887Schin 1084887Schin typedef struct Cenv_s 1094887Schin { 1104887Schin int delimiter; /* pattern delimiter */ 1114887Schin int error; /* last error */ 1124887Schin int explicit; /* explicit match on this char */ 1134887Schin int mappeddot; /* inverse mapped '.' */ 1144887Schin int mappednewline; /* inverse mapped '\n' */ 1154887Schin int mappedslash; /* inverse mapped '/' */ 1164887Schin regflags_t flags; /* flags arg to regcomp */ 1174887Schin int type; /* BRE,ERE,ARE,SRE,KRE */ 1184887Schin unsigned char* cursor; /* curent point in re */ 1194887Schin unsigned char* pattern; /* the original pattern */ 1204887Schin unsigned char* literal; /* literal restart pattern */ 1214887Schin int parno; /* number of last open paren */ 1224887Schin int parnest; /* paren nest count */ 1234887Schin int posixkludge; /* to make * nonspecial */ 1244887Schin Token_t token; /* token lookahead */ 1254887Schin Stats_t stats; /* RE statistics */ 1264887Schin int terminator; /* pattern terminator */ 1274887Schin Rex_t* paren[2*(BACK_REF_MAX+2)]; 1284887Schin /* paren[i]!=0 if \i defined */ 1294887Schin regex_t* regex; /* user handle */ 1304887Schin regdisc_t* disc; /* user discipline */ 1314887Schin unsigned char* map; /* external to native ccode map */ 1324887Schin unsigned char* MAP; /* fold and/or map */ 1334887Schin } Cenv_t; 1344887Schin 1354887Schin /* 1364887Schin * allocate a new Rex_t node 1374887Schin */ 1384887Schin 1394887Schin static Rex_t* 1404887Schin node(Cenv_t* env, int type, int lo, int hi, size_t extra) 1414887Schin { 1424887Schin register Rex_t* e; 1434887Schin 1444887Schin if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 1454887Schin { 1464887Schin memset(e, 0, sizeof(Rex_t) + extra); 1474887Schin e->type = type; 1484887Schin e->marked = 0; 1494887Schin e->lo = lo; 1504887Schin e->hi = hi; 1514887Schin e->flags = env->flags; 1524887Schin e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 1534887Schin e->explicit = env->explicit; 1544887Schin if (extra) 1554887Schin e->re.data = (char*)e + sizeof(Rex_t); 1564887Schin } 1574887Schin return e; 1584887Schin } 1594887Schin 1604887Schin /* 1614887Schin * free a Trie_node_t node 1624887Schin */ 1634887Schin 1644887Schin static void 1654887Schin triedrop(regdisc_t* disc, Trie_node_t* e) 1664887Schin { 1674887Schin if (e) 1684887Schin { 1694887Schin triedrop(disc, e->son); 1704887Schin triedrop(disc, e->sib); 1714887Schin alloc(disc, e, 0); 1724887Schin } 1734887Schin } 1744887Schin 1754887Schin /* 1764887Schin * free a Rex_t node 1774887Schin */ 1784887Schin 1794887Schin void 1804887Schin drop(regdisc_t* disc, Rex_t* e) 1814887Schin { 1824887Schin int i; 1834887Schin Rex_t* x; 1844887Schin 1854887Schin if (e && !(disc->re_flags & REG_NOFREE)) 1864887Schin do 1874887Schin { 1884887Schin switch (e->type) 1894887Schin { 1904887Schin case REX_ALT: 1914887Schin case REX_CONJ: 1924887Schin drop(disc, e->re.group.expr.binary.left); 1934887Schin drop(disc, e->re.group.expr.binary.right); 1944887Schin break; 1954887Schin case REX_GROUP: 1964887Schin case REX_GROUP_AHEAD: 1974887Schin case REX_GROUP_AHEAD_NOT: 1984887Schin case REX_GROUP_BEHIND: 1994887Schin case REX_GROUP_BEHIND_NOT: 2004887Schin case REX_GROUP_CUT: 2014887Schin case REX_NEG: 2024887Schin case REX_REP: 2034887Schin drop(disc, e->re.group.expr.rex); 2044887Schin break; 2054887Schin case REX_TRIE: 2064887Schin for (i = 0; i <= UCHAR_MAX; i++) 2074887Schin triedrop(disc, e->re.trie.root[i]); 2084887Schin break; 2094887Schin } 2104887Schin x = e->next; 2114887Schin alloc(disc, e, 0); 2124887Schin } while (e = x); 2134887Schin } 2144887Schin 2154887Schin /* 2164887Schin * mark e and descendants minimal 2174887Schin */ 2184887Schin 2194887Schin static void 2204887Schin mark(register Rex_t* e, int set) 2214887Schin { 2224887Schin if (e && !e->marked) 2234887Schin do 2244887Schin { 2254887Schin e->marked = 1; 2264887Schin if (set) 2274887Schin e->flags |= REG_MINIMAL; 2284887Schin else 2294887Schin e->flags &= ~REG_MINIMAL; 2304887Schin switch (e->type) 2314887Schin { 2324887Schin case REX_ALT: 2334887Schin case REX_CONJ: 2344887Schin case REX_GROUP_COND: 2354887Schin if (e->re.group.expr.binary.left) 2364887Schin mark(e->re.group.expr.binary.left, set); 2374887Schin if (e->re.group.expr.binary.right) 2384887Schin mark(e->re.group.expr.binary.right, set); 2394887Schin break; 2404887Schin case REX_GROUP: 2414887Schin case REX_GROUP_AHEAD: 2424887Schin case REX_GROUP_AHEAD_NOT: 2434887Schin case REX_GROUP_BEHIND: 2444887Schin case REX_GROUP_BEHIND_NOT: 2454887Schin case REX_GROUP_CUT: 2464887Schin case REX_NEG: 2474887Schin case REX_REP: 2484887Schin case REX_TRIE: 2494887Schin mark(e->re.group.expr.rex, set); 2504887Schin break; 2514887Schin } 2524887Schin } while (e = e->next); 2534887Schin } 2544887Schin 2554887Schin /* 2564887Schin * assign subexpression numbers by a preorder tree walk 2574887Schin */ 2584887Schin 2594887Schin static int 2604887Schin serialize(Cenv_t* env, Rex_t* e, int n) 2614887Schin { 2624887Schin do 2634887Schin { 2644887Schin e->serial = n++; 2654887Schin switch (e->type) 2664887Schin { 2674887Schin case REX_ALT: 2684887Schin case REX_GROUP_COND: 2694887Schin if (e->re.group.expr.binary.left) 2704887Schin n = serialize(env, e->re.group.expr.binary.left, n); 2714887Schin e->re.group.expr.binary.serial = n++; 2724887Schin if (e->re.group.expr.binary.right) 2734887Schin n = serialize(env, e->re.group.expr.binary.right, n); 2744887Schin break; 2754887Schin case REX_CONJ: 2764887Schin n = serialize(env, e->re.group.expr.binary.left, n); 2774887Schin n = serialize(env, e->re.group.expr.binary.right, n); 2784887Schin break; 2794887Schin case REX_GROUP: 2804887Schin case REX_GROUP_AHEAD: 2814887Schin case REX_GROUP_AHEAD_NOT: 2824887Schin case REX_GROUP_BEHIND: 2834887Schin case REX_GROUP_BEHIND_NOT: 2844887Schin case REX_GROUP_CUT: 2854887Schin case REX_NEG: 2864887Schin case REX_REP: 2874887Schin n = serialize(env, e->re.group.expr.rex, n); 2884887Schin break; 2894887Schin } 2904887Schin } while (e = e->next); 2914887Schin return n; 2924887Schin } 2934887Schin 2944887Schin /* 2954887Schin * catenate e and f into a sequence, collapsing them if possible 2964887Schin */ 2974887Schin 2984887Schin static Rex_t* 2994887Schin cat(Cenv_t* env, Rex_t* e, Rex_t* f) 3004887Schin { 3014887Schin Rex_t* g; 3024887Schin 3034887Schin if (!f) 3044887Schin { 3054887Schin drop(env->disc, e); 3064887Schin return 0; 3074887Schin } 3084887Schin if (e->type == REX_NULL) 3094887Schin { 3104887Schin drop(env->disc, e); 3114887Schin return f; 3124887Schin } 3134887Schin if (f->type == REX_NULL) 3144887Schin { 3154887Schin g = f->next; 3164887Schin f->next = 0; 3174887Schin drop(env->disc, f); 3184887Schin f = g; 3194887Schin } 3204887Schin else if (e->type == REX_DOT && f->type == REX_DOT) 3214887Schin { 3224887Schin unsigned int m = e->lo + f->lo; 3234887Schin unsigned int n = e->hi + f->hi; 3244887Schin 3254887Schin if (m <= RE_DUP_MAX) 3264887Schin { 3274887Schin if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 3284887Schin { 3294887Schin n = RE_DUP_INF; 3304887Schin goto combine; 3314887Schin } 3324887Schin else if (n <= RE_DUP_MAX) 3334887Schin { 3344887Schin combine: 3354887Schin e->lo = m; 3364887Schin e->hi = n; 3374887Schin g = f->next; 3384887Schin f->next = 0; 3394887Schin drop(env->disc, f); 3404887Schin f = g; 3414887Schin } 3424887Schin } 3434887Schin } 3444887Schin e->next = f; 3454887Schin return e; 3464887Schin } 3474887Schin 3484887Schin /* 3494887Schin * collect re statistics 3504887Schin */ 3514887Schin 3524887Schin static int 3534887Schin stats(register Cenv_t* env, register Rex_t* e) 3544887Schin { 3554887Schin register unsigned long n; 3564887Schin register unsigned long m; 3574887Schin unsigned long cm; 3584887Schin unsigned long nm; 3594887Schin unsigned long cn; 3604887Schin unsigned long nn; 3614887Schin unsigned long l; 3624887Schin unsigned long k; 3634887Schin unsigned long t; 3644887Schin Rex_t* q; 3654887Schin Rex_t* x; 3664887Schin Rex_t* y; 3674887Schin unsigned char c; 3684887Schin unsigned char b; 3694887Schin 3704887Schin do 3714887Schin { 3724887Schin switch (e->type) 3734887Schin { 3744887Schin case REX_ALT: 3754887Schin x = env->stats.x; 3764887Schin l = env->stats.l; 3774887Schin y = env->stats.y; 3784887Schin k = env->stats.k; 3794887Schin t = env->stats.t; 3804887Schin if (++env->stats.a <= 0) 3814887Schin return 1; 3824887Schin cm = env->stats.m; 3834887Schin env->stats.m = 0; 3844887Schin cn = env->stats.n; 3854887Schin env->stats.n = 0; 3864887Schin if (stats(env, e->re.group.expr.binary.left)) 3874887Schin return 1; 3884887Schin m = env->stats.m; 3894887Schin env->stats.m = 0; 3904887Schin n = env->stats.n; 3914887Schin env->stats.n = 0; 3924887Schin if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 3934887Schin return 1; 3944887Schin if (env->stats.m > m) 3954887Schin env->stats.m = m; 3964887Schin else 3974887Schin m = env->stats.m; 3984887Schin if ((env->stats.m += cm) < m) 3994887Schin return 1; 4004887Schin if (env->stats.n < n) 4014887Schin env->stats.n = n; 4024887Schin else 4034887Schin n = env->stats.n; 4044887Schin if ((env->stats.n += cn) < n) 4054887Schin return 1; 4064887Schin env->stats.x = x; 4074887Schin env->stats.l = l; 4084887Schin env->stats.y = y; 4094887Schin env->stats.k = k; 4104887Schin env->stats.t = t; 4114887Schin break; 4124887Schin case REX_BACK: 4134887Schin if (++env->stats.b <= 0) 4144887Schin return 1; 4154887Schin break; 4164887Schin case REX_CLASS: 4174887Schin case REX_COLL_CLASS: 4184887Schin case REX_DOT: 4194887Schin case REX_ONECHAR: 4204887Schin n = env->stats.m; 4214887Schin if ((env->stats.m += e->lo) < n) 4224887Schin return 1; 4234887Schin if (e->hi != RE_DUP_INF) 4244887Schin { 4254887Schin n = env->stats.n; 4264887Schin if ((env->stats.n += e->hi) < n) 4274887Schin return 1; 4284887Schin } 4294887Schin if (e->lo != e->hi) 4304887Schin { 4314887Schin if (++env->stats.c <= 0) 4324887Schin return 1; 4334887Schin if (++env->stats.s <= 0) 4344887Schin return 1; 4354887Schin } 4364887Schin break; 4374887Schin case REX_CONJ: 4384887Schin cm = env->stats.m; 4394887Schin env->stats.m = 0; 4404887Schin cn = env->stats.n; 4414887Schin env->stats.n = 0; 4424887Schin if (stats(env, e->re.group.expr.binary.left)) 4434887Schin return 1; 4444887Schin nm = env->stats.m; 4454887Schin env->stats.m = 0; 4464887Schin nn = env->stats.n; 4474887Schin env->stats.n = 0; 4484887Schin if (stats(env, e->re.group.expr.binary.right)) 4494887Schin return 1; 4504887Schin if (env->stats.m < nm) 4514887Schin env->stats.m = nm; 4524887Schin else 4534887Schin nm = env->stats.m; 4544887Schin if ((env->stats.m += cm) < nm) 4554887Schin return 1; 4564887Schin if (env->stats.n < nn) 4574887Schin env->stats.n = nn; 4584887Schin else 4594887Schin nn = env->stats.n; 4604887Schin if ((env->stats.n += cn) < nn) 4614887Schin return 1; 4624887Schin break; 4634887Schin case REX_GROUP: 4644887Schin if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 4654887Schin return 1; 4664887Schin if (stats(env, e->re.group.expr.rex)) 4674887Schin return 1; 4684887Schin break; 4694887Schin case REX_GROUP_AHEAD: 4704887Schin case REX_GROUP_AHEAD_NOT: 4714887Schin case REX_GROUP_BEHIND: 4724887Schin case REX_GROUP_BEHIND_NOT: 4734887Schin m = env->stats.m; 4744887Schin n = env->stats.n; 4754887Schin x = env->stats.x; 4764887Schin y = env->stats.y; 4774887Schin if (stats(env, e->re.group.expr.rex)) 4784887Schin return 1; 4794887Schin env->stats.m = m; 4804887Schin env->stats.n = n; 4814887Schin env->stats.x = x; 4824887Schin env->stats.y = y; 4834887Schin switch (e->type) 4844887Schin { 4854887Schin case REX_GROUP_AHEAD: 4864887Schin case REX_GROUP_BEHIND: 4874887Schin if (++env->stats.u <= 0) 4884887Schin return 1; 4894887Schin break; 4904887Schin } 4914887Schin break; 4924887Schin case REX_GROUP_COND: 4934887Schin if (++env->stats.u <= 0) 4944887Schin return 1; 4954887Schin m = env->stats.m; 4964887Schin n = env->stats.n; 4974887Schin x = env->stats.x; 4984887Schin y = env->stats.y; 4994887Schin if (e->re.group.size > 0 && ++env->stats.b <= 0) 5004887Schin return 1; 5014887Schin if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 5024887Schin return 1; 5034887Schin if (q = e->re.group.expr.binary.right) 5044887Schin { 5054887Schin if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 5064887Schin return 1; 5074887Schin if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 5084887Schin return 1; 5094887Schin } 5104887Schin env->stats.m = m; 5114887Schin env->stats.n = n; 5124887Schin env->stats.x = x; 5134887Schin env->stats.y = y; 5144887Schin break; 5154887Schin case REX_GROUP_CUT: 5164887Schin if (++env->stats.u <= 0) 5174887Schin return 1; 5184887Schin m = env->stats.m; 5194887Schin n = env->stats.n; 5204887Schin x = env->stats.x; 5214887Schin y = env->stats.y; 5224887Schin if (stats(env, e->re.group.expr.rex)) 5234887Schin return 1; 5244887Schin env->stats.m = m; 5254887Schin env->stats.n = n; 5264887Schin env->stats.x = x; 5274887Schin env->stats.y = y; 5284887Schin break; 5294887Schin case REX_NEG: 5304887Schin env->stats.i++; 5314887Schin x = env->stats.x; 5324887Schin l = env->stats.l; 5334887Schin y = env->stats.y; 5344887Schin k = env->stats.k; 5354887Schin t = env->stats.t; 5364887Schin cm = env->stats.m; 5374887Schin env->stats.m = 0; 5384887Schin if (stats(env, e->re.group.expr.rex)) 5394887Schin return 1; 5404887Schin env->stats.m = !env->stats.m; 5414887Schin if ((env->stats.m += cm) < cm) 5424887Schin return 1; 5434887Schin env->stats.x = x; 5444887Schin env->stats.l = l; 5454887Schin env->stats.y = y; 5464887Schin env->stats.k = k; 5474887Schin env->stats.t = t; 5484887Schin break; 5494887Schin case REX_REP: 5504887Schin x = env->stats.x; 5514887Schin l = env->stats.l; 5524887Schin y = env->stats.y; 5534887Schin k = env->stats.k; 5544887Schin t = env->stats.t; 5554887Schin if (++env->stats.c <= 0) 5564887Schin return 1; 5574887Schin b = env->stats.b; 5584887Schin c = env->stats.c; 5594887Schin cm = env->stats.m; 5604887Schin env->stats.m = 0; 5614887Schin if (stats(env, e->re.group.expr.rex)) 5624887Schin return 1; 5634887Schin if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 5644887Schin return 1; 5654887Schin if (e->lo < 1) 5664887Schin { 5674887Schin env->stats.x = x; 5684887Schin env->stats.l = l; 5694887Schin env->stats.y = y; 5704887Schin env->stats.k = k; 5714887Schin env->stats.t = t; 5724887Schin env->stats.m = cm; 5734887Schin } 5744887Schin else 5754887Schin { 5764887Schin m = env->stats.m; 5774887Schin if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 5784887Schin return 1; 5794887Schin m = env->stats.m; 5804887Schin if ((env->stats.m += cm) < m) 5814887Schin return 1; 5824887Schin if (env->stats.x != x) 5834887Schin env->stats.l = cm; 5844887Schin if (env->stats.y != y) 5854887Schin env->stats.k = cm; 5864887Schin } 5874887Schin break; 5884887Schin case REX_STRING: 589*8462SApril.Chin@Sun.COM if (!e->map) 5904887Schin { 591*8462SApril.Chin@Sun.COM cm = env->stats.m; 592*8462SApril.Chin@Sun.COM if ((env->stats.m += e->re.string.size) < cm) 593*8462SApril.Chin@Sun.COM return 1; 594*8462SApril.Chin@Sun.COM cn = env->stats.n; 595*8462SApril.Chin@Sun.COM if ((env->stats.n += e->re.string.size) < cn) 596*8462SApril.Chin@Sun.COM return 1; 597*8462SApril.Chin@Sun.COM if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 598*8462SApril.Chin@Sun.COM { 599*8462SApril.Chin@Sun.COM env->stats.x = e; 600*8462SApril.Chin@Sun.COM env->stats.l = cm; 601*8462SApril.Chin@Sun.COM } 6024887Schin } 6034887Schin break; 6044887Schin case REX_TRIE: 6054887Schin if (++env->stats.s <= 0) 6064887Schin return 1; 6074887Schin cm = env->stats.m; 6084887Schin if ((env->stats.m += e->re.trie.min) < cm) 6094887Schin return 1; 6104887Schin cn = env->stats.n; 6114887Schin if ((env->stats.n += e->re.trie.max) < cn) 6124887Schin return 1; 6134887Schin env->stats.t++; 6144887Schin if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 6154887Schin { 6164887Schin env->stats.y = e; 6174887Schin env->stats.k = cm; 6184887Schin } 6194887Schin break; 6204887Schin } 6214887Schin } while (e = e->next); 6224887Schin return 0; 6234887Schin } 6244887Schin 6254887Schin static int token(Cenv_t*); 6264887Schin 6274887Schin static int 6284887Schin magic(register Cenv_t* env, register int c, int escaped) 6294887Schin { 6304887Schin register char* sp; 6314887Schin register int n; 6324887Schin int o = c; 6334887Schin int e = env->error; 6344887Schin int l = env->token.len; 6354887Schin short* mp; 6364887Schin char* ep; 6374887Schin 6384887Schin if (mp = state.magic[c]) 6394887Schin { 6404887Schin c = mp[env->type+escaped]; 6414887Schin if (c >= T_META) 6424887Schin { 6434887Schin sp = (char*)env->cursor + env->token.len; 6444887Schin switch (c) 6454887Schin { 6464887Schin case T_LEFT: 6474887Schin n = 0; 6484887Schin ep = sp; 6494887Schin while (*sp >= '0' && *sp <= '9') 6504887Schin { 6514887Schin if (n > (INT_MAX / 10)) 6524887Schin { 6534887Schin env->error = REG_BADBR; 6544887Schin goto bad; 6554887Schin } 6564887Schin n = n * 10 + *sp++ - '0'; 6574887Schin } 6584887Schin if (sp == ep) 6594887Schin { 660*8462SApril.Chin@Sun.COM if (env->type < SRE || *sp != ',') 661*8462SApril.Chin@Sun.COM { 662*8462SApril.Chin@Sun.COM env->error = *sp ? REG_BADBR : REG_EBRACE; 663*8462SApril.Chin@Sun.COM goto bad; 664*8462SApril.Chin@Sun.COM } 6654887Schin } 6664887Schin else if (n > RE_DUP_MAX) 6674887Schin { 6684887Schin env->error = REG_BADBR; 6694887Schin goto bad; 6704887Schin } 6714887Schin env->token.min = n; 6724887Schin if (*sp == ',') 6734887Schin { 6744887Schin n = 0; 6754887Schin ep = ++sp; 6764887Schin while (*sp >= '0' && *sp <= '9') 6774887Schin { 6784887Schin if (n > (INT_MAX / 10)) 6794887Schin { 6804887Schin env->error = REG_BADBR; 6814887Schin goto bad; 6824887Schin } 6834887Schin n = n * 10 + *sp++ - '0'; 6844887Schin } 6854887Schin if (sp == ep) 6864887Schin n = RE_DUP_INF; 6874887Schin else if (n < env->token.min) 6884887Schin { 6894887Schin env->error = REG_BADBR; 6904887Schin goto bad; 6914887Schin } 6924887Schin } 6934887Schin env->token.max = n; 6944887Schin switch (*sp) 6954887Schin { 6964887Schin case 0: 6974887Schin env->error = REG_EBRACE; 6984887Schin goto bad; 6994887Schin case '\\': 7004887Schin if (!escaped) 7014887Schin { 7024887Schin env->error = REG_BADBR; 7034887Schin goto bad; 7044887Schin } 7054887Schin sp++; 7064887Schin break; 7074887Schin default: 7084887Schin if (escaped) 7094887Schin { 7104887Schin env->error = REG_BADBR; 7114887Schin goto bad; 7124887Schin } 7134887Schin break; 7144887Schin } 7154887Schin switch (*sp++) 7164887Schin { 7174887Schin case 0: 7184887Schin env->error = REG_EBRACE; 7194887Schin goto bad; 7204887Schin case '}': 7214887Schin break; 7224887Schin default: 7234887Schin env->error = REG_BADBR; 7244887Schin goto bad; 7254887Schin } 7264887Schin env->token.len = sp - (char*)env->cursor; 7274887Schin break; 7284887Schin case T_RIGHT: 7294887Schin env->error = REG_EBRACE; 7304887Schin goto bad; 7314887Schin case T_OPEN: 7324887Schin if (env->type < SRE && *sp == '?') 7334887Schin { 7344887Schin env->token.len++; 7354887Schin env->token.lex = 0; 7364887Schin goto group; 7374887Schin } 7384887Schin break; 7394887Schin case T_ESCAPE: 7404887Schin c = chresc(sp - 2, &ep); 7414887Schin if (ep < sp) 7424887Schin goto bad; 7434887Schin env->token.len += ep - sp; 7444887Schin if (c >= T_META) 7454887Schin { 7464887Schin env->token.lex = c; 7474887Schin c = C_ESC; 7484887Schin } 7494887Schin return c; 7504887Schin case T_BACK+0: 7514887Schin case T_BACK+1: 7524887Schin case T_BACK+2: 7534887Schin case T_BACK+3: 7544887Schin case T_BACK+4: 7554887Schin case T_BACK+5: 7564887Schin case T_BACK+6: 7574887Schin case T_BACK+7: 7584887Schin n = chresc(sp - 2, &ep); 7594887Schin if (ep > sp + 1) 7604887Schin { 7614887Schin env->token.len += ep - sp; 7624887Schin return n; 7634887Schin } 7644887Schin /*FALLTHROUGH*/ 7654887Schin case T_BACK+8: 7664887Schin case T_BACK+9: 7674887Schin if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) 7684887Schin { 7694887Schin env->error = REG_BADESC; 7704887Schin goto bad; 7714887Schin } 7724887Schin if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 7734887Schin { 7744887Schin c = (c - T_BACK) * 10 + (*sp - '0'); 7754887Schin if (c > 0 && c <= env->parno && env->paren[c]) 7764887Schin c += T_BACK; 7774887Schin else 7784887Schin c = chresc(sp - 2, &ep); 7794887Schin env->token.len++; 7804887Schin } 7814887Schin if (c == T_BACK) 7824887Schin c = 0; 7834887Schin break; 7844887Schin case T_BAD: 7854887Schin if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) 7864887Schin return c; 7874887Schin goto bad; 7884887Schin } 7894887Schin if (env->type >= SRE) 7904887Schin { 7914887Schin if (c == T_DOT) 7924887Schin c = '.'; 7934887Schin else if (c < T_OPEN) 7944887Schin { 7954887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 7964887Schin { 7974887Schin env->token.len++; 7984887Schin env->token.att = 1; 7994887Schin } 8004887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '(') 8014887Schin { 8024887Schin env->token.len++; 8034887Schin switch (c) 8044887Schin { 8054887Schin case T_AT: 8064887Schin break; 8074887Schin case T_PERCENT: 8084887Schin env->token.lex = c; 8094887Schin goto group; 8104887Schin case T_TILDE: 8114887Schin env->token.lex = 0; 8124887Schin goto group; 8134887Schin default: 8144887Schin env->token.lex = c; 8154887Schin break; 8164887Schin } 8174887Schin c = T_OPEN; 8184887Schin } 8194887Schin else if (c == T_STAR) 8204887Schin c = T_DOTSTAR; 8214887Schin else if (c == T_QUES) 8224887Schin c = T_DOT; 8234887Schin else 8244887Schin { 8254887Schin c = o; 8264887Schin env->token.len = l; 8274887Schin } 8284887Schin } 8294887Schin else if (c > T_BACK) 8304887Schin { 8314887Schin c = (c - T_BACK) * 2 - 1; 8324887Schin c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 8334887Schin } 8344887Schin else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 8354887Schin { 8364887Schin if (c == T_AND) 8374887Schin c = '&'; 8384887Schin else if (c == T_BAR) 8394887Schin c = '|'; 8404887Schin else if (c == T_OPEN) 8414887Schin c = '('; 8424887Schin } 8434887Schin } 8444887Schin } 8454887Schin } 8464887Schin else if (escaped == 2) 8474887Schin { 8484887Schin if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 8494887Schin /*ok*/; 8504887Schin else 8514887Schin { 8524887Schin env->error = REG_BADESC; 8534887Schin goto bad; 8544887Schin } 8554887Schin } 8564887Schin else if (escaped && !(env->flags & REG_LENIENT) && c != ']') 8574887Schin { 8584887Schin env->error = REG_BADESC; 8594887Schin goto bad; 8604887Schin } 8614887Schin return c; 8624887Schin group: 8634887Schin sp = (char*)env->cursor + env->token.len; 8644887Schin switch (*sp++) 8654887Schin { 8664887Schin case ')': 8674887Schin break; 8684887Schin case '#': 8694887Schin for (;;) 8704887Schin { 8714887Schin switch (*sp++) 8724887Schin { 8734887Schin case 0: 8744887Schin env->error = REG_EPAREN; 8754887Schin return T_BAD; 8764887Schin case ')': 8774887Schin break; 8784887Schin default: 8794887Schin continue; 8804887Schin } 8814887Schin break; 8824887Schin } 8834887Schin break; 8844887Schin default: 8854887Schin return T_GROUP; 8864887Schin } 8874887Schin env->cursor = (unsigned char*)sp; 8884887Schin return token(env); 8894887Schin bad: 8904887Schin if (escaped == 2) 8914887Schin env->error = e; 8924887Schin else if (env->flags & REG_LENIENT) 8934887Schin return o; 8944887Schin else if (escaped == 1 && !env->error) 8954887Schin { 8964887Schin if (mp || o == ']') 8974887Schin return o; 8984887Schin env->error = REG_BADESC; 8994887Schin } 9004887Schin return T_BAD; 9014887Schin } 9024887Schin 9034887Schin static int 9044887Schin token(register Cenv_t* env) 9054887Schin { 9064887Schin int c; 9074887Schin int posixkludge; 9084887Schin 9094887Schin if (env->token.push) 9104887Schin return env->token.lex; 9114887Schin env->token.att = env->token.esc = 0; 9124887Schin if ((env->token.len = MBSIZE(env->cursor)) > 1) 9134887Schin return env->token.lex = C_MB; 9144887Schin env->token.lex = 0; 9154887Schin for (;;) 9164887Schin { 9174887Schin c = *env->cursor; 9184887Schin if (c == 0 || c == env->delimiter || c == env->terminator) 9194887Schin return T_END; 9204887Schin if (!(env->flags & REG_COMMENT)) 9214887Schin break; 9224887Schin if (c == '#') 9234887Schin { 9244887Schin do 9254887Schin { 9264887Schin c = *++env->cursor; 9274887Schin if (c == 0 || c == env->delimiter) 9284887Schin return T_END; 9294887Schin } while (c != '\n'); 9304887Schin } 9314887Schin else if (!isspace(c)) 9324887Schin break; 9334887Schin env->cursor++; 9344887Schin } 9354887Schin if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 9364887Schin { 9374887Schin if (env->parnest) 9384887Schin { 9394887Schin env->error = REG_EPAREN; 9404887Schin return T_BAD; 9414887Schin } 9424887Schin env->parno = 0; 9434887Schin env->pattern = env->cursor + 1; 9444887Schin return T_BAR; 9454887Schin } 9464887Schin if (env->flags & REG_LITERAL) 9474887Schin return c; 9484887Schin if (posixkludge = env->posixkludge) 9494887Schin { 9504887Schin env->posixkludge = 0; 9514887Schin if (c == '*') 9524887Schin return c; 9534887Schin } 9544887Schin if (c == '\\') 9554887Schin { 9564887Schin if (env->flags & REG_SHELL_ESCAPED) 9574887Schin return c; 9584887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator) 9594887Schin { 9604887Schin if (env->flags & REG_LENIENT) 9614887Schin { 9624887Schin if (c) 9634887Schin { 9644887Schin env->token.esc = env->token.len; 9654887Schin env->token.len += MBSIZE(env->cursor + 1); 9664887Schin return c; 9674887Schin } 9684887Schin return '\\'; 9694887Schin } 9704887Schin env->error = REG_EESCAPE; 9714887Schin return T_BAD; 9724887Schin } 9734887Schin env->token.esc = env->token.len; 9744887Schin env->token.len += MBSIZE(env->cursor + 1); 9754887Schin if (env->delimiter && c == 'n') 9764887Schin return '\n'; 9774887Schin else if (c == env->delimiter) 9784887Schin return magic(env, c, 0); 9794887Schin else if (c == '(' && env->type == BRE) 9804887Schin env->posixkludge = 1; 9814887Schin else if (c == ')' && env->type == BRE && env->parnest <= 0) 9824887Schin { 9834887Schin env->error = REG_EPAREN; 9844887Schin return T_BAD; 9854887Schin } 9864887Schin else if (isspace(c) && (env->flags & REG_COMMENT)) 9874887Schin return c; 9884887Schin return magic(env, c, 1); 9894887Schin } 9904887Schin else if (c == '$') 9914887Schin { 9924887Schin if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 9934887Schin return T_DOLL; 9944887Schin } 9954887Schin else if (c == '^') 9964887Schin { 9974887Schin if (env->type == BRE && (env->cursor == env->pattern || posixkludge)) 9984887Schin { 9994887Schin env->posixkludge = 1; 10004887Schin return T_CFLX; 10014887Schin } 10024887Schin } 10034887Schin else if (c == ')') 10044887Schin { 10054887Schin if (env->type != BRE && env->parnest <= 0) 10064887Schin return c; 10074887Schin } 10084887Schin else if (c == '/' && env->explicit == env->mappedslash) 10094887Schin { 10104887Schin while (*(env->cursor + env->token.len) == c) 10114887Schin env->token.len++; 10124887Schin return T_SLASHPLUS; 10134887Schin } 10144887Schin return magic(env, c, 0); 10154887Schin } 10164887Schin 10174887Schin static Celt_t* 10184887Schin col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 10194887Schin { 1020*8462SApril.Chin@Sun.COM register char* s; 10214887Schin register unsigned char* k; 10224887Schin register unsigned char* e; 10234887Schin register int c; 10244887Schin register int cc; 10254887Schin int bt; 10264887Schin int et; 10274887Schin Ckey_t key; 10284887Schin 10294887Schin cc = 0; 10304887Schin for (;;) 10314887Schin { 10324887Schin k = key; 10334887Schin if (bw == 1) 10344887Schin { 10354887Schin c = bc; 10364887Schin if (ic) 10374887Schin { 1038*8462SApril.Chin@Sun.COM if (isupper(c)) 1039*8462SApril.Chin@Sun.COM { 1040*8462SApril.Chin@Sun.COM c = tolower(c); 1041*8462SApril.Chin@Sun.COM cc = -1; 1042*8462SApril.Chin@Sun.COM } 1043*8462SApril.Chin@Sun.COM else if (islower(c)) 1044*8462SApril.Chin@Sun.COM { 1045*8462SApril.Chin@Sun.COM c = toupper(c); 1046*8462SApril.Chin@Sun.COM cc = -1; 1047*8462SApril.Chin@Sun.COM } 1048*8462SApril.Chin@Sun.COM } 1049*8462SApril.Chin@Sun.COM *k++ = c; 1050*8462SApril.Chin@Sun.COM } 1051*8462SApril.Chin@Sun.COM else if (bw < COLL_KEY_MAX) 1052*8462SApril.Chin@Sun.COM { 1053*8462SApril.Chin@Sun.COM s = (char*)bp; 1054*8462SApril.Chin@Sun.COM if (ic) 1055*8462SApril.Chin@Sun.COM { 1056*8462SApril.Chin@Sun.COM c = mbchar(s); 10574887Schin if (iswupper(c)) 10584887Schin { 10594887Schin c = towlower(c); 10604887Schin cc = 1; 10614887Schin } 10624887Schin else if (iswlower(c)) 10634887Schin { 10644887Schin c = towupper(c); 10654887Schin cc = 1; 10664887Schin } 10674887Schin } 1068*8462SApril.Chin@Sun.COM if (cc > 0) 10694887Schin { 1070*8462SApril.Chin@Sun.COM cc = -1; 1071*8462SApril.Chin@Sun.COM k += wctomb((char*)k, c); 10724887Schin } 1073*8462SApril.Chin@Sun.COM else 1074*8462SApril.Chin@Sun.COM for (e = k + bw; k < e; *k++ = *s++); 10754887Schin } 10764887Schin *k = 0; 10774887Schin mbxfrm(ce->beg, key, COLL_KEY_MAX); 10784887Schin if (ep) 10794887Schin { 10804887Schin k = key; 10814887Schin c = mbchar(k); 10824887Schin if (iswupper(c)) 10834887Schin bt = COLL_range_uc; 10844887Schin else if (iswlower(c)) 10854887Schin bt = COLL_range_lc; 10864887Schin else 10874887Schin bt = COLL_range; 10884887Schin k = key; 10894887Schin if (ew == 1) 10904887Schin { 10914887Schin c = ec; 10924887Schin if (ic) 10934887Schin { 1094*8462SApril.Chin@Sun.COM if (isupper(c)) 1095*8462SApril.Chin@Sun.COM { 1096*8462SApril.Chin@Sun.COM c = tolower(c); 1097*8462SApril.Chin@Sun.COM cc = -1; 1098*8462SApril.Chin@Sun.COM } 1099*8462SApril.Chin@Sun.COM else if (islower(c)) 1100*8462SApril.Chin@Sun.COM { 1101*8462SApril.Chin@Sun.COM c = toupper(c); 1102*8462SApril.Chin@Sun.COM cc = -1; 1103*8462SApril.Chin@Sun.COM } 1104*8462SApril.Chin@Sun.COM } 1105*8462SApril.Chin@Sun.COM *k++ = c; 1106*8462SApril.Chin@Sun.COM } 1107*8462SApril.Chin@Sun.COM else if (ew < COLL_KEY_MAX) 1108*8462SApril.Chin@Sun.COM { 1109*8462SApril.Chin@Sun.COM s = (char*)ep; 1110*8462SApril.Chin@Sun.COM if (ic) 1111*8462SApril.Chin@Sun.COM { 1112*8462SApril.Chin@Sun.COM c = mbchar(s); 11134887Schin if (iswupper(c)) 11144887Schin { 11154887Schin c = towlower(c); 11164887Schin cc = 1; 11174887Schin } 11184887Schin else if (iswlower(c)) 11194887Schin { 11204887Schin c = towupper(c); 11214887Schin cc = 1; 11224887Schin } 11234887Schin } 1124*8462SApril.Chin@Sun.COM if (cc > 0) 11254887Schin { 1126*8462SApril.Chin@Sun.COM cc = -1; 1127*8462SApril.Chin@Sun.COM k += wctomb((char*)k, c); 11284887Schin } 1129*8462SApril.Chin@Sun.COM else 1130*8462SApril.Chin@Sun.COM for (e = k + ew; k < e; *k++ = *s++); 11314887Schin } 11324887Schin *k = 0; 11334887Schin mbxfrm(ce->end, key, COLL_KEY_MAX); 11344887Schin k = key; 11354887Schin c = mbchar(k); 11364887Schin if (iswupper(c)) 11374887Schin et = COLL_range_uc; 11384887Schin else if (iswlower(c)) 11394887Schin et = COLL_range_lc; 11404887Schin else 11414887Schin et = COLL_range; 11424887Schin ce->typ = bt == et ? bt : COLL_range; 11434887Schin } 11444887Schin else 11454887Schin ce->typ = COLL_char; 11464887Schin ce++; 11474887Schin if (!ic || !cc) 11484887Schin break; 11494887Schin ic = 0; 11504887Schin } 11514887Schin return ce; 11524887Schin } 11534887Schin 11544887Schin static Rex_t* 11554887Schin bra(Cenv_t* env) 11564887Schin { 11574887Schin Rex_t* e; 11584887Schin int c; 11594887Schin int i; 11604887Schin int w; 11614887Schin int neg; 11624887Schin int last; 11634887Schin int inrange; 11644887Schin int complicated; 11654887Schin int collate; 11664887Schin int elements; 11674887Schin unsigned char* first; 11684887Schin unsigned char* start; 11694887Schin unsigned char* begin; 11704887Schin unsigned char* s; 11714887Schin regclass_t f; 11724887Schin unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 11734887Schin #if _PACKAGE_ast 11744887Schin int ic; 11754887Schin char mbc[COLL_KEY_MAX + 1]; 11764887Schin #endif 11774887Schin 11784887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 11794887Schin return 0; 11804887Schin collate = complicated = elements = 0; 11814887Schin if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 11824887Schin { 11834887Schin env->cursor++; 11844887Schin neg = 1; 11854887Schin } 11864887Schin else 11874887Schin neg = 0; 1188*8462SApril.Chin@Sun.COM first = env->cursor; 1189*8462SApril.Chin@Sun.COM start = first + MBSIZE(first); 11904887Schin if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 11914887Schin goto error; 11924887Schin begin = env->cursor + MBSIZE(env->cursor); 11934887Schin 11944887Schin /* 11954887Schin * inrange: 0=no, 1=possibly, 2=definitely 11964887Schin */ 11974887Schin 11984887Schin inrange = 0; 11994887Schin for (;;) 12004887Schin { 12014887Schin if (!(c = *env->cursor) || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 12024887Schin goto error; 12034887Schin env->cursor += (w = MBSIZE(env->cursor)); 12044887Schin if (c == '\\') 12054887Schin { 12064887Schin if (*env->cursor) 12074887Schin { 12084887Schin if (*env->cursor == 'n') 12094887Schin { 12104887Schin env->cursor++; 12114887Schin c = '\n'; 12124887Schin } 12134887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 12144887Schin { 12154887Schin env->token.len = 1; 12164887Schin w = magic(env, *env->cursor, 2); 12174887Schin if (env->token.len > 1 || w != T_BAD) 12184887Schin { 12194887Schin if (env->token.len == 1 && (f = classfun(w))) 12204887Schin { 12214887Schin if (inrange > 1) 12224887Schin { 12234887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 12244887Schin goto erange; 12254887Schin inrange = 0; 12264887Schin } 12274887Schin env->cursor++; 12284887Schin for (c = 0; c <= UCHAR_MAX; c++) 12294887Schin if ((*f)(c)) 12304887Schin setadd(e->re.charclass, c); 12314887Schin complicated++; 12324887Schin elements++; 12334887Schin continue; 12344887Schin } 12354887Schin if (env->token.len > 1 || w >= 0 && w < T_META) 12364887Schin { 12374887Schin c = w; 12384887Schin if (c > UCHAR_MAX) 12394887Schin { 12404887Schin if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) 12414887Schin goto erange; 12424887Schin c = UCHAR_MAX; 12434887Schin } 12444887Schin env->cursor += env->token.len; 12454887Schin } 12464887Schin } 12474887Schin } 12484887Schin } 12494887Schin } 12504887Schin else if (c == ']') 12514887Schin { 12524887Schin if (env->cursor == begin) 12534887Schin { 12544887Schin last = c; 12554887Schin inrange = 1; 12564887Schin continue; 12574887Schin } 12584887Schin if (inrange != 0) 12594887Schin { 12604887Schin setadd(e->re.charclass, last); 12614887Schin elements++; 12624887Schin if (inrange == 2) 12634887Schin { 12644887Schin setadd(e->re.charclass, '-'); 12654887Schin elements++; 12664887Schin } 12674887Schin } 12684887Schin break; 12694887Schin } 12704887Schin else if (c == '-') 12714887Schin { 12724887Schin if (!inrange && env->cursor != begin && *env->cursor != ']') 12734887Schin { 12744887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 12754887Schin goto erange; 12764887Schin continue; 12774887Schin } 12784887Schin else if (inrange == 1) 12794887Schin { 12804887Schin inrange = 2; 12814887Schin complicated++; 12824887Schin continue; 12834887Schin } 12844887Schin } 12854887Schin else if (c == '[') 12864887Schin { 12874887Schin switch (*env->cursor) 12884887Schin { 12894887Schin case 0: 12904887Schin goto error; 12914887Schin case ':': 12924887Schin if (inrange == 1) 12934887Schin { 12944887Schin setadd(e->re.charclass, last); 12954887Schin elements++; 12964887Schin } 12974887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 12984887Schin { 12994887Schin if (env->cursor == start && (c = *(env->cursor + 1))) 13004887Schin { 13014887Schin s = start = env->cursor + 1; 13024887Schin while (*++s && *s != ':'); 13034887Schin if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 13044887Schin { 13054887Schin if ((i = (s - start)) == 1) 13064887Schin { 13074887Schin switch (c) 13084887Schin { 13094887Schin case '<': 13104887Schin i = REX_WBEG; 13114887Schin break; 13124887Schin case '>': 13134887Schin i = REX_WEND; 13144887Schin break; 13154887Schin default: 13164887Schin i = 0; 13174887Schin break; 13184887Schin } 13194887Schin if (i) 13204887Schin { 13214887Schin env->cursor = s + 3; 13224887Schin drop(env->disc, e); 13234887Schin return node(env, i, 0, 0, 0); 13244887Schin } 13254887Schin } 13264887Schin } 13274887Schin } 13284887Schin env->error = REG_ECTYPE; 13294887Schin goto error; 13304887Schin } 13314887Schin for (c = 0; c <= UCHAR_MAX; c++) 13324887Schin if ((*f)(c)) 13334887Schin setadd(e->re.charclass, c); 13344887Schin inrange = 0; 13354887Schin complicated++; 13364887Schin elements++; 13374887Schin continue; 13384887Schin case '=': 13394887Schin if (inrange == 2) 13404887Schin goto erange; 13414887Schin if (inrange == 1) 13424887Schin { 13434887Schin setadd(e->re.charclass, last); 13444887Schin elements++; 13454887Schin } 13464887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 13474887Schin goto ecollate; 13484887Schin if (c > 1) 13494887Schin collate++; 13504887Schin else 13514887Schin setadd(e->re.charclass, buf[0]); 13524887Schin c = buf[0]; 13534887Schin inrange = 0; 13544887Schin complicated++; 13554887Schin elements++; 13564887Schin continue; 13574887Schin case '.': 13584887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 13594887Schin goto ecollate; 13604887Schin if (c > 1) 13614887Schin collate++; 13624887Schin c = buf[0]; 13634887Schin complicated++; 13644887Schin break; 13654887Schin default: 13664887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 13674887Schin goto error; 13684887Schin break; 13694887Schin } 13704887Schin } 13714887Schin else if (w > 1) 13724887Schin complicated++; 13734887Schin if (inrange == 2) 13744887Schin { 13754887Schin if (last > c) 13764887Schin { 13774887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 13784887Schin goto erange; 13794887Schin setadd(e->re.charclass, last); 13804887Schin setadd(e->re.charclass, c); 13814887Schin } 13824887Schin else 13834887Schin for (i = last; i <= c; i++) 13844887Schin setadd(e->re.charclass, i); 13854887Schin inrange = env->type >= SRE || (env->flags & REG_LENIENT); 13864887Schin elements += 2; 13874887Schin } 13884887Schin else if (inrange == 1) 13894887Schin { 13904887Schin setadd(e->re.charclass, last); 13914887Schin elements++; 13924887Schin } 13934887Schin else 13944887Schin inrange = 1; 13954887Schin last = c; 13964887Schin } 13974887Schin #if _PACKAGE_ast 13984887Schin if (complicated && mbcoll()) 13994887Schin { 14004887Schin Dt_t* dt; 14014887Schin Cchr_t* cc; 14024887Schin Cchr_t* tc; 14034887Schin Cchr_t* xc; 14044887Schin Celt_t* ce; 14054887Schin Cchr_t key; 14064887Schin int rw; 14074887Schin int rc; 1408*8462SApril.Chin@Sun.COM int wc; 14094887Schin unsigned char* rp; 14104887Schin unsigned char* pp; 1411*8462SApril.Chin@Sun.COM char* wp; 1412*8462SApril.Chin@Sun.COM char cb[2][COLL_KEY_MAX+1]; 14134887Schin 14144887Schin static Dtdisc_t disc; 14154887Schin 14164887Schin static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 14174887Schin 14184887Schin if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 14194887Schin { 14204887Schin disc.key = offsetof(Cchr_t, key); 14214887Schin if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) 14224887Schin { 14234887Schin for (i = 0; i < elementsof(primary) - 1; i++, cc++) 14244887Schin { 14254887Schin cc->nam[0] = primary[i]; 14264887Schin mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 14274887Schin dtinsert(dt, cc); 14284887Schin } 14294887Schin for (i = 0; i < elementsof(cc->key); i++) 14304887Schin cc->key[i] = ~0; 14314887Schin dtinsert(dt, cc); 14324887Schin LCINFO(AST_LC_COLLATE)->data = (void*)dt; 14334887Schin } 14344887Schin else 14354887Schin { 14364887Schin if (cc) 14374887Schin free(cc); 14384887Schin drop(env->disc, e); 14394887Schin return 0; 14404887Schin } 14414887Schin } 14424887Schin if (dt) 14434887Schin { 14444887Schin drop(env->disc, e); 14454887Schin if (ic = env->flags & REG_ICASE) 14464887Schin elements *= 2; 1447*8462SApril.Chin@Sun.COM if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t)))) 14484887Schin return 0; 14494887Schin ce = (Celt_t*)e->re.data; 14504887Schin e->re.collate.invert = neg; 14514887Schin e->re.collate.elements = ce; 14524887Schin env->cursor = first; 14534887Schin inrange = 0; 14544887Schin for (;;) 14554887Schin { 14564887Schin if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 14574887Schin goto error; 14584887Schin pp = env->cursor; 14594887Schin env->cursor += (w = MBSIZE(env->cursor)); 14604887Schin if (c == '\\') 14614887Schin { 14624887Schin if (*env->cursor) 14634887Schin { 14644887Schin if (*env->cursor == 'n') 14654887Schin { 14664887Schin pp = env->cursor++; 14674887Schin c = '\n'; 14684887Schin } 14694887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 14704887Schin { 14714887Schin env->token.len = 1; 14724887Schin w = magic(env, *env->cursor, 2); 14734887Schin if (env->token.len > 1 || w != T_BAD) 14744887Schin { 14754887Schin if (env->token.len == 1 && (f = classfun(w))) 14764887Schin { 14774887Schin if (inrange > 1) 14784887Schin { 14794887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 14804887Schin goto erange; 14814887Schin inrange = 0; 14824887Schin } 14834887Schin env->cursor++; 14844887Schin ce->fun = f; 14854887Schin ce->typ = COLL_call; 14864887Schin ce++; 14874887Schin continue; 14884887Schin } 14894887Schin if (env->token.len > 1 || w >= 0 && w < T_META) 14904887Schin { 14914887Schin c = w; 14924887Schin w = wctomb(mbc, c); 14934887Schin pp = (unsigned char*)mbc; 14944887Schin env->cursor += env->token.len; 14954887Schin } 14964887Schin } 14974887Schin } 14984887Schin } 14994887Schin } 15004887Schin else if (c == ']') 15014887Schin { 15024887Schin if (env->cursor == begin) 15034887Schin { 15044887Schin rp = pp; 15054887Schin rw = w; 15064887Schin inrange = 1; 15074887Schin continue; 15084887Schin } 15094887Schin if (inrange != 0) 15104887Schin { 15114887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15124887Schin if (inrange == 2) 15134887Schin ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 15144887Schin } 15154887Schin break; 15164887Schin } 15174887Schin else if (c == '-') 15184887Schin { 15194887Schin if (!inrange && env->cursor != begin && *env->cursor != ']') 15204887Schin { 15214887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 15224887Schin goto erange; 15234887Schin continue; 15244887Schin } 15254887Schin else if (inrange == 1) 15264887Schin { 15274887Schin inrange = 2; 15284887Schin continue; 15294887Schin } 15304887Schin } 15314887Schin else if (c == '[') 15324887Schin { 15334887Schin switch (*env->cursor) 15344887Schin { 15354887Schin case 0: 15364887Schin goto error; 15374887Schin case ':': 15384887Schin if (inrange == 1) 15394887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15404887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 15414887Schin { 15424887Schin if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 15434887Schin { 15444887Schin switch (c) 15454887Schin { 15464887Schin case '<': 15474887Schin i = REX_WBEG; 15484887Schin break; 15494887Schin case '>': 15504887Schin i = REX_WEND; 15514887Schin break; 15524887Schin default: 15534887Schin i = 0; 15544887Schin break; 15554887Schin } 15564887Schin if (i) 15574887Schin { 15584887Schin env->cursor += 5; 15594887Schin drop(env->disc, e); 15604887Schin return node(env, i, 0, 0, 0); 15614887Schin } 15624887Schin } 15634887Schin env->error = REG_ECTYPE; 15644887Schin goto error; 15654887Schin } 15664887Schin ce->fun = f; 15674887Schin ce->typ = COLL_call; 15684887Schin ce++; 15694887Schin inrange = 0; 15704887Schin continue; 15714887Schin case '=': 15724887Schin if (inrange == 2) 15734887Schin goto erange; 15744887Schin if (inrange == 1) 15754887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1576*8462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange]; 15774887Schin rp = env->cursor + 1; 1578*8462SApril.Chin@Sun.COM if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 15794887Schin goto ecollate; 1580*8462SApril.Chin@Sun.COM wp = (char*)pp; 1581*8462SApril.Chin@Sun.COM wc = mbchar(wp); 15824887Schin c = 0; 15834887Schin if (ic) 1584*8462SApril.Chin@Sun.COM { 1585*8462SApril.Chin@Sun.COM if (iswupper(wc)) 1586*8462SApril.Chin@Sun.COM { 1587*8462SApril.Chin@Sun.COM wc = towlower(wc); 1588*8462SApril.Chin@Sun.COM rw = wctomb((char*)pp, wc); 1589*8462SApril.Chin@Sun.COM c = 'u'; 1590*8462SApril.Chin@Sun.COM } 1591*8462SApril.Chin@Sun.COM else if (iswlower(wc)) 1592*8462SApril.Chin@Sun.COM c = 'l'; 1593*8462SApril.Chin@Sun.COM } 15944887Schin for (;;) 15954887Schin { 1596*8462SApril.Chin@Sun.COM mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); 1597*8462SApril.Chin@Sun.COM if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) 1598*8462SApril.Chin@Sun.COM goto ecollate; 1599*8462SApril.Chin@Sun.COM xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; 1600*8462SApril.Chin@Sun.COM if (c == 'l' || c == 'L' && !(c = 0)) 16014887Schin ce->typ = COLL_range_lc; 1602*8462SApril.Chin@Sun.COM else if (c == 'u' || c == 'U' && !(c = 0)) 16034887Schin ce->typ = COLL_range_uc; 16044887Schin else 16054887Schin ce->typ = COLL_range; 16064887Schin strcpy((char*)ce->beg, (char*)xc->key); 16074887Schin if (!(cc = (Cchr_t*)dtnext(dt, cc))) 16084887Schin goto ecollate; 16094887Schin if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 16104887Schin cc = tc; 16114887Schin strcpy((char*)ce->end, (char*)cc->key); 16124887Schin ce->max = -1; 16134887Schin ce++; 16144887Schin if (!c) 16154887Schin break; 1616*8462SApril.Chin@Sun.COM if (c == 'u') 1617*8462SApril.Chin@Sun.COM { 1618*8462SApril.Chin@Sun.COM wc = towlower(wc); 1619*8462SApril.Chin@Sun.COM c = 'L'; 1620*8462SApril.Chin@Sun.COM } 1621*8462SApril.Chin@Sun.COM else 1622*8462SApril.Chin@Sun.COM { 1623*8462SApril.Chin@Sun.COM wc = towupper(wc); 1624*8462SApril.Chin@Sun.COM c = 'U'; 1625*8462SApril.Chin@Sun.COM } 1626*8462SApril.Chin@Sun.COM rw = wctomb((char*)pp, wc); 16274887Schin } 16284887Schin inrange = 0; 1629*8462SApril.Chin@Sun.COM c = *pp; 16304887Schin continue; 16314887Schin case '.': 1632*8462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange]; 1633*8462SApril.Chin@Sun.COM if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 16344887Schin goto ecollate; 16354887Schin c = buf[0]; 16364887Schin break; 16374887Schin default: 16384887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 16394887Schin goto error; 16404887Schin break; 16414887Schin } 16424887Schin } 16434887Schin if (inrange == 2) 16444887Schin { 16454887Schin ce = col(ce, ic, rp, rw, rc, pp, w, c); 16464887Schin if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 16474887Schin { 16484887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 16494887Schin goto erange; 16504887Schin (ce-1)->typ = COLL_char; 16514887Schin strcpy((char*)ce->beg, (char*)(ce-1)->end); 16524887Schin ce->typ = COLL_char; 16534887Schin ce++; 16544887Schin } 16554887Schin inrange = env->type >= SRE || (env->flags & REG_LENIENT); 16564887Schin } 16574887Schin else if (inrange == 1) 16584887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 16594887Schin else 16604887Schin inrange = 1; 16614887Schin rp = pp; 16624887Schin rw = w; 16634887Schin rc = c; 16644887Schin } 16654887Schin ce->typ = COLL_end; 16664887Schin return e; 16674887Schin } 16684887Schin } 16694887Schin #endif 16704887Schin if (collate) 16714887Schin goto ecollate; 16724887Schin if (env->flags & REG_ICASE) 16734887Schin for (i = 0; i <= UCHAR_MAX; i++) 16744887Schin if (settst(e->re.charclass, i)) 16754887Schin { 16764887Schin if (isupper(i)) 16774887Schin c = tolower(i); 16784887Schin else if (islower(i)) 16794887Schin c = toupper(i); 16804887Schin else 16814887Schin continue; 16824887Schin setadd(e->re.charclass, c); 16834887Schin } 16844887Schin if (neg) 16854887Schin { 16864887Schin for (i = 0; i < elementsof(e->re.charclass->bits); i++) 16874887Schin e->re.charclass->bits[i] ^= ~0; 16884887Schin if (env->explicit >= 0) 16894887Schin setclr(e->re.charclass, env->explicit); 16904887Schin } 16914887Schin return e; 16924887Schin ecollate: 16934887Schin env->error = REG_ECOLLATE; 16944887Schin goto error; 16954887Schin erange: 16964887Schin env->error = REG_ERANGE; 16974887Schin error: 16984887Schin drop(env->disc, e); 16994887Schin if (!env->error) 17004887Schin env->error = REG_EBRACK; 17014887Schin return 0; 17024887Schin } 17034887Schin 17044887Schin static Rex_t* 17054887Schin ccl(Cenv_t* env, int type) 17064887Schin { 17074887Schin int i; 17084887Schin Rex_t* e; 17094887Schin Celt_t* ce; 17104887Schin regclass_t f; 17114887Schin 17124887Schin if (!(f = classfun(type))) 17134887Schin { 17144887Schin env->error = REG_BADESC; 17154887Schin return 0; 17164887Schin } 17174887Schin if (!mbcoll()) 17184887Schin { 17194887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 17204887Schin return 0; 17214887Schin for (i = 0; i <= UCHAR_MAX; i++) 17224887Schin if ((*f)(i)) 17234887Schin setadd(e->re.charclass, i); 17244887Schin if (env->explicit >= 0) 17254887Schin setclr(e->re.charclass, env->explicit); 17264887Schin } 17274887Schin else 17284887Schin { 17294887Schin if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 17304887Schin return 0; 17314887Schin ce = (Celt_t*)e->re.data; 17324887Schin e->re.collate.invert = 0; 17334887Schin e->re.collate.elements = ce; 1734*8462SApril.Chin@Sun.COM ce->fun = f; 17354887Schin ce->typ = COLL_call; 17364887Schin ce++; 17374887Schin ce->typ = COLL_end; 17384887Schin } 17394887Schin return e; 17404887Schin } 17414887Schin 17424887Schin static Rex_t* 17434887Schin rep(Cenv_t* env, Rex_t* e, int number, int last) 17444887Schin { 17454887Schin Rex_t* f; 17464887Schin unsigned long m = 0; 17474887Schin unsigned long n = RE_DUP_INF; 17484887Schin int minimal = -1; 17494887Schin 17504887Schin if (!e) 17514887Schin return 0; 17524887Schin switch (token(env)) 17534887Schin { 17544887Schin case T_BANG: 17554887Schin eat(env); 17564887Schin if (!(f = node(env, REX_NEG, m, n, 0))) 17574887Schin { 17584887Schin drop(env->disc, e); 17594887Schin return 0; 17604887Schin } 17614887Schin f->re.group.expr.rex = e; 17624887Schin return f; 17634887Schin case T_QUES: 17644887Schin eat(env); 17654887Schin n = 1; 17664887Schin break; 17674887Schin case T_STAR: 17684887Schin eat(env); 17694887Schin break; 17704887Schin case T_PLUS: 17714887Schin eat(env); 17724887Schin m = 1; 17734887Schin break; 17744887Schin case T_LEFT: 17754887Schin eat(env); 17764887Schin m = env->token.min; 17774887Schin n = env->token.max; 17784887Schin break; 17794887Schin default: 17804887Schin return e; 17814887Schin } 17824887Schin if (env->token.att) 17834887Schin minimal = 1; 17844887Schin else if (env->type < SRE) 17854887Schin switch (token(env)) 17864887Schin { 17874887Schin case T_QUES: 17884887Schin eat(env); 17894887Schin minimal = !(env->flags & REG_MINIMAL); 17904887Schin break; 17914887Schin case T_STAR: /*AST*/ 17924887Schin eat(env); 17934887Schin minimal = !!(env->flags & REG_MINIMAL); 17944887Schin break; 17954887Schin } 17964887Schin switch (e->type) 17974887Schin { 17984887Schin case REX_DOT: 17994887Schin case REX_CLASS: 18004887Schin case REX_COLL_CLASS: 18014887Schin case REX_ONECHAR: 18024887Schin e->lo = m; 18034887Schin e->hi = n; 18044887Schin if (minimal >= 0) 18054887Schin mark(e, minimal); 18064887Schin return e; 18074887Schin #if HUH_2002_08_07 18084887Schin case REX_BEG: 18094887Schin #endif 18104887Schin case REX_BEG_STR: 18114887Schin case REX_END_STR: 18124887Schin case REX_FIN_STR: 18134887Schin case REX_WBEG: 18144887Schin case REX_WEND: 18154887Schin case REX_WORD: 18164887Schin case REX_WORD_NOT: 18174887Schin env->error = REG_BADRPT; 18184887Schin drop(env->disc, e); 18194887Schin return 0; 18204887Schin } 18214887Schin if (m == 1 && n == 1) 18224887Schin { 18234887Schin if (minimal >= 0) 18244887Schin mark(e, minimal); 18254887Schin return e; 18264887Schin } 18274887Schin if (!(f = node(env, REX_REP, m, n, 0))) 18284887Schin { 18294887Schin drop(env->disc, e); 18304887Schin return 0; 18314887Schin } 18324887Schin f->re.group.expr.rex = e; 18334887Schin f->re.group.number = number; 18344887Schin f->re.group.last = last; 18354887Schin if (minimal >= 0) 18364887Schin mark(f, minimal); 18374887Schin if (m <= n && n) 18384887Schin { 18394887Schin for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 18404887Schin if (e && e->type == REX_NEG) 18414887Schin f->type = REX_GROUP; 18424887Schin } 18434887Schin return f; 18444887Schin } 18454887Schin 18464887Schin static int 18474887Schin isstring(Rex_t* e) 18484887Schin { 18494887Schin switch (e->type) 18504887Schin { 18514887Schin case REX_ONECHAR: 18524887Schin return e->lo == 1 && e->hi == 1; 18534887Schin case REX_STRING: 18544887Schin return 1; 18554887Schin } 18564887Schin return 0; 18574887Schin } 18584887Schin 18594887Schin static Trie_node_t* 18604887Schin trienode(Cenv_t* env, int c) 18614887Schin { 18624887Schin Trie_node_t* t; 18634887Schin 18644887Schin if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 18654887Schin { 18664887Schin memset(t, 0, sizeof(Trie_node_t)); 18674887Schin t->c = c; 18684887Schin } 18694887Schin return t; 18704887Schin } 18714887Schin 18724887Schin static int 18734887Schin insert(Cenv_t* env, Rex_t* f, Rex_t* g) 18744887Schin { 18754887Schin unsigned char* s; 18764887Schin unsigned char* e; 18774887Schin Trie_node_t* t; 18784887Schin int len; 18794887Schin unsigned char tmp[2]; 18804887Schin 18814887Schin switch (f->type) 18824887Schin { 18834887Schin case REX_ONECHAR: 18844887Schin *(s = tmp) = f->re.onechar; 18854887Schin e = s + 1; 18864887Schin break; 18874887Schin case REX_STRING: 18884887Schin s = f->re.string.base; 18894887Schin e = s + f->re.string.size; 18904887Schin break; 18914887Schin default: 18924887Schin return 1; 18934887Schin } 18944887Schin if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 18954887Schin return 1; 18964887Schin for (len = 1;;) 18974887Schin { 18984887Schin if (t->c == *s) 18994887Schin { 19004887Schin if (++s >= e) 19014887Schin break; 19024887Schin if (!t->son && !(t->son = trienode(env, *s))) 19034887Schin return 1; 19044887Schin t = t->son; 19054887Schin len++; 19064887Schin } 19074887Schin else 19084887Schin { 19094887Schin if (!t->sib && !(t->sib = trienode(env, *s))) 19104887Schin return 1; 19114887Schin t = t->sib; 19124887Schin } 19134887Schin } 19144887Schin if (g->re.trie.min > len) 19154887Schin g->re.trie.min = len; 19164887Schin if (g->re.trie.max < len) 19174887Schin g->re.trie.max = len; 19184887Schin t->end = 1; 19194887Schin return 0; 19204887Schin } 19214887Schin 19224887Schin /* 19234887Schin * trie() tries to combine nontrivial e and f into a REX_TRIE 19244887Schin * unless 0 is returned, e and f are deleted as far as possible 19254887Schin * also called by regcomb 19264887Schin */ 19274887Schin 19284887Schin static Rex_t* 19294887Schin trie(Cenv_t* env, Rex_t* e, Rex_t* f) 19304887Schin { 19314887Schin Rex_t* g; 19324887Schin 19334887Schin if (e->next || f->next || !isstring(e) || e->flags != f->flags) 19344887Schin return 0; 19354887Schin if (isstring(f)) 19364887Schin { 19374887Schin if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 19384887Schin return 0; 19394887Schin g->re.trie.min = INT_MAX; 19404887Schin if (insert(env, f, g)) 19414887Schin goto nospace; 19424887Schin drop(env->disc, f); 19434887Schin } 19444887Schin else if (f->type != REX_TRIE) 19454887Schin return 0; 19464887Schin else 19474887Schin g = f; 19484887Schin if (insert(env, e, g)) 19494887Schin goto nospace; 19504887Schin drop(env->disc, e); 19514887Schin return g; 19524887Schin nospace: 19534887Schin if (g != f) 19544887Schin drop(env->disc, g); 19554887Schin return 0; 19564887Schin } 19574887Schin 19584887Schin static Rex_t* alt(Cenv_t*, int, int); 19594887Schin 19604887Schin static int 19614887Schin chr(register Cenv_t* env, int* escaped) 19624887Schin { 19634887Schin unsigned char* p; 19644887Schin int c; 19654887Schin 19664887Schin *escaped = 0; 19674887Schin if (!(c = *env->cursor)) 19684887Schin return -1; 19694887Schin env->cursor++; 19704887Schin if (c == '\\') 19714887Schin { 19724887Schin if (env->flags & REG_SHELL_ESCAPED) 19734887Schin return c; 19744887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator) 19754887Schin { 19764887Schin if (env->flags & REG_LENIENT) 19774887Schin return c ? c : '\\'; 19784887Schin env->error = REG_EESCAPE; 19794887Schin return -1; 19804887Schin } 19814887Schin p = env->cursor; 19824887Schin c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 19834887Schin *escaped = env->cursor - p; 19844887Schin } 19854887Schin return c; 19864887Schin } 19874887Schin 19884887Schin /* 19894887Schin * open the perly gates 19904887Schin */ 19914887Schin 19924887Schin static Rex_t* 19934887Schin grp(Cenv_t* env, int parno) 19944887Schin { 19954887Schin Rex_t* e; 19964887Schin Rex_t* f; 19974887Schin int c; 19984887Schin int i; 19994887Schin int n; 20004887Schin int x; 20014887Schin int esc; 20024887Schin int typ; 20034887Schin int beg; 20044887Schin unsigned char* p; 20054887Schin 20064887Schin beg = env->pattern == env->cursor - env->token.len; 20074887Schin if (!(c = env->token.lex) && (c = *env->cursor)) 20084887Schin env->cursor++; 20094887Schin env->token.len = 0; 20104887Schin env->parnest++; 20114887Schin typ = -1; 20124887Schin switch (c) 20134887Schin { 20144887Schin case '-': 20154887Schin case '+': 20164887Schin case 'a': 20174887Schin case 'g': 20184887Schin case 'i': 20194887Schin case 'l': 20204887Schin case 'm': 20214887Schin case 'p': 20224887Schin case 'r': 20234887Schin case 's': 20244887Schin case 'x': 20254887Schin case 'A': 20264887Schin case 'B': 20274887Schin case 'E': 20284887Schin case 'F': 20294887Schin case 'G': 20304887Schin case 'I': 20314887Schin case 'K': 2032*8462SApril.Chin@Sun.COM case 'L': 2033*8462SApril.Chin@Sun.COM case 'M': /* glob(3) */ 2034*8462SApril.Chin@Sun.COM case 'N': /* glob(3) */ 2035*8462SApril.Chin@Sun.COM case 'O': /* glob(3) */ 2036*8462SApril.Chin@Sun.COM case 'R': /* pcre */ 20374887Schin case 'S': 20384887Schin case 'U': /* pcre */ 20394887Schin case 'X': /* pcre */ 20404887Schin x = REX_GROUP; 20414887Schin i = 1; 20424887Schin env->token.push = 1; 20434887Schin for (;;) 20444887Schin { 20454887Schin switch (c) 20464887Schin { 2047*8462SApril.Chin@Sun.COM case ')': 2048*8462SApril.Chin@Sun.COM if (!(env->flags & REG_LITERAL)) 2049*8462SApril.Chin@Sun.COM { 2050*8462SApril.Chin@Sun.COM env->error = REG_BADRPT; 2051*8462SApril.Chin@Sun.COM return 0; 2052*8462SApril.Chin@Sun.COM } 2053*8462SApril.Chin@Sun.COM /*FALLTHROUGH*/ 20544887Schin case 0: 20554887Schin case T_CLOSE: 20564887Schin x = 0; 20574887Schin goto done; 20584887Schin case ':': 20594887Schin eat(env); 20604887Schin if (token(env) == T_CLOSE) 20614887Schin x = 0; 20624887Schin goto done; 20634887Schin case '-': 20644887Schin i = 0; 20654887Schin break; 20664887Schin case '+': 20674887Schin i = 1; 20684887Schin break; 20694887Schin case 'a': 20704887Schin if (i) 20714887Schin env->flags |= (REG_LEFT|REG_RIGHT); 20724887Schin else 20734887Schin env->flags &= ~(REG_LEFT|REG_RIGHT); 20744887Schin break; 20754887Schin case 'g': 20764887Schin if (i) 20774887Schin env->flags &= ~REG_MINIMAL; 20784887Schin else 20794887Schin env->flags |= REG_MINIMAL; 20804887Schin break; 20814887Schin case 'i': 20824887Schin if (i) 20834887Schin env->flags |= REG_ICASE; 20844887Schin else 20854887Schin env->flags &= ~REG_ICASE; 20864887Schin break; 20874887Schin case 'l': 20884887Schin if (i) 20894887Schin env->flags |= REG_LEFT; 20904887Schin else 20914887Schin env->flags &= ~REG_LEFT; 20924887Schin break; 20934887Schin case 'm': 20944887Schin if (i) 20954887Schin env->flags |= REG_NEWLINE; 20964887Schin else 20974887Schin env->flags &= ~REG_NEWLINE; 20984887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 20994887Schin break; 21004887Schin case 'p': 21014887Schin if (i) 21024887Schin env->flags &= ~REG_LENIENT; 21034887Schin else 21044887Schin env->flags |= REG_LENIENT; 21054887Schin break; 21064887Schin case 'r': 21074887Schin if (i) 21084887Schin env->flags |= REG_RIGHT; 21094887Schin else 21104887Schin env->flags &= ~REG_RIGHT; 21114887Schin break; 21124887Schin case 's': 21134887Schin if (i) 21144887Schin env->flags |= REG_SPAN; 21154887Schin else 21164887Schin env->flags &= ~REG_SPAN; 21174887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 21184887Schin break; 21194887Schin case 'x': 21204887Schin if (i) 21214887Schin env->flags |= REG_COMMENT; 21224887Schin else 21234887Schin env->flags &= ~REG_COMMENT; 21244887Schin break; 21254887Schin case 'A': 21264887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21274887Schin env->flags |= REG_AUGMENTED|REG_EXTENDED; 21284887Schin typ = ARE; 21294887Schin break; 21304887Schin case 'B': 21314887Schin case 'G': 21324887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21334887Schin typ = BRE; 21344887Schin break; 21354887Schin case 'E': 21364887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21374887Schin env->flags |= REG_EXTENDED; 21384887Schin typ = ERE; 21394887Schin break; 21404887Schin case 'F': 21414887Schin case 'L': 21424887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21434887Schin env->flags |= REG_LITERAL; 21444887Schin typ = ERE; 21454887Schin break; 21464887Schin case 'K': 21474887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21484887Schin env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 21494887Schin typ = KRE; 21504887Schin break; 21514887Schin case 'M': 21524887Schin /* used by caller to disable glob(3) GLOB_BRACE */ 21534887Schin break; 21544887Schin case 'N': 21554887Schin /* used by caller to disable glob(3) GLOB_NOCHECK */ 21564887Schin break; 2157*8462SApril.Chin@Sun.COM case 'O': 21584887Schin /* used by caller to disable glob(3) GLOB_STARSTAR */ 21594887Schin break; 21604887Schin case 'S': 21614887Schin env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 21624887Schin env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 21634887Schin typ = SRE; 21644887Schin break; 21654887Schin case 'U': /* PCRE_UNGREEDY */ 21664887Schin if (i) 21674887Schin env->flags |= REG_MINIMAL; 21684887Schin else 21694887Schin env->flags &= ~REG_MINIMAL; 21704887Schin break; 21714887Schin case 'X': /* PCRE_EXTRA */ 21724887Schin break; 21734887Schin default: 21744887Schin env->error = REG_BADRPT; 21754887Schin return 0; 21764887Schin } 21774887Schin eat(env); 21784887Schin c = token(env); 21794887Schin } 21804887Schin done: 21814887Schin break; 21824887Schin case ':': 21834887Schin x = REX_GROUP; 21844887Schin break; 21854887Schin case '=': 21864887Schin x = REX_GROUP_AHEAD; 21874887Schin break; 21884887Schin case '!': 21894887Schin x = REX_GROUP_AHEAD_NOT; 21904887Schin break; 21914887Schin case '<': 21924887Schin switch (token(env)) 21934887Schin { 21944887Schin case '=': 21954887Schin x = REX_GROUP_BEHIND; 21964887Schin break; 21974887Schin case '!': 21984887Schin case T_BANG: 21994887Schin x = REX_GROUP_BEHIND_NOT; 22004887Schin break; 22014887Schin default: 22024887Schin env->error = REG_BADRPT; 22034887Schin return 0; 22044887Schin } 22054887Schin eat(env); 22064887Schin break; 22074887Schin case '>': 22084887Schin x = REX_GROUP_CUT; 22094887Schin break; 22104887Schin case '%': 22114887Schin case T_PERCENT: 22124887Schin e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 22134887Schin e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 22144887Schin n = 1; 22154887Schin for (;;) 22164887Schin { 22174887Schin switch (i = chr(env, &esc)) 22184887Schin { 22194887Schin case -1: 22204887Schin case 0: 22214887Schin invalid: 22224887Schin env->cursor -= esc + 1; 22234887Schin env->error = REG_EPAREN; 22244887Schin return 0; 22254887Schin case 'D': 22264887Schin x = REX_NEST_delimiter; 22274887Schin /*FALLTHROUGH*/ 22284887Schin delimiter: 22294887Schin if ((i = chr(env, &esc)) < 0) 22304887Schin goto invalid; 22314887Schin if (e->re.nest.type[i] & ~x) 22324887Schin goto invalid; 22334887Schin e->re.nest.type[i] = x; 22344887Schin continue; 22354887Schin case 'E': 22364887Schin x = REX_NEST_escape; 22374887Schin goto delimiter; 22384887Schin case 'L': 22394887Schin x = REX_NEST_literal; 22404887Schin goto quote; 22414887Schin case 'O': 22424887Schin switch (i = chr(env, &esc)) 22434887Schin { 22444887Schin case 'T': 22454887Schin e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 22464887Schin break; 22474887Schin default: 22484887Schin goto invalid; 22494887Schin } 22504887Schin continue; 22514887Schin case 'Q': 22524887Schin x = REX_NEST_quote; 22534887Schin /*FALLTHROUGH*/ 22544887Schin quote: 22554887Schin if ((i = chr(env, &esc)) < 0) 22564887Schin goto invalid; 22574887Schin if (e->re.nest.type[i] & ~x) 22584887Schin goto invalid; 22594887Schin e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 22604887Schin continue; 22614887Schin case 'S': 22624887Schin x = REX_NEST_separator; 22634887Schin goto delimiter; 22644887Schin case 'T': 22654887Schin x = REX_NEST_terminator; 22664887Schin goto delimiter; 22674887Schin case '|': 22684887Schin case '&': 22694887Schin if (!esc) 22704887Schin goto invalid; 22714887Schin goto nesting; 22724887Schin case '(': 22734887Schin if (!esc) 22744887Schin n++; 22754887Schin goto nesting; 22764887Schin case ')': 22774887Schin if (!esc && !--n) 22784887Schin break; 22794887Schin goto nesting; 22804887Schin default: 22814887Schin nesting: 22824887Schin if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 22834887Schin goto invalid; 22844887Schin e->re.nest.type[i] = REX_NEST_open; 22854887Schin if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 22864887Schin goto invalid; 22874887Schin if (!esc) 22884887Schin { 22894887Schin if (x == ')' && !--n) 22904887Schin goto invalid; 22914887Schin else if (x == '(') 22924887Schin n++; 22934887Schin } 22944887Schin e->re.nest.type[x] |= REX_NEST_close; 22954887Schin e->re.nest.type[i] |= x << REX_NEST_SHIFT; 22964887Schin continue; 22974887Schin } 22984887Schin break; 22994887Schin } 23004887Schin env->parnest--; 23014887Schin if (c == T_PERCENT) 23024887Schin for (n = 0; n < 2; n++) 23034887Schin { 23044887Schin parno = ++env->parno; 23054887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 23064887Schin { 23074887Schin drop(env->disc, e); 23084887Schin return 0; 23094887Schin } 23104887Schin if (parno < elementsof(env->paren)) 23114887Schin env->paren[parno] = f; 23124887Schin f->re.group.back = 0; 23134887Schin f->re.group.number = parno; 23144887Schin f->re.group.expr.rex = e; 23154887Schin e = f; 23164887Schin } 23174887Schin return e; 23184887Schin case '(': 23194887Schin c = 0; 23204887Schin if (isdigit(*env->cursor)) 23214887Schin { 23224887Schin f = 0; 23234887Schin do 23244887Schin { 23254887Schin if (c > (INT_MAX / 10)) 23264887Schin { 23274887Schin env->error = REG_BADRPT; 23284887Schin return 0; 23294887Schin } 23304887Schin c = c * 10 + (*env->cursor++ - '0'); 23314887Schin } while (isdigit(*env->cursor)); 23324887Schin if (*env->cursor++ != ')') 23334887Schin { 23344887Schin env->error = REG_BADRPT; 23354887Schin return 0; 23364887Schin } 23374887Schin if (c && env->type >= SRE) 23384887Schin c = c * 2 - 1; 23394887Schin if (!c || c > env->parno || !env->paren[c]) 23404887Schin { 23414887Schin if (!(env->flags & REG_LENIENT)) 23424887Schin { 23434887Schin env->error = REG_ESUBREG; 23444887Schin return 0; 23454887Schin } 23464887Schin if (c) 23474887Schin c = -1; 23484887Schin } 23494887Schin } 23504887Schin else 23514887Schin { 23524887Schin if (env->type < SRE && *env->cursor++ != '?') 23534887Schin { 23544887Schin env->error = REG_BADRPT; 23554887Schin return 0; 23564887Schin } 23574887Schin if (!(f = grp(env, parno + 1)) && env->error) 23584887Schin return 0; 23594887Schin } 23604887Schin if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 23614887Schin { 23624887Schin drop(env->disc, f); 23634887Schin return 0; 23644887Schin } 23654887Schin e->re.group.size = c; 23664887Schin e->re.group.expr.binary.left = f; 23674887Schin if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 23684887Schin { 23694887Schin drop(env->disc, e); 23704887Schin return 0; 23714887Schin } 23724887Schin if (token(env) != T_CLOSE) 23734887Schin { 23744887Schin env->error = REG_EPAREN; 23754887Schin return 0; 23764887Schin } 23774887Schin eat(env); 23784887Schin env->parnest--; 23794887Schin return rep(env, e, parno, parno); 23804887Schin case '{': 23814887Schin p = env->cursor; 23824887Schin n = 1; 23834887Schin while (c = *env->cursor) 23844887Schin { 23854887Schin if (c == '\\' && *(env->cursor + 1)) 23864887Schin env->cursor++; 23874887Schin else if (c == '{') 23884887Schin n++; 23894887Schin else if (c == '}' && !--n) 23904887Schin break; 23914887Schin else if (c == env->delimiter || c == env->terminator) 23924887Schin break; 23934887Schin env->cursor++; 23944887Schin } 23954887Schin if (c != '}') 23964887Schin { 23974887Schin env->error = REG_EBRACE; 23984887Schin return 0; 23994887Schin } 24004887Schin if (*++env->cursor != ')') 24014887Schin { 24024887Schin env->error = REG_EPAREN; 24034887Schin return 0; 24044887Schin } 24054887Schin env->cursor++; 24064887Schin env->parnest--; 24074887Schin if (env->disc->re_version < REG_VERSION_EXEC) 24084887Schin { 24094887Schin env->error = REG_BADRPT; 24104887Schin return 0; 24114887Schin } 24124887Schin if (!env->disc->re_execf) 24134887Schin return 0; 24144887Schin if (!(e = node(env, REX_EXEC, 0, 0, 0))) 24154887Schin return 0; 24164887Schin e->re.exec.text = (const char*)p; 24174887Schin e->re.exec.size = env->cursor - p - 2; 24184887Schin if (!env->disc->re_compf) 24194887Schin e->re.exec.data = 0; 24204887Schin else 24214887Schin e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 24224887Schin return e; 24234887Schin case '0': case '1': case '2': case '3': case '4': 24244887Schin case '5': case '6': case '7': case '8': case '9': 24254887Schin c -= '0'; 24264887Schin while (isdigit(*env->cursor)) 24274887Schin { 24284887Schin if (c > (INT_MAX / 10)) 24294887Schin { 24304887Schin env->error = REG_ESUBREG; 24314887Schin return 0; 24324887Schin } 24334887Schin c = c * 10 + *env->cursor++ - '0'; 24344887Schin } 24354887Schin if (*env->cursor == ')') 24364887Schin { 24374887Schin env->cursor++; 24384887Schin env->parnest--; 24394887Schin env->token.len = 1; 24404887Schin if (c > env->parno || !env->paren[c]) 24414887Schin { 24424887Schin env->error = REG_ESUBREG; 24434887Schin return 0; 24444887Schin } 24454887Schin env->paren[c]->re.group.back = 1; 24464887Schin return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 24474887Schin } 24484887Schin /*FALLTHROUGH*/ 24494887Schin default: 24504887Schin env->error = REG_BADRPT; 24514887Schin return 0; 24524887Schin } 24534887Schin if (x && !(e = alt(env, parno, 0))) 24544887Schin return 0; 24554887Schin c = token(env); 24564887Schin env->parnest--; 2457*8462SApril.Chin@Sun.COM if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) 24584887Schin { 24594887Schin env->error = REG_EPAREN; 24604887Schin return 0; 24614887Schin } 24624887Schin eat(env); 24634887Schin if (typ >= 0) 24644887Schin { 24654887Schin if (beg) 24664887Schin env->pattern = env->cursor; 24674887Schin env->type = typ; 24684887Schin } 24694887Schin if (!x) 24704887Schin return 0; 24714887Schin if (!(f = node(env, x, 0, 0, 0))) 24724887Schin { 24734887Schin drop(env->disc, e); 24744887Schin return 0; 24754887Schin } 24764887Schin f->re.group.expr.rex = e; 24774887Schin if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 24784887Schin { 24794887Schin if (stats(env, e)) 24804887Schin { 24814887Schin drop(env->disc, f); 24824887Schin if (!env->error) 24834887Schin env->error = REG_ECOUNT; 24844887Schin return 0; 24854887Schin } 24864887Schin f->re.group.size = env->stats.m; 24874887Schin memset(&env->stats, 0, sizeof(env->stats)); 24884887Schin } 24894887Schin switch (x) 24904887Schin { 24914887Schin case REX_GROUP: 24924887Schin case REX_GROUP_CUT: 24934887Schin f = rep(env, f, parno, env->parno); 24944887Schin break; 24954887Schin } 24964887Schin return f; 24974887Schin } 24984887Schin 24994887Schin static Rex_t* 25004887Schin seq(Cenv_t* env) 25014887Schin { 25024887Schin Rex_t* e; 25034887Schin Rex_t* f; 25044887Schin Token_t tok; 25054887Schin int c; 25064887Schin int i; 25074887Schin int n; 25084887Schin int x; 25094887Schin int parno; 25104887Schin int type; 25114887Schin regflags_t flags; 25124887Schin unsigned char* s; 25134887Schin unsigned char* p; 25144887Schin unsigned char* t; 25154887Schin unsigned char* u; 25164887Schin unsigned char buf[256]; 25174887Schin 25184887Schin for (;;) 25194887Schin { 25204887Schin s = buf; 25214887Schin while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 25224887Schin { 25234887Schin x = c; 25244887Schin p = env->cursor; 25254887Schin if (c >= 0) 25264887Schin { 25274887Schin n = 1; 25284887Schin *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 25294887Schin } 2530*8462SApril.Chin@Sun.COM else if (c == C_ESC || (env->flags & REG_ICASE)) 25314887Schin { 2532*8462SApril.Chin@Sun.COM c = (c == C_ESC) ? env->token.lex : mbchar(p); 2533*8462SApril.Chin@Sun.COM if (env->flags & REG_ICASE) 2534*8462SApril.Chin@Sun.COM c = towupper(c); 2535*8462SApril.Chin@Sun.COM if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) 2536*8462SApril.Chin@Sun.COM break; 2537*8462SApril.Chin@Sun.COM if ((n = wctomb((char*)s, c)) < 0) 2538*8462SApril.Chin@Sun.COM *s++ = c; 2539*8462SApril.Chin@Sun.COM else if (n) 2540*8462SApril.Chin@Sun.COM s += n; 2541*8462SApril.Chin@Sun.COM else 25424887Schin { 25434887Schin n = 1; 25444887Schin *s++ = 0; 25454887Schin } 25464887Schin } 25474887Schin else 25484887Schin { 25494887Schin n = env->token.len - env->token.esc; 25504887Schin for (t = p, u = s + n; s < u; *s++ = *t++); 25514887Schin } 25524887Schin eat(env); 25534887Schin } 25544887Schin if (c == T_BAD) 25554887Schin return 0; 25564887Schin if (s > buf) 25574887Schin switch (c) 25584887Schin { 25594887Schin case T_STAR: 25604887Schin case T_PLUS: 25614887Schin case T_LEFT: 25624887Schin case T_QUES: 25634887Schin case T_BANG: 25644887Schin if ((s -= n) == buf) 25654887Schin e = 0; 25664887Schin else 25674887Schin { 25684887Schin i = s - buf; 25694887Schin if (!(e = node(env, REX_STRING, 0, 0, i))) 25704887Schin return 0; 25714887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 25724887Schin e->re.string.size = i; 25734887Schin } 25744887Schin if (x >= 0) 25754887Schin { 25764887Schin if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 25774887Schin { 25784887Schin drop(env->disc, e); 25794887Schin return 0; 25804887Schin } 25814887Schin f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 25824887Schin } 25834887Schin else 25844887Schin { 2585*8462SApril.Chin@Sun.COM if (!(f = node(env, REX_STRING, 0, 0, n))) 25864887Schin return 0; 2587*8462SApril.Chin@Sun.COM memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); 2588*8462SApril.Chin@Sun.COM f->re.string.size = n; 25894887Schin } 25904887Schin if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 25914887Schin { 25924887Schin drop(env->disc, e); 25934887Schin return 0; 25944887Schin } 25954887Schin if (e) 25964887Schin f = cat(env, e, f); 25974887Schin return f; 25984887Schin default: 25994887Schin c = s - buf; 26004887Schin if (!(e = node(env, REX_STRING, 0, 0, c))) 26014887Schin return 0; 26024887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 26034887Schin e->re.string.size = c; 26044887Schin return cat(env, e, seq(env)); 26054887Schin } 26064887Schin else if (c > T_BACK) 26074887Schin { 26084887Schin eat(env); 26094887Schin c -= T_BACK; 26104887Schin if (c > env->parno || !env->paren[c]) 26114887Schin { 26124887Schin env->error = REG_ESUBREG; 26134887Schin return 0; 26144887Schin } 26154887Schin env->paren[c]->re.group.back = 1; 26164887Schin e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 26174887Schin } 26184887Schin else 26194887Schin switch (c) 26204887Schin { 26214887Schin case T_AND: 26224887Schin case T_CLOSE: 26234887Schin case T_BAR: 26244887Schin case T_END: 26254887Schin return node(env, REX_NULL, 0, 0, 0); 26264887Schin case T_DOLL: 26274887Schin eat(env); 26284887Schin e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 26294887Schin break; 26304887Schin case T_CFLX: 26314887Schin eat(env); 26324887Schin if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 26334887Schin e = rep(env, e, 0, 0); 26344887Schin break; 26354887Schin case T_OPEN: 26364887Schin tok = env->token; 26374887Schin eat(env); 26384887Schin flags = env->flags; 26394887Schin type = env->type; 26404887Schin if (env->token.att) 26414887Schin env->flags |= REG_MINIMAL; 26424887Schin env->parnest++; 26434887Schin if (env->type == KRE) 26444887Schin ++env->parno; 26454887Schin parno = ++env->parno; 26464887Schin if (!(e = alt(env, parno + 1, 0))) 26474887Schin break; 26484887Schin if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) 26494887Schin { 26504887Schin drop(env->disc, e); 26514887Schin env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 26524887Schin return 0; 26534887Schin } 26544887Schin if (token(env) != T_CLOSE) 26554887Schin { 26564887Schin drop(env->disc, e); 26574887Schin env->error = REG_EPAREN; 26584887Schin return 0; 26594887Schin } 26604887Schin env->parnest--; 26614887Schin eat(env); 26624887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 26634887Schin { 26644887Schin drop(env->disc, e); 26654887Schin return 0; 26664887Schin } 26674887Schin if (parno < elementsof(env->paren)) 26684887Schin env->paren[parno] = f; 26694887Schin f->re.group.back = 0; 26704887Schin f->re.group.number = parno; 26714887Schin f->re.group.expr.rex = e; 26724887Schin if (tok.lex) 26734887Schin { 26744887Schin tok.push = 1; 26754887Schin env->token = tok; 26764887Schin } 26774887Schin if (!(e = rep(env, f, parno, env->parno))) 26784887Schin return 0; 26794887Schin if (env->type == KRE) 26804887Schin { 26814887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 26824887Schin { 26834887Schin drop(env->disc, e); 26844887Schin return 0; 26854887Schin } 26864887Schin if (--parno < elementsof(env->paren)) 26874887Schin env->paren[parno] = f; 26884887Schin f->re.group.back = 0; 26894887Schin f->re.group.number = parno; 26904887Schin f->re.group.expr.rex = e; 26914887Schin e = f; 26924887Schin } 26934887Schin env->flags = flags; 26944887Schin env->type = type; 26954887Schin break; 26964887Schin case T_GROUP: 26974887Schin p = env->cursor; 26984887Schin eat(env); 26994887Schin flags = env->flags; 27004887Schin type = env->type; 27014887Schin if (!(e = grp(env, env->parno + 1))) 27024887Schin { 27034887Schin if (env->error) 27044887Schin return 0; 27054887Schin if (env->literal == env->pattern && env->literal == p) 27064887Schin env->literal = env->cursor; 27074887Schin continue; 27084887Schin } 27094887Schin env->flags = flags; 27104887Schin env->type = type; 27114887Schin break; 27124887Schin case T_BRA: 27134887Schin eat(env); 27144887Schin if (e = bra(env)) 27154887Schin e = rep(env, e, 0, 0); 27164887Schin break; 27174887Schin case T_ALNUM: 27184887Schin case T_ALNUM_NOT: 27194887Schin case T_DIGIT: 27204887Schin case T_DIGIT_NOT: 27214887Schin case T_SPACE: 27224887Schin case T_SPACE_NOT: 27234887Schin eat(env); 27244887Schin if (e = ccl(env, c)) 27254887Schin e = rep(env, e, 0, 0); 27264887Schin break; 27274887Schin case T_LT: 27284887Schin eat(env); 27294887Schin e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 27304887Schin break; 27314887Schin case T_GT: 27324887Schin eat(env); 27334887Schin e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 27344887Schin break; 27354887Schin case T_DOT: 27364887Schin eat(env); 27374887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 27384887Schin break; 27394887Schin case T_DOTSTAR: 27404887Schin eat(env); 27414887Schin env->token.lex = T_STAR; 27424887Schin env->token.push = 1; 27434887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 27444887Schin break; 27454887Schin case T_SLASHPLUS: 27464887Schin eat(env); 27474887Schin env->token.lex = T_PLUS; 27484887Schin env->token.push = 1; 27494887Schin if (e = node(env, REX_ONECHAR, 1, 1, 0)) 27504887Schin { 27514887Schin e->re.onechar = '/'; 27524887Schin e = rep(env, e, 0, 0); 27534887Schin } 27544887Schin break; 27554887Schin case T_WORD: 27564887Schin eat(env); 27574887Schin e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 27584887Schin break; 27594887Schin case T_WORD_NOT: 27604887Schin eat(env); 27614887Schin e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 27624887Schin break; 27634887Schin case T_BEG_STR: 27644887Schin eat(env); 27654887Schin e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 27664887Schin break; 27674887Schin case T_END_STR: 27684887Schin eat(env); 27694887Schin e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 27704887Schin break; 27714887Schin case T_FIN_STR: 27724887Schin eat(env); 27734887Schin e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 27744887Schin break; 27754887Schin default: 27764887Schin env->error = REG_BADRPT; 27774887Schin return 0; 27784887Schin } 27794887Schin if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 27804887Schin e = cat(env, e, seq(env)); 27814887Schin return e; 27824887Schin } 27834887Schin } 27844887Schin 27854887Schin static Rex_t* 27864887Schin con(Cenv_t* env) 27874887Schin { 27884887Schin Rex_t* e; 27894887Schin Rex_t* f; 27904887Schin Rex_t* g; 27914887Schin 27924887Schin if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 27934887Schin return e; 27944887Schin eat(env); 27954887Schin if (!(f = con(env))) 27964887Schin { 27974887Schin drop(env->disc, e); 27984887Schin return 0; 27994887Schin } 28004887Schin if (!(g = node(env, REX_CONJ, 0, 0, 0))) 28014887Schin { 28024887Schin drop(env->disc, e); 28034887Schin drop(env->disc, f); 28044887Schin return 0; 28054887Schin } 28064887Schin g->re.group.expr.binary.left = e; 28074887Schin g->re.group.expr.binary.right = f; 28084887Schin return g; 28094887Schin } 28104887Schin 28114887Schin static Rex_t* 28124887Schin alt(Cenv_t* env, int number, int cond) 28134887Schin { 28144887Schin Rex_t* e; 28154887Schin Rex_t* f; 28164887Schin Rex_t* g; 28174887Schin 28184887Schin if (!(e = con(env))) 28194887Schin return 0; 28204887Schin else if (token(env) != T_BAR) 28214887Schin { 28224887Schin if (!cond) 28234887Schin return e; 28244887Schin f = 0; 28254887Schin if (e->type == REX_NULL) 28264887Schin goto bad; 28274887Schin } 28284887Schin else 28294887Schin { 28304887Schin eat(env); 28314887Schin if (!(f = alt(env, number, 0))) 28324887Schin { 28334887Schin drop(env->disc, e); 28344887Schin return 0; 28354887Schin } 28364887Schin if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) 28374887Schin goto bad; 28384887Schin if (!cond && (g = trie(env, e, f))) 28394887Schin return g; 28404887Schin } 28414887Schin if (!(g = node(env, REX_ALT, 0, 0, 0))) 28424887Schin { 28434887Schin env->error = REG_ESPACE; 28444887Schin goto bad; 28454887Schin } 28464887Schin g->re.group.number = number; 28474887Schin g->re.group.last = env->parno; 28484887Schin g->re.group.expr.binary.left = e; 28494887Schin g->re.group.expr.binary.right = f; 28504887Schin return g; 28514887Schin bad: 28524887Schin drop(env->disc, e); 28534887Schin drop(env->disc, f); 28544887Schin if (!env->error) 28554887Schin env->error = REG_ENULL; 28564887Schin return 0; 28574887Schin } 28584887Schin 28594887Schin /* 28604887Schin * add v to REX_BM tables 28614887Schin */ 28624887Schin 28634887Schin static void 28644887Schin bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 28654887Schin { 28664887Schin int c; 28674887Schin int m; 28684887Schin size_t z; 28694887Schin 28704887Schin for (m = 0; m < n; m++) 28714887Schin { 28724887Schin if (!(z = n - m - 1)) 28734887Schin z = HIT; 28744887Schin c = v[m]; 28754887Schin a->re.bm.mask[m][c] |= b; 28764887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 28774887Schin a->re.bm.skip[c] = z; 28784887Schin if (a->flags & REG_ICASE) 28794887Schin { 28804887Schin if (isupper(c)) 28814887Schin c = tolower(c); 28824887Schin else if (islower(c)) 28834887Schin c = toupper(c); 28844887Schin else 28854887Schin continue; 28864887Schin a->re.bm.mask[m][c] |= b; 28874887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 28884887Schin a->re.bm.skip[c] = z; 28894887Schin } 28904887Schin } 28914887Schin } 28924887Schin 28934887Schin /* 28944887Schin * set up BM table from trie 28954887Schin */ 28964887Schin 28974887Schin static int 28984887Schin bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 28994887Schin { 29004887Schin do 29014887Schin { 29024887Schin v[m] = x->c; 29034887Schin if (m >= (n - 1)) 29044887Schin { 29054887Schin bmstr(env, a, v, n, b); 29064887Schin if (!(b <<= 1)) 29074887Schin { 29084887Schin b = 1; 29094887Schin a->re.bm.complete = 0; 29104887Schin } 29114887Schin else if (x->son) 29124887Schin a->re.bm.complete = 0; 29134887Schin } 29144887Schin else if (x->son) 29154887Schin b = bmtrie(env, a, v, x->son, n, m + 1, b); 29164887Schin } while (x = x->sib); 29174887Schin return b; 29184887Schin } 29194887Schin 29204887Schin /* 29214887Schin * rewrite the expression tree for some special cases 29224887Schin * 1. it is a null expression - illegal 29234887Schin * 2. max length fixed string found -- use BM algorithm 29244887Schin * 3. it begins with an unanchored string - use KMP algorithm 29254887Schin * 0 returned on success 29264887Schin */ 29274887Schin 29284887Schin static int 29294887Schin special(Cenv_t* env, regex_t* p) 29304887Schin { 29314887Schin Rex_t* a; 29324887Schin Rex_t* e; 29334887Schin Rex_t* t; 29344887Schin Rex_t* x; 29354887Schin Rex_t* y; 29364887Schin unsigned char* s; 29374887Schin int* f; 29384887Schin int n; 29394887Schin int m; 29404887Schin int k; 29414887Schin 29424887Schin DEBUG_INIT(); 29434887Schin if (e = p->env->rex) 29444887Schin { 29454887Schin if ((x = env->stats.x) && x->re.string.size < 3) 29464887Schin x = 0; 29474887Schin if ((t = env->stats.y) && t->re.trie.min < 3) 29484887Schin t = 0; 29494887Schin if (x && t) 29504887Schin { 29514887Schin if (x->re.string.size >= t->re.trie.min) 29524887Schin t = 0; 29534887Schin else 29544887Schin x = 0; 29554887Schin } 2956*8462SApril.Chin@Sun.COM if (x || t) 29574887Schin { 29584887Schin Bm_mask_t** mask; 29594887Schin Bm_mask_t* h; 29604887Schin unsigned char* v; 29614887Schin size_t* q; 29624887Schin unsigned long l; 29634887Schin int i; 29644887Schin int j; 29654887Schin 29664887Schin if (x) 29674887Schin { 29684887Schin y = x; 29694887Schin n = m = x->re.string.size; 29704887Schin l = env->stats.l; 29714887Schin } 29724887Schin else 29734887Schin { 29744887Schin y = t; 29754887Schin n = t->re.trie.min; 29764887Schin m = t->re.trie.max; 29774887Schin l = env->stats.k; 29784887Schin } 29794887Schin if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 29804887Schin return 1; 29814887Schin if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 29824887Schin { 29834887Schin alloc(env->disc, q, 0); 29844887Schin return 1; 29854887Schin } 29864887Schin a->flags = y->flags; 29874887Schin a->map = y->map; 29884887Schin a->re.bm.size = n; 29894887Schin a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 29904887Schin a->re.bm.left = l - 1; 29914887Schin a->re.bm.right = env->stats.m - l - n; 29924887Schin a->re.bm.complete = (y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 29934887Schin h = (Bm_mask_t*)&a->re.bm.mask[n]; 29944887Schin a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 29954887Schin a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 29964887Schin for (m = 0; m <= UCHAR_MAX; m++) 29974887Schin a->re.bm.skip[m] = n; 29984887Schin a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 29994887Schin for (i = 1; i <= n; i++) 30004887Schin a->re.bm.fail[i] = 2 * n - i; 30014887Schin mask = a->re.bm.mask; 30024887Schin for (m = 0; m < n; m++) 30034887Schin { 30044887Schin mask[m] = h; 30054887Schin h += UCHAR_MAX + 1; 30064887Schin } 30074887Schin if (x) 30084887Schin bmstr(env, a, x->re.string.base, n, 1); 30094887Schin else 30104887Schin { 30114887Schin v = (unsigned char*)q; 30124887Schin memset(v, 0, n); 30134887Schin m = 1; 30144887Schin for (i = 0; i <= UCHAR_MAX; i++) 30154887Schin if (t->re.trie.root[i]) 30164887Schin m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 30174887Schin } 30184887Schin mask--; 30194887Schin memset(q, 0, n * sizeof(*q)); 30204887Schin for (k = (j = n) + 1; j > 0; j--, k--) 30214887Schin { 30224887Schin DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 30234887Schin for (q[j] = k; k <= n; k = q[k]) 30244887Schin { 30254887Schin for (m = 0; m <= UCHAR_MAX; m++) 30264887Schin if (mask[k][m] == mask[j][m]) 30274887Schin { 30284887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 30294887Schin goto cut; 30304887Schin } 30314887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 30324887Schin if (a->re.bm.fail[k] > n - j) 30334887Schin a->re.bm.fail[k] = n - j; 30344887Schin } 30354887Schin cut: ; 30364887Schin } 30374887Schin for (i = 1; i <= n; i++) 30384887Schin if (a->re.bm.fail[i] > n + k - i) 30394887Schin { 30404887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 30414887Schin a->re.bm.fail[i] = n + k - i; 30424887Schin } 30434887Schin #if _AST_REGEX_DEBUG 30444887Schin if (DEBUG_TEST(0x0020,(1),(0))) 30454887Schin { 30464887Schin sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 30474887Schin for (m = 0; m < n; m++) 30484887Schin for (i = 1; i <= UCHAR_MAX; i++) 30494887Schin if (a->re.bm.mask[m][i]) 30504887Schin sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 30514887Schin for (i = ' '; i <= UCHAR_MAX; i++) 30524887Schin if (a->re.bm.skip[i] >= HIT) 30534887Schin sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 30544887Schin else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 30554887Schin sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 30564887Schin for (j = 31; j >= 0; j--) 30574887Schin { 30584887Schin sfprintf(sfstderr, " "); 30594887Schin next: 30604887Schin for (m = 0; m < n; m++) 30614887Schin { 30624887Schin for (i = 0040; i < 0177; i++) 30634887Schin if (a->re.bm.mask[m][i] & (1 << j)) 30644887Schin { 30654887Schin sfprintf(sfstderr, " %c", i); 30664887Schin break; 30674887Schin } 30684887Schin if (i >= 0177) 30694887Schin { 30704887Schin if (j > 0) 30714887Schin { 30724887Schin j--; 30734887Schin goto next; 30744887Schin } 30754887Schin sfprintf(sfstderr, " ?"); 30764887Schin } 30774887Schin } 30784887Schin sfprintf(sfstderr, "\n"); 30794887Schin } 30804887Schin sfprintf(sfstderr, "FAIL: "); 30814887Schin for (m = 1; m <= n; m++) 30824887Schin sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 30834887Schin sfprintf(sfstderr, "\n"); 30844887Schin } 30854887Schin #endif 30864887Schin alloc(env->disc, q, 0); 30874887Schin a->next = e; 30884887Schin p->env->rex = a; 30894887Schin return 0; 30904887Schin } 30914887Schin switch (e->type) 30924887Schin { 30934887Schin case REX_BEG: 30944887Schin if (env->flags & REG_NEWLINE) 30954887Schin return 0; 30964887Schin break; 30974887Schin case REX_GROUP: 30984887Schin if (env->stats.b) 30994887Schin return 0; 31004887Schin e = e->re.group.expr.rex; 31014887Schin if (e->type != REX_DOT) 31024887Schin return 0; 31034887Schin /*FALLTHROUGH*/ 31044887Schin case REX_DOT: 31054887Schin if (e->lo == 0 && e->hi == RE_DUP_INF) 31064887Schin break; 31074887Schin return 0; 31084887Schin case REX_NULL: 31094887Schin if (env->flags & REG_NULL) 31104887Schin break; 31114887Schin env->error = REG_ENULL; 31124887Schin return 1; 31134887Schin case REX_STRING: 3114*8462SApril.Chin@Sun.COM if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) 31154887Schin return 0; 31164887Schin s = e->re.string.base; 31174887Schin n = e->re.string.size; 31184887Schin if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 31194887Schin return 1; 31204887Schin a->flags = e->flags; 31214887Schin a->map = e->map; 31224887Schin f = a->re.string.fail; 31234887Schin memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 31244887Schin s = a->re.string.base; 31254887Schin a->re.string.size = n; 31264887Schin f[0] = m = -1; 31274887Schin for (k = 1; k < n; k++) 31284887Schin { 31294887Schin while (m >= 0 && s[m+1] != s[k]) 31304887Schin m = f[m]; 31314887Schin if (s[m+1] == s[k]) 31324887Schin m++; 31334887Schin f[k] = m; 31344887Schin } 31354887Schin a->next = e->next; 31364887Schin p->env->rex = a; 31374887Schin e->next = 0; 31384887Schin drop(env->disc, e); 31394887Schin break; 31404887Schin default: 31414887Schin return 0; 31424887Schin } 31434887Schin } 31444887Schin p->env->once = 1; 31454887Schin return 0; 31464887Schin } 31474887Schin 31484887Schin int 31494887Schin regcomp(regex_t* p, const char* pattern, regflags_t flags) 31504887Schin { 31514887Schin Rex_t* e; 31524887Schin Rex_t* f; 31534887Schin regdisc_t* disc; 31544887Schin unsigned char* fold; 31554887Schin int i; 31564887Schin Cenv_t env; 31574887Schin 31584887Schin if (!p) 31594887Schin return REG_BADPAT; 31604887Schin if (flags & REG_DISCIPLINE) 31614887Schin { 31624887Schin flags &= ~REG_DISCIPLINE; 31634887Schin disc = p->re_disc; 31644887Schin } 31654887Schin else 31664887Schin disc = &state.disc; 31674887Schin if (!disc->re_errorlevel) 31684887Schin disc->re_errorlevel = 2; 31694887Schin p->env = 0; 31704887Schin if (!pattern) 31714887Schin return fatal(disc, REG_BADPAT, pattern); 31724887Schin if (!state.initialized) 31734887Schin { 31744887Schin state.initialized = 1; 31754887Schin for (i = 0; i < elementsof(state.escape); i++) 31764887Schin state.magic[state.escape[i].key] = state.escape[i].val; 31774887Schin } 31784887Schin if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 31794887Schin { 31804887Schin if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 31814887Schin return fatal(disc, REG_ESPACE, pattern); 31824887Schin for (i = 0; i <= UCHAR_MAX; i++) 31834887Schin fold[i] = toupper(i); 31844887Schin LCINFO(AST_LC_CTYPE)->data = (void*)fold; 31854887Schin } 31864887Schin again: 31874887Schin if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 31884887Schin return fatal(disc, REG_ESPACE, pattern); 31894887Schin memset(p->env, 0, sizeof(*p->env)); 31904887Schin memset(&env, 0, sizeof(env)); 31914887Schin env.regex = p; 31924887Schin env.flags = flags; 31934887Schin env.disc = p->env->disc = disc; 31944887Schin if (env.flags & REG_AUGMENTED) 31954887Schin env.flags |= REG_EXTENDED; 31964887Schin env.mappeddot = '.'; 31974887Schin env.mappednewline = '\n'; 31984887Schin env.mappedslash = '/'; 31994887Schin if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 32004887Schin { 32014887Schin env.map = disc->re_map; 32024887Schin env.MAP = p->env->fold; 32034887Schin for (i = 0; i <= UCHAR_MAX; i++) 32044887Schin { 32054887Schin env.MAP[i] = fold[env.map[i]]; 32064887Schin if (env.map[i] == '.') 32074887Schin env.mappeddot = i; 32084887Schin if (env.map[i] == '\n') 32094887Schin env.mappednewline = i; 32104887Schin if (env.map[i] == '/') 32114887Schin env.mappedslash = i; 32124887Schin } 32134887Schin } 32144887Schin else 32154887Schin env.MAP = fold; 32164887Schin env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 32174887Schin env.explicit = -1; 32184887Schin if (env.flags & REG_SHELL) 32194887Schin { 32204887Schin if (env.flags & REG_SHELL_PATH) 32214887Schin env.explicit = env.mappedslash; 32224887Schin env.flags |= REG_LENIENT|REG_NULL; 32234887Schin env.type = env.type == BRE ? SRE : KRE; 32244887Schin } 32254887Schin if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 32264887Schin env.explicit = env.mappednewline; 32274887Schin p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 32284887Schin env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 32294887Schin env.token.lex = 0; 32304887Schin env.token.push = 0; 32314887Schin if (env.flags & REG_DELIMITED) 32324887Schin { 32334887Schin switch (env.delimiter = *pattern++) 32344887Schin { 32354887Schin case 0: 32364887Schin case '\\': 32374887Schin case '\n': 32384887Schin case '\r': 32394887Schin env.error = REG_EDELIM; 32404887Schin goto bad; 32414887Schin } 32424887Schin env.terminator = '\n'; 32434887Schin } 32444887Schin env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 32454887Schin if (!(p->env->rex = alt(&env, 1, 0))) 32464887Schin goto bad; 32474887Schin if (env.parnest) 32484887Schin { 32494887Schin env.error = REG_EPAREN; 32504887Schin goto bad; 32514887Schin } 32524887Schin p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); 32534887Schin if (env.flags & REG_LEFT) 32544887Schin { 32554887Schin if (p->env->rex->type != REX_BEG) 32564887Schin { 32574887Schin if (p->env->rex->type == REX_ALT) 32584887Schin env.flags &= ~REG_FIRST; 32594887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0))) 32604887Schin { 32614887Schin regfree(p); 32624887Schin return fatal(disc, REG_ESPACE, pattern); 32634887Schin } 32644887Schin e->next = p->env->rex; 32654887Schin p->env->rex = e; 32664887Schin p->env->once = 1; 32674887Schin } 32684887Schin p->env->stats.re_flags |= REG_LEFT; 32694887Schin } 32704887Schin for (e = p->env->rex; e->next; e = e->next); 32714887Schin p->env->done.type = REX_DONE; 32724887Schin p->env->done.flags = e->flags; 32734887Schin if (env.flags & REG_RIGHT) 32744887Schin { 32754887Schin if (e->type != REX_END) 32764887Schin { 32774887Schin if (p->env->rex->type == REX_ALT) 32784887Schin env.flags &= ~REG_FIRST; 32794887Schin if (!(f = node(&env, REX_END, 0, 0, 0))) 32804887Schin { 32814887Schin regfree(p); 32824887Schin return fatal(disc, REG_ESPACE, pattern); 32834887Schin } 32844887Schin f->flags = e->flags; 32854887Schin f->map = e->map; 32864887Schin e->next = f; 32874887Schin } 32884887Schin p->env->stats.re_flags |= REG_RIGHT; 32894887Schin } 32904887Schin if (stats(&env, p->env->rex)) 32914887Schin { 32924887Schin if (!env.error) 32934887Schin env.error = REG_ECOUNT; 32944887Schin goto bad; 32954887Schin } 32964887Schin if (env.stats.b) 32974887Schin p->env->hard = p->env->separate = 1; 32984887Schin else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 32994887Schin p->env->hard = 1; 33004887Schin if (p->env->hard || env.stats.c || env.stats.i) 33014887Schin p->env->stats.re_min = p->env->stats.re_max = -1; 33024887Schin else 33034887Schin { 33044887Schin if (!(p->env->stats.re_min = env.stats.m)) 33054887Schin p->env->stats.re_min = -1; 33064887Schin if (!(p->env->stats.re_max = env.stats.n)) 33074887Schin p->env->stats.re_max = -1; 33084887Schin } 33094887Schin if (special(&env, p)) 33104887Schin goto bad; 33114887Schin serialize(&env, p->env->rex, 1); 33124887Schin p->re_nsub = env.stats.p; 33134887Schin if (env.type == KRE) 33144887Schin p->re_nsub /= 2; 33154887Schin if (env.flags & REG_DELIMITED) 33164887Schin { 33174887Schin p->re_npat = env.cursor - env.pattern + 1; 33184887Schin if (*env.cursor == env.delimiter) 33194887Schin p->re_npat++; 33204887Schin else if (env.flags & REG_MUSTDELIM) 33214887Schin { 33224887Schin env.error = REG_EDELIM; 33234887Schin goto bad; 33244887Schin } 33254887Schin else 33264887Schin env.flags &= ~REG_DELIMITED; 33274887Schin } 33284887Schin p->env->explicit = env.explicit; 33294887Schin p->env->flags = env.flags & REG_COMP; 33304887Schin p->env->min = env.stats.m; 33314887Schin p->env->nsub = env.stats.p + env.stats.u; 33324887Schin p->env->refs = 1; 33334887Schin return 0; 33344887Schin bad: 33354887Schin regfree(p); 33364887Schin if (!env.error) 33374887Schin env.error = REG_ESPACE; 33384887Schin if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 33394887Schin { 33404887Schin flags |= REG_LITERAL; 33414887Schin pattern = (const char*)env.literal; 33424887Schin goto again; 33434887Schin } 33444887Schin return fatal(disc, env.error, pattern); 33454887Schin } 33464887Schin 33474887Schin /* 33484887Schin * regcomp() on sized pattern 33494887Schin * the lazy way around adding and checking env.end 33504887Schin */ 33514887Schin 33524887Schin int 33534887Schin regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 33544887Schin { 33554887Schin char* s; 33564887Schin int r; 33574887Schin 33584887Schin if (!(s = malloc(size + 1))) 33594887Schin return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 33604887Schin memcpy(s, pattern, size); 33614887Schin s[size] = 0; 33624887Schin r = regcomp(p, s, flags); 33634887Schin free(s); 33644887Schin return r; 33654887Schin } 33664887Schin 33674887Schin /* 33684887Schin * combine two compiled regular expressions if possible, 33694887Schin * replacing first with the combination and freeing second. 33704887Schin * return 0 on success. 33714887Schin * the only combinations handled are building a Trie 33724887Schin * from String|Kmp|Trie and String|Kmp 33734887Schin */ 33744887Schin 33754887Schin int 33764887Schin regcomb(regex_t* p, regex_t* q) 33774887Schin { 33784887Schin Rex_t* e = p->env->rex; 33794887Schin Rex_t* f = q->env->rex; 33804887Schin Rex_t* g; 33814887Schin Cenv_t env; 33824887Schin 33834887Schin if (!e || !f) 33844887Schin return fatal(p->env->disc, REG_BADPAT, NiL); 33854887Schin if (p->env->separate || q->env->separate) 33864887Schin return REG_ESUBREG; 33874887Schin memset(&env, 0, sizeof(env)); 33884887Schin env.disc = p->env->disc; 33894887Schin if (e->type == REX_BM) 33904887Schin { 33914887Schin p->env->rex = e->next; 33924887Schin e->next = 0; 33934887Schin drop(env.disc, e); 33944887Schin e = p->env->rex; 33954887Schin } 33964887Schin if (f->type == REX_BM) 33974887Schin { 33984887Schin q->env->rex = f->next; 33994887Schin f->next = 0; 34004887Schin drop(env.disc, f); 34014887Schin f = q->env->rex; 34024887Schin } 34034887Schin if (e->type == REX_BEG && f->type == REX_BEG) 34044887Schin { 34054887Schin p->env->flags |= REG_LEFT; 34064887Schin p->env->rex = e->next; 34074887Schin e->next = 0; 34084887Schin drop(env.disc, e); 34094887Schin e = p->env->rex; 34104887Schin q->env->rex = f->next; 34114887Schin f->next = 0; 34124887Schin drop(env.disc, f); 34134887Schin f = q->env->rex; 34144887Schin } 34154887Schin if (e->next && e->next->type == REX_END && f->next && f->next->type == REX_END) 34164887Schin { 34174887Schin p->env->flags |= REG_RIGHT; 34184887Schin drop(env.disc, e->next); 34194887Schin e->next = 0; 34204887Schin drop(env.disc, f->next); 34214887Schin f->next = 0; 34224887Schin } 34234887Schin if (!(g = trie(&env, f, e))) 34244887Schin return fatal(p->env->disc, REG_BADPAT, NiL); 34254887Schin p->env->rex = g; 34264887Schin if (!q->env->once) 34274887Schin p->env->once = 0; 34284887Schin q->env->rex = 0; 34294887Schin if (p->env->flags & REG_LEFT) 34304887Schin { 34314887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0))) 34324887Schin { 34334887Schin regfree(p); 34344887Schin return fatal(p->env->disc, REG_ESPACE, NiL); 34354887Schin } 34364887Schin e->next = p->env->rex; 34374887Schin p->env->rex = e; 34384887Schin p->env->once = 1; 34394887Schin } 34404887Schin if (p->env->flags & REG_RIGHT) 34414887Schin { 34424887Schin for (f = p->env->rex; f->next; f = f->next); 34434887Schin if (f->type != REX_END) 34444887Schin { 34454887Schin if (!(e = node(&env, REX_END, 0, 0, 0))) 34464887Schin { 34474887Schin regfree(p); 34484887Schin return fatal(p->env->disc, REG_ESPACE, NiL); 34494887Schin } 34504887Schin f->next = e; 34514887Schin } 34524887Schin } 34534887Schin env.explicit = p->env->explicit; 34544887Schin env.flags = p->env->flags; 34554887Schin env.disc = p->env->disc; 34564887Schin if (stats(&env, p->env->rex)) 34574887Schin { 34584887Schin regfree(p); 34594887Schin return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 34604887Schin } 34614887Schin if (special(&env, p)) 34624887Schin { 34634887Schin regfree(p); 34644887Schin return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 34654887Schin } 34664887Schin p->env->min = g->re.trie.min; 34674887Schin return 0; 34684887Schin } 34694887Schin 34704887Schin /* 34714887Schin * copy a reference of p into q 34724887Schin * p and q may then have separate regsubcomp() instantiations 34734887Schin */ 34744887Schin 34754887Schin int 34764887Schin regdup(regex_t* p, regex_t* q) 34774887Schin { 34784887Schin if (!p || !q) 34794887Schin return REG_BADPAT; 34804887Schin *q = *p; 34814887Schin p->env->refs++; 34824887Schin q->re_sub = 0; 34834887Schin return 0; 34844887Schin } 3485