14887Schin /*********************************************************************** 24887Schin * * 34887Schin * This software is part of the ast package * 4*12068SRoger.Faulkner@Oracle.COM * Copyright (c) 1985-2010 AT&T Intellectual Property * 54887Schin * and is licensed under the * 64887Schin * Common Public License, Version 1.0 * 78462SApril.Chin@Sun.COM * by AT&T Intellectual Property * 84887Schin * * 94887Schin * A copy of the License is available at * 104887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 124887Schin * * 134887Schin * Information and Software Systems Research * 144887Schin * AT&T Research * 154887Schin * Florham Park NJ * 164887Schin * * 174887Schin * Glenn Fowler <gsf@research.att.com> * 184887Schin * David Korn <dgk@research.att.com> * 194887Schin * Phong Vo <kpv@research.att.com> * 204887Schin * * 214887Schin ***********************************************************************/ 224887Schin #pragma prototyped 234887Schin 244887Schin /* 254887Schin * posix regex compiler 264887Schin */ 274887Schin 284887Schin #include "reglib.h" 294887Schin 304887Schin #if _PACKAGE_ast 314887Schin #include "lclib.h" 324887Schin #endif 334887Schin 344887Schin #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 354887Schin 364887Schin #define C_ESC (-1) 374887Schin #define C_MB (-2) 384887Schin 394887Schin #if _AST_REGEX_DEBUG 404887Schin 414887Schin #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 424887Schin #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 434887Schin #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 444887Schin 454887Schin static unsigned long debug; 464887Schin static unsigned long debug_flag; 474887Schin 484887Schin #else 494887Schin 504887Schin #define DEBUG_INIT() 514887Schin #define DEBUG_TEST(f,y,n) (n) 524887Schin #define DEBUG_CODE(f,y,n) do {n} while(0) 534887Schin 544887Schin #endif 554887Schin 564887Schin #if _PACKAGE_ast 574887Schin 584887Schin typedef struct Cchr_s 594887Schin { 604887Schin Dtlink_t lnk; 614887Schin unsigned char nam[2]; 624887Schin Ckey_t key; 634887Schin } Cchr_t; 644887Schin 654887Schin #endif 664887Schin 674887Schin #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 684887Schin 694887Schin /* 704887Schin * determine whether greedy matching will work, i.e. produce 714887Schin * the best match first. such expressions are "easy", and 724887Schin * need no backtracking once a complete match is found. 734887Schin * if an expression has backreferences or alts it's hard 744887Schin * else if it has only one closure it's easy 754887Schin * else if all closures are simple (i.e. one-character) it's easy 764887Schin * else it's hard. 774887Schin */ 784887Schin 794887Schin typedef struct Stats_s 804887Schin { 814887Schin unsigned long l; /* min length to left of x */ 824887Schin unsigned long k; /* min length to left of y */ 834887Schin unsigned long m; /* min length */ 844887Schin unsigned long n; /* max length */ 854887Schin unsigned short a; /* number of alternations */ 864887Schin unsigned short b; /* number of backrefs */ 874887Schin unsigned short c; /* number of closures */ 88*12068SRoger.Faulkner@Oracle.COM unsigned short e; /* $ */ 894887Schin unsigned short i; /* number of negations */ 904887Schin unsigned short p; /* number of named subexpressions */ 914887Schin unsigned short s; /* number of simple closures */ 924887Schin unsigned short t; /* number of tries */ 934887Schin unsigned short u; /* number of unnamed subexpressions */ 944887Schin Rex_t* x; /* max length REX_STRING */ 954887Schin Rex_t* y; /* max length REX_TRIE */ 964887Schin } Stats_t; 974887Schin 984887Schin typedef struct Token_s 994887Schin { 1004887Schin unsigned long min; 1014887Schin unsigned long max; 1024887Schin short lex; 1034887Schin short len; 1044887Schin short esc; 1054887Schin short att; 1064887Schin short push; 1074887Schin } Token_t; 1084887Schin 1094887Schin typedef struct Cenv_s 1104887Schin { 1114887Schin int delimiter; /* pattern delimiter */ 1124887Schin int error; /* last error */ 1134887Schin int explicit; /* explicit match on this char */ 1144887Schin int mappeddot; /* inverse mapped '.' */ 1154887Schin int mappednewline; /* inverse mapped '\n' */ 1164887Schin int mappedslash; /* inverse mapped '/' */ 1174887Schin regflags_t flags; /* flags arg to regcomp */ 1184887Schin int type; /* BRE,ERE,ARE,SRE,KRE */ 1194887Schin unsigned char* cursor; /* curent point in re */ 1204887Schin unsigned char* pattern; /* the original pattern */ 1214887Schin unsigned char* literal; /* literal restart pattern */ 1224887Schin int parno; /* number of last open paren */ 1234887Schin int parnest; /* paren nest count */ 1244887Schin int posixkludge; /* to make * nonspecial */ 12510898Sroland.mainz@nrubsig.org int regexp; /* <regexp.h> compatibility */ 1264887Schin Token_t token; /* token lookahead */ 1274887Schin Stats_t stats; /* RE statistics */ 1284887Schin int terminator; /* pattern terminator */ 1294887Schin Rex_t* paren[2*(BACK_REF_MAX+2)]; 1304887Schin /* paren[i]!=0 if \i defined */ 1314887Schin regex_t* regex; /* user handle */ 1324887Schin regdisc_t* disc; /* user discipline */ 1334887Schin unsigned char* map; /* external to native ccode map */ 1344887Schin unsigned char* MAP; /* fold and/or map */ 1354887Schin } Cenv_t; 1364887Schin 1374887Schin /* 1384887Schin * allocate a new Rex_t node 1394887Schin */ 1404887Schin 1414887Schin static Rex_t* 1424887Schin node(Cenv_t* env, int type, int lo, int hi, size_t extra) 1434887Schin { 1444887Schin register Rex_t* e; 1454887Schin 1464887Schin if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 1474887Schin { 1484887Schin memset(e, 0, sizeof(Rex_t) + extra); 1494887Schin e->type = type; 1504887Schin e->marked = 0; 1514887Schin e->lo = lo; 1524887Schin e->hi = hi; 1534887Schin e->flags = env->flags; 1544887Schin e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 1554887Schin e->explicit = env->explicit; 1564887Schin if (extra) 1574887Schin e->re.data = (char*)e + sizeof(Rex_t); 1584887Schin } 1594887Schin return e; 1604887Schin } 1614887Schin 1624887Schin /* 1634887Schin * free a Trie_node_t node 1644887Schin */ 1654887Schin 1664887Schin static void 1674887Schin triedrop(regdisc_t* disc, Trie_node_t* e) 1684887Schin { 1694887Schin if (e) 1704887Schin { 1714887Schin triedrop(disc, e->son); 1724887Schin triedrop(disc, e->sib); 1734887Schin alloc(disc, e, 0); 1744887Schin } 1754887Schin } 1764887Schin 1774887Schin /* 1784887Schin * free a Rex_t node 1794887Schin */ 1804887Schin 1814887Schin void 1824887Schin drop(regdisc_t* disc, Rex_t* e) 1834887Schin { 1844887Schin int i; 1854887Schin Rex_t* x; 1864887Schin 1874887Schin if (e && !(disc->re_flags & REG_NOFREE)) 1884887Schin do 1894887Schin { 1904887Schin switch (e->type) 1914887Schin { 1924887Schin case REX_ALT: 1934887Schin case REX_CONJ: 1944887Schin drop(disc, e->re.group.expr.binary.left); 1954887Schin drop(disc, e->re.group.expr.binary.right); 1964887Schin break; 1974887Schin case REX_GROUP: 1984887Schin case REX_GROUP_AHEAD: 1994887Schin case REX_GROUP_AHEAD_NOT: 2004887Schin case REX_GROUP_BEHIND: 2014887Schin case REX_GROUP_BEHIND_NOT: 2024887Schin case REX_GROUP_CUT: 2034887Schin case REX_NEG: 2044887Schin case REX_REP: 2054887Schin drop(disc, e->re.group.expr.rex); 2064887Schin break; 2074887Schin case REX_TRIE: 2084887Schin for (i = 0; i <= UCHAR_MAX; i++) 2094887Schin triedrop(disc, e->re.trie.root[i]); 2104887Schin break; 2114887Schin } 2124887Schin x = e->next; 2134887Schin alloc(disc, e, 0); 2144887Schin } while (e = x); 2154887Schin } 2164887Schin 2174887Schin /* 2184887Schin * mark e and descendants minimal 2194887Schin */ 2204887Schin 2214887Schin static void 2224887Schin mark(register Rex_t* e, int set) 2234887Schin { 2244887Schin if (e && !e->marked) 2254887Schin do 2264887Schin { 2274887Schin e->marked = 1; 2284887Schin if (set) 2294887Schin e->flags |= REG_MINIMAL; 2304887Schin else 2314887Schin e->flags &= ~REG_MINIMAL; 2324887Schin switch (e->type) 2334887Schin { 2344887Schin case REX_ALT: 2354887Schin case REX_CONJ: 2364887Schin case REX_GROUP_COND: 2374887Schin if (e->re.group.expr.binary.left) 2384887Schin mark(e->re.group.expr.binary.left, set); 2394887Schin if (e->re.group.expr.binary.right) 2404887Schin mark(e->re.group.expr.binary.right, set); 2414887Schin break; 2424887Schin case REX_GROUP: 2434887Schin case REX_GROUP_AHEAD: 2444887Schin case REX_GROUP_AHEAD_NOT: 2454887Schin case REX_GROUP_BEHIND: 2464887Schin case REX_GROUP_BEHIND_NOT: 2474887Schin case REX_GROUP_CUT: 2484887Schin case REX_NEG: 2494887Schin case REX_REP: 2504887Schin case REX_TRIE: 2514887Schin mark(e->re.group.expr.rex, set); 2524887Schin break; 2534887Schin } 2544887Schin } while (e = e->next); 2554887Schin } 2564887Schin 2574887Schin /* 2584887Schin * assign subexpression numbers by a preorder tree walk 2594887Schin */ 2604887Schin 2614887Schin static int 2624887Schin serialize(Cenv_t* env, Rex_t* e, int n) 2634887Schin { 2644887Schin do 2654887Schin { 2664887Schin e->serial = n++; 2674887Schin switch (e->type) 2684887Schin { 2694887Schin case REX_ALT: 2704887Schin case REX_GROUP_COND: 2714887Schin if (e->re.group.expr.binary.left) 2724887Schin n = serialize(env, e->re.group.expr.binary.left, n); 2734887Schin e->re.group.expr.binary.serial = n++; 2744887Schin if (e->re.group.expr.binary.right) 2754887Schin n = serialize(env, e->re.group.expr.binary.right, n); 2764887Schin break; 2774887Schin case REX_CONJ: 2784887Schin n = serialize(env, e->re.group.expr.binary.left, n); 2794887Schin n = serialize(env, e->re.group.expr.binary.right, n); 2804887Schin break; 2814887Schin case REX_GROUP: 2824887Schin case REX_GROUP_AHEAD: 2834887Schin case REX_GROUP_AHEAD_NOT: 2844887Schin case REX_GROUP_BEHIND: 2854887Schin case REX_GROUP_BEHIND_NOT: 2864887Schin case REX_GROUP_CUT: 2874887Schin case REX_NEG: 2884887Schin case REX_REP: 2894887Schin n = serialize(env, e->re.group.expr.rex, n); 2904887Schin break; 2914887Schin } 2924887Schin } while (e = e->next); 2934887Schin return n; 2944887Schin } 2954887Schin 2964887Schin /* 2974887Schin * catenate e and f into a sequence, collapsing them if possible 2984887Schin */ 2994887Schin 3004887Schin static Rex_t* 3014887Schin cat(Cenv_t* env, Rex_t* e, Rex_t* f) 3024887Schin { 3034887Schin Rex_t* g; 3044887Schin 3054887Schin if (!f) 3064887Schin { 3074887Schin drop(env->disc, e); 3084887Schin return 0; 3094887Schin } 3104887Schin if (e->type == REX_NULL) 3114887Schin { 3124887Schin drop(env->disc, e); 3134887Schin return f; 3144887Schin } 3154887Schin if (f->type == REX_NULL) 3164887Schin { 3174887Schin g = f->next; 3184887Schin f->next = 0; 3194887Schin drop(env->disc, f); 3204887Schin f = g; 3214887Schin } 3224887Schin else if (e->type == REX_DOT && f->type == REX_DOT) 3234887Schin { 3244887Schin unsigned int m = e->lo + f->lo; 3254887Schin unsigned int n = e->hi + f->hi; 3264887Schin 3274887Schin if (m <= RE_DUP_MAX) 3284887Schin { 3294887Schin if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 3304887Schin { 3314887Schin n = RE_DUP_INF; 3324887Schin goto combine; 3334887Schin } 3344887Schin else if (n <= RE_DUP_MAX) 3354887Schin { 3364887Schin combine: 3374887Schin e->lo = m; 3384887Schin e->hi = n; 3394887Schin g = f->next; 3404887Schin f->next = 0; 3414887Schin drop(env->disc, f); 3424887Schin f = g; 3434887Schin } 3444887Schin } 3454887Schin } 3464887Schin e->next = f; 3474887Schin return e; 3484887Schin } 3494887Schin 3504887Schin /* 3514887Schin * collect re statistics 3524887Schin */ 3534887Schin 3544887Schin static int 3554887Schin stats(register Cenv_t* env, register Rex_t* e) 3564887Schin { 3574887Schin register unsigned long n; 3584887Schin register unsigned long m; 3594887Schin unsigned long cm; 3604887Schin unsigned long nm; 3614887Schin unsigned long cn; 3624887Schin unsigned long nn; 3634887Schin unsigned long l; 3644887Schin unsigned long k; 3654887Schin unsigned long t; 3664887Schin Rex_t* q; 3674887Schin Rex_t* x; 3684887Schin Rex_t* y; 3694887Schin unsigned char c; 3704887Schin unsigned char b; 3714887Schin 3724887Schin do 3734887Schin { 3744887Schin switch (e->type) 3754887Schin { 3764887Schin case REX_ALT: 3774887Schin x = env->stats.x; 3784887Schin l = env->stats.l; 3794887Schin y = env->stats.y; 3804887Schin k = env->stats.k; 3814887Schin t = env->stats.t; 3824887Schin if (++env->stats.a <= 0) 3834887Schin return 1; 3844887Schin cm = env->stats.m; 3854887Schin env->stats.m = 0; 3864887Schin cn = env->stats.n; 3874887Schin env->stats.n = 0; 3884887Schin if (stats(env, e->re.group.expr.binary.left)) 3894887Schin return 1; 3904887Schin m = env->stats.m; 3914887Schin env->stats.m = 0; 3924887Schin n = env->stats.n; 3934887Schin env->stats.n = 0; 3944887Schin if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 3954887Schin return 1; 3964887Schin if (env->stats.m > m) 3974887Schin env->stats.m = m; 3984887Schin else 3994887Schin m = env->stats.m; 4004887Schin if ((env->stats.m += cm) < m) 4014887Schin return 1; 4024887Schin if (env->stats.n < n) 4034887Schin env->stats.n = n; 4044887Schin else 4054887Schin n = env->stats.n; 4064887Schin if ((env->stats.n += cn) < n) 4074887Schin return 1; 4084887Schin env->stats.x = x; 4094887Schin env->stats.l = l; 4104887Schin env->stats.y = y; 4114887Schin env->stats.k = k; 4124887Schin env->stats.t = t; 4134887Schin break; 4144887Schin case REX_BACK: 4154887Schin if (++env->stats.b <= 0) 4164887Schin return 1; 4174887Schin break; 4184887Schin case REX_CLASS: 4194887Schin case REX_COLL_CLASS: 4204887Schin case REX_DOT: 4214887Schin case REX_ONECHAR: 4224887Schin n = env->stats.m; 4234887Schin if ((env->stats.m += e->lo) < n) 4244887Schin return 1; 4254887Schin if (e->hi != RE_DUP_INF) 4264887Schin { 4274887Schin n = env->stats.n; 4284887Schin if ((env->stats.n += e->hi) < n) 4294887Schin return 1; 4304887Schin } 4314887Schin if (e->lo != e->hi) 4324887Schin { 4334887Schin if (++env->stats.c <= 0) 4344887Schin return 1; 4354887Schin if (++env->stats.s <= 0) 4364887Schin return 1; 4374887Schin } 4384887Schin break; 4394887Schin case REX_CONJ: 4404887Schin cm = env->stats.m; 4414887Schin env->stats.m = 0; 4424887Schin cn = env->stats.n; 4434887Schin env->stats.n = 0; 4444887Schin if (stats(env, e->re.group.expr.binary.left)) 4454887Schin return 1; 4464887Schin nm = env->stats.m; 4474887Schin env->stats.m = 0; 4484887Schin nn = env->stats.n; 4494887Schin env->stats.n = 0; 4504887Schin if (stats(env, e->re.group.expr.binary.right)) 4514887Schin return 1; 4524887Schin if (env->stats.m < nm) 4534887Schin env->stats.m = nm; 4544887Schin else 4554887Schin nm = env->stats.m; 4564887Schin if ((env->stats.m += cm) < nm) 4574887Schin return 1; 4584887Schin if (env->stats.n < nn) 4594887Schin env->stats.n = nn; 4604887Schin else 4614887Schin nn = env->stats.n; 4624887Schin if ((env->stats.n += cn) < nn) 4634887Schin return 1; 4644887Schin break; 465*12068SRoger.Faulkner@Oracle.COM case REX_END: 466*12068SRoger.Faulkner@Oracle.COM env->stats.e = 1; 467*12068SRoger.Faulkner@Oracle.COM break; 4684887Schin case REX_GROUP: 4694887Schin if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 4704887Schin return 1; 4714887Schin if (stats(env, e->re.group.expr.rex)) 4724887Schin return 1; 4734887Schin break; 4744887Schin case REX_GROUP_AHEAD: 4754887Schin case REX_GROUP_AHEAD_NOT: 4764887Schin case REX_GROUP_BEHIND: 4774887Schin case REX_GROUP_BEHIND_NOT: 4784887Schin m = env->stats.m; 4794887Schin n = env->stats.n; 4804887Schin x = env->stats.x; 4814887Schin y = env->stats.y; 4824887Schin if (stats(env, e->re.group.expr.rex)) 4834887Schin return 1; 4844887Schin env->stats.m = m; 4854887Schin env->stats.n = n; 4864887Schin env->stats.x = x; 4874887Schin env->stats.y = y; 4884887Schin switch (e->type) 4894887Schin { 4904887Schin case REX_GROUP_AHEAD: 4914887Schin case REX_GROUP_BEHIND: 4924887Schin if (++env->stats.u <= 0) 4934887Schin return 1; 4944887Schin break; 4954887Schin } 4964887Schin break; 4974887Schin case REX_GROUP_COND: 4984887Schin if (++env->stats.u <= 0) 4994887Schin return 1; 5004887Schin m = env->stats.m; 5014887Schin n = env->stats.n; 5024887Schin x = env->stats.x; 5034887Schin y = env->stats.y; 5044887Schin if (e->re.group.size > 0 && ++env->stats.b <= 0) 5054887Schin return 1; 5064887Schin if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 5074887Schin return 1; 5084887Schin if (q = e->re.group.expr.binary.right) 5094887Schin { 5104887Schin if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 5114887Schin return 1; 5124887Schin if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 5134887Schin return 1; 5144887Schin } 5154887Schin env->stats.m = m; 5164887Schin env->stats.n = n; 5174887Schin env->stats.x = x; 5184887Schin env->stats.y = y; 5194887Schin break; 5204887Schin case REX_GROUP_CUT: 5214887Schin if (++env->stats.u <= 0) 5224887Schin return 1; 5234887Schin m = env->stats.m; 5244887Schin n = env->stats.n; 5254887Schin x = env->stats.x; 5264887Schin y = env->stats.y; 5274887Schin if (stats(env, e->re.group.expr.rex)) 5284887Schin return 1; 5294887Schin env->stats.m = m; 5304887Schin env->stats.n = n; 5314887Schin env->stats.x = x; 5324887Schin env->stats.y = y; 5334887Schin break; 5344887Schin case REX_NEG: 5354887Schin env->stats.i++; 5364887Schin x = env->stats.x; 5374887Schin l = env->stats.l; 5384887Schin y = env->stats.y; 5394887Schin k = env->stats.k; 5404887Schin t = env->stats.t; 5414887Schin cm = env->stats.m; 5424887Schin env->stats.m = 0; 5434887Schin if (stats(env, e->re.group.expr.rex)) 5444887Schin return 1; 5454887Schin env->stats.m = !env->stats.m; 5464887Schin if ((env->stats.m += cm) < cm) 5474887Schin return 1; 5484887Schin env->stats.x = x; 5494887Schin env->stats.l = l; 5504887Schin env->stats.y = y; 5514887Schin env->stats.k = k; 5524887Schin env->stats.t = t; 5534887Schin break; 5544887Schin case REX_REP: 5554887Schin x = env->stats.x; 5564887Schin l = env->stats.l; 5574887Schin y = env->stats.y; 5584887Schin k = env->stats.k; 5594887Schin t = env->stats.t; 5604887Schin if (++env->stats.c <= 0) 5614887Schin return 1; 5624887Schin b = env->stats.b; 5634887Schin c = env->stats.c; 5644887Schin cm = env->stats.m; 5654887Schin env->stats.m = 0; 5664887Schin if (stats(env, e->re.group.expr.rex)) 5674887Schin return 1; 5684887Schin if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 5694887Schin return 1; 5704887Schin if (e->lo < 1) 5714887Schin { 5724887Schin env->stats.x = x; 5734887Schin env->stats.l = l; 5744887Schin env->stats.y = y; 5754887Schin env->stats.k = k; 5764887Schin env->stats.t = t; 5774887Schin env->stats.m = cm; 5784887Schin } 5794887Schin else 5804887Schin { 5814887Schin m = env->stats.m; 5824887Schin if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 5834887Schin return 1; 5844887Schin m = env->stats.m; 5854887Schin if ((env->stats.m += cm) < m) 5864887Schin return 1; 5874887Schin if (env->stats.x != x) 5884887Schin env->stats.l = cm; 5894887Schin if (env->stats.y != y) 5904887Schin env->stats.k = cm; 5914887Schin } 5924887Schin break; 5934887Schin case REX_STRING: 5948462SApril.Chin@Sun.COM if (!e->map) 5954887Schin { 5968462SApril.Chin@Sun.COM cm = env->stats.m; 5978462SApril.Chin@Sun.COM if ((env->stats.m += e->re.string.size) < cm) 5988462SApril.Chin@Sun.COM return 1; 5998462SApril.Chin@Sun.COM cn = env->stats.n; 6008462SApril.Chin@Sun.COM if ((env->stats.n += e->re.string.size) < cn) 6018462SApril.Chin@Sun.COM return 1; 6028462SApril.Chin@Sun.COM if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 6038462SApril.Chin@Sun.COM { 6048462SApril.Chin@Sun.COM env->stats.x = e; 6058462SApril.Chin@Sun.COM env->stats.l = cm; 6068462SApril.Chin@Sun.COM } 6074887Schin } 6084887Schin break; 6094887Schin case REX_TRIE: 6104887Schin if (++env->stats.s <= 0) 6114887Schin return 1; 6124887Schin cm = env->stats.m; 6134887Schin if ((env->stats.m += e->re.trie.min) < cm) 6144887Schin return 1; 6154887Schin cn = env->stats.n; 6164887Schin if ((env->stats.n += e->re.trie.max) < cn) 6174887Schin return 1; 6184887Schin env->stats.t++; 6194887Schin if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 6204887Schin { 6214887Schin env->stats.y = e; 6224887Schin env->stats.k = cm; 6234887Schin } 6244887Schin break; 6254887Schin } 6264887Schin } while (e = e->next); 6274887Schin return 0; 6284887Schin } 6294887Schin 6304887Schin static int token(Cenv_t*); 6314887Schin 6324887Schin static int 6334887Schin magic(register Cenv_t* env, register int c, int escaped) 6344887Schin { 6354887Schin register char* sp; 6364887Schin register int n; 6374887Schin int o = c; 6384887Schin int e = env->error; 6394887Schin int l = env->token.len; 6404887Schin short* mp; 6414887Schin char* ep; 6424887Schin 6434887Schin if (mp = state.magic[c]) 6444887Schin { 6454887Schin c = mp[env->type+escaped]; 6464887Schin if (c >= T_META) 6474887Schin { 6484887Schin sp = (char*)env->cursor + env->token.len; 6494887Schin switch (c) 6504887Schin { 6514887Schin case T_LEFT: 6524887Schin n = 0; 6534887Schin ep = sp; 6544887Schin while (*sp >= '0' && *sp <= '9') 6554887Schin { 6564887Schin if (n > (INT_MAX / 10)) 6574887Schin { 6584887Schin env->error = REG_BADBR; 6594887Schin goto bad; 6604887Schin } 6614887Schin n = n * 10 + *sp++ - '0'; 6624887Schin } 6634887Schin if (sp == ep) 6644887Schin { 6658462SApril.Chin@Sun.COM if (env->type < SRE || *sp != ',') 6668462SApril.Chin@Sun.COM { 6678462SApril.Chin@Sun.COM env->error = *sp ? REG_BADBR : REG_EBRACE; 6688462SApril.Chin@Sun.COM goto bad; 6698462SApril.Chin@Sun.COM } 6704887Schin } 6714887Schin else if (n > RE_DUP_MAX) 6724887Schin { 6734887Schin env->error = REG_BADBR; 6744887Schin goto bad; 6754887Schin } 6764887Schin env->token.min = n; 6774887Schin if (*sp == ',') 6784887Schin { 6794887Schin n = 0; 6804887Schin ep = ++sp; 6814887Schin while (*sp >= '0' && *sp <= '9') 6824887Schin { 6834887Schin if (n > (INT_MAX / 10)) 6844887Schin { 6854887Schin env->error = REG_BADBR; 6864887Schin goto bad; 6874887Schin } 6884887Schin n = n * 10 + *sp++ - '0'; 6894887Schin } 6904887Schin if (sp == ep) 6914887Schin n = RE_DUP_INF; 6924887Schin else if (n < env->token.min) 6934887Schin { 6944887Schin env->error = REG_BADBR; 6954887Schin goto bad; 6964887Schin } 6974887Schin } 6984887Schin env->token.max = n; 6994887Schin switch (*sp) 7004887Schin { 7014887Schin case 0: 7024887Schin env->error = REG_EBRACE; 7034887Schin goto bad; 7044887Schin case '\\': 7054887Schin if (!escaped) 7064887Schin { 7074887Schin env->error = REG_BADBR; 7084887Schin goto bad; 7094887Schin } 7104887Schin sp++; 7114887Schin break; 7124887Schin default: 7134887Schin if (escaped) 7144887Schin { 7154887Schin env->error = REG_BADBR; 7164887Schin goto bad; 7174887Schin } 7184887Schin break; 7194887Schin } 7204887Schin switch (*sp++) 7214887Schin { 7224887Schin case 0: 7234887Schin env->error = REG_EBRACE; 7244887Schin goto bad; 7254887Schin case '}': 7264887Schin break; 7274887Schin default: 7284887Schin env->error = REG_BADBR; 7294887Schin goto bad; 7304887Schin } 7314887Schin env->token.len = sp - (char*)env->cursor; 7324887Schin break; 7334887Schin case T_RIGHT: 7344887Schin env->error = REG_EBRACE; 7354887Schin goto bad; 7364887Schin case T_OPEN: 7374887Schin if (env->type < SRE && *sp == '?') 7384887Schin { 7394887Schin env->token.len++; 7404887Schin env->token.lex = 0; 7414887Schin goto group; 7424887Schin } 7434887Schin break; 7444887Schin case T_ESCAPE: 7454887Schin c = chresc(sp - 2, &ep); 7464887Schin if (ep < sp) 7474887Schin goto bad; 7484887Schin env->token.len += ep - sp; 7494887Schin if (c >= T_META) 7504887Schin { 7514887Schin env->token.lex = c; 7524887Schin c = C_ESC; 7534887Schin } 7544887Schin return c; 7554887Schin case T_BACK+0: 7564887Schin case T_BACK+1: 7574887Schin case T_BACK+2: 7584887Schin case T_BACK+3: 7594887Schin case T_BACK+4: 7604887Schin case T_BACK+5: 7614887Schin case T_BACK+6: 7624887Schin case T_BACK+7: 7634887Schin n = chresc(sp - 2, &ep); 7644887Schin if (ep > sp + 1) 7654887Schin { 7664887Schin env->token.len += ep - sp; 7674887Schin return n; 7684887Schin } 7694887Schin /*FALLTHROUGH*/ 7704887Schin case T_BACK+8: 7714887Schin case T_BACK+9: 7724887Schin if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) 7734887Schin { 7744887Schin env->error = REG_BADESC; 7754887Schin goto bad; 7764887Schin } 7774887Schin if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 7784887Schin { 7794887Schin c = (c - T_BACK) * 10 + (*sp - '0'); 7804887Schin if (c > 0 && c <= env->parno && env->paren[c]) 7814887Schin c += T_BACK; 7824887Schin else 7834887Schin c = chresc(sp - 2, &ep); 7844887Schin env->token.len++; 7854887Schin } 7864887Schin if (c == T_BACK) 7874887Schin c = 0; 7884887Schin break; 7894887Schin case T_BAD: 7904887Schin if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) 7914887Schin return c; 7924887Schin goto bad; 7934887Schin } 7944887Schin if (env->type >= SRE) 7954887Schin { 7964887Schin if (c == T_DOT) 7974887Schin c = '.'; 7984887Schin else if (c < T_OPEN) 7994887Schin { 8004887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 8014887Schin { 8024887Schin env->token.len++; 8034887Schin env->token.att = 1; 8044887Schin } 8054887Schin if (env->type == KRE && *(env->cursor + env->token.len) == '(') 8064887Schin { 8074887Schin env->token.len++; 8084887Schin switch (c) 8094887Schin { 8104887Schin case T_AT: 8114887Schin break; 8124887Schin case T_PERCENT: 8134887Schin env->token.lex = c; 8144887Schin goto group; 8154887Schin case T_TILDE: 8164887Schin env->token.lex = 0; 8174887Schin goto group; 8184887Schin default: 8194887Schin env->token.lex = c; 8204887Schin break; 8214887Schin } 8224887Schin c = T_OPEN; 8234887Schin } 8244887Schin else if (c == T_STAR) 8254887Schin c = T_DOTSTAR; 8264887Schin else if (c == T_QUES) 8274887Schin c = T_DOT; 8284887Schin else 8294887Schin { 8304887Schin c = o; 8314887Schin env->token.len = l; 8324887Schin } 8334887Schin } 8344887Schin else if (c > T_BACK) 8354887Schin { 8364887Schin c = (c - T_BACK) * 2 - 1; 8374887Schin c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 8384887Schin } 8394887Schin else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 8404887Schin { 8414887Schin if (c == T_AND) 8424887Schin c = '&'; 8434887Schin else if (c == T_BAR) 8444887Schin c = '|'; 8454887Schin else if (c == T_OPEN) 8464887Schin c = '('; 8474887Schin } 8484887Schin } 8494887Schin } 8504887Schin } 8514887Schin else if (escaped == 2) 8524887Schin { 8534887Schin if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 8544887Schin /*ok*/; 8554887Schin else 8564887Schin { 8574887Schin env->error = REG_BADESC; 8584887Schin goto bad; 8594887Schin } 8604887Schin } 8614887Schin else if (escaped && !(env->flags & REG_LENIENT) && c != ']') 8624887Schin { 8634887Schin env->error = REG_BADESC; 8644887Schin goto bad; 8654887Schin } 8664887Schin return c; 8674887Schin group: 8684887Schin sp = (char*)env->cursor + env->token.len; 8694887Schin switch (*sp++) 8704887Schin { 8714887Schin case ')': 8724887Schin break; 8734887Schin case '#': 8744887Schin for (;;) 8754887Schin { 8764887Schin switch (*sp++) 8774887Schin { 8784887Schin case 0: 8794887Schin env->error = REG_EPAREN; 8804887Schin return T_BAD; 8814887Schin case ')': 8824887Schin break; 8834887Schin default: 8844887Schin continue; 8854887Schin } 8864887Schin break; 8874887Schin } 8884887Schin break; 8894887Schin default: 8904887Schin return T_GROUP; 8914887Schin } 8924887Schin env->cursor = (unsigned char*)sp; 8934887Schin return token(env); 8944887Schin bad: 8954887Schin if (escaped == 2) 8964887Schin env->error = e; 8974887Schin else if (env->flags & REG_LENIENT) 8984887Schin return o; 8994887Schin else if (escaped == 1 && !env->error) 9004887Schin { 9014887Schin if (mp || o == ']') 9024887Schin return o; 9034887Schin env->error = REG_BADESC; 9044887Schin } 9054887Schin return T_BAD; 9064887Schin } 9074887Schin 9084887Schin static int 9094887Schin token(register Cenv_t* env) 9104887Schin { 9114887Schin int c; 9124887Schin int posixkludge; 9134887Schin 9144887Schin if (env->token.push) 9154887Schin return env->token.lex; 9164887Schin env->token.att = env->token.esc = 0; 9174887Schin if ((env->token.len = MBSIZE(env->cursor)) > 1) 9184887Schin return env->token.lex = C_MB; 9194887Schin env->token.lex = 0; 9204887Schin for (;;) 9214887Schin { 9224887Schin c = *env->cursor; 9234887Schin if (c == 0 || c == env->delimiter || c == env->terminator) 9244887Schin return T_END; 9254887Schin if (!(env->flags & REG_COMMENT)) 9264887Schin break; 9274887Schin if (c == '#') 9284887Schin { 9294887Schin do 9304887Schin { 9314887Schin c = *++env->cursor; 9324887Schin if (c == 0 || c == env->delimiter) 9334887Schin return T_END; 9344887Schin } while (c != '\n'); 9354887Schin } 9364887Schin else if (!isspace(c)) 9374887Schin break; 9384887Schin env->cursor++; 9394887Schin } 9404887Schin if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 9414887Schin { 9424887Schin if (env->parnest) 9434887Schin { 9444887Schin env->error = REG_EPAREN; 9454887Schin return T_BAD; 9464887Schin } 9474887Schin env->parno = 0; 9484887Schin env->pattern = env->cursor + 1; 9494887Schin return T_BAR; 9504887Schin } 9514887Schin if (env->flags & REG_LITERAL) 9524887Schin return c; 9534887Schin if (posixkludge = env->posixkludge) 9544887Schin { 9554887Schin env->posixkludge = 0; 9564887Schin if (c == '*') 9574887Schin return c; 9584887Schin } 9594887Schin if (c == '\\') 9604887Schin { 9614887Schin if (env->flags & REG_SHELL_ESCAPED) 9624887Schin return c; 9634887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator) 9644887Schin { 9654887Schin if (env->flags & REG_LENIENT) 9664887Schin { 9674887Schin if (c) 9684887Schin { 9694887Schin env->token.esc = env->token.len; 9704887Schin env->token.len += MBSIZE(env->cursor + 1); 9714887Schin return c; 9724887Schin } 9734887Schin return '\\'; 9744887Schin } 9754887Schin env->error = REG_EESCAPE; 9764887Schin return T_BAD; 9774887Schin } 9784887Schin env->token.esc = env->token.len; 9794887Schin env->token.len += MBSIZE(env->cursor + 1); 9804887Schin if (env->delimiter && c == 'n') 9814887Schin return '\n'; 9824887Schin else if (c == env->delimiter) 9834887Schin return magic(env, c, 0); 9844887Schin else if (c == '(' && env->type == BRE) 9854887Schin env->posixkludge = 1; 9864887Schin else if (c == ')' && env->type == BRE && env->parnest <= 0) 9874887Schin { 9884887Schin env->error = REG_EPAREN; 9894887Schin return T_BAD; 9904887Schin } 9914887Schin else if (isspace(c) && (env->flags & REG_COMMENT)) 9924887Schin return c; 9934887Schin return magic(env, c, 1); 9944887Schin } 9954887Schin else if (c == '$') 9964887Schin { 9974887Schin if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 9984887Schin return T_DOLL; 9994887Schin } 10004887Schin else if (c == '^') 10014887Schin { 1002*12068SRoger.Faulkner@Oracle.COM if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1)) 10034887Schin { 1004*12068SRoger.Faulkner@Oracle.COM env->posixkludge = 2; 10054887Schin return T_CFLX; 10064887Schin } 10074887Schin } 10084887Schin else if (c == ')') 10094887Schin { 10104887Schin if (env->type != BRE && env->parnest <= 0) 10114887Schin return c; 10124887Schin } 10134887Schin else if (c == '/' && env->explicit == env->mappedslash) 10144887Schin { 10154887Schin while (*(env->cursor + env->token.len) == c) 10164887Schin env->token.len++; 10174887Schin return T_SLASHPLUS; 10184887Schin } 10194887Schin return magic(env, c, 0); 10204887Schin } 10214887Schin 10224887Schin static Celt_t* 10234887Schin col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 10244887Schin { 10258462SApril.Chin@Sun.COM register char* s; 10264887Schin register unsigned char* k; 10274887Schin register unsigned char* e; 10284887Schin register int c; 10294887Schin register int cc; 10304887Schin int bt; 10314887Schin int et; 10324887Schin Ckey_t key; 10334887Schin 10344887Schin cc = 0; 10354887Schin for (;;) 10364887Schin { 10374887Schin k = key; 10384887Schin if (bw == 1) 10394887Schin { 10404887Schin c = bc; 10414887Schin if (ic) 10424887Schin { 10438462SApril.Chin@Sun.COM if (isupper(c)) 10448462SApril.Chin@Sun.COM { 10458462SApril.Chin@Sun.COM c = tolower(c); 10468462SApril.Chin@Sun.COM cc = -1; 10478462SApril.Chin@Sun.COM } 10488462SApril.Chin@Sun.COM else if (islower(c)) 10498462SApril.Chin@Sun.COM { 10508462SApril.Chin@Sun.COM c = toupper(c); 10518462SApril.Chin@Sun.COM cc = -1; 10528462SApril.Chin@Sun.COM } 10538462SApril.Chin@Sun.COM } 10548462SApril.Chin@Sun.COM *k++ = c; 10558462SApril.Chin@Sun.COM } 10568462SApril.Chin@Sun.COM else if (bw < COLL_KEY_MAX) 10578462SApril.Chin@Sun.COM { 10588462SApril.Chin@Sun.COM s = (char*)bp; 10598462SApril.Chin@Sun.COM if (ic) 10608462SApril.Chin@Sun.COM { 10618462SApril.Chin@Sun.COM c = mbchar(s); 10624887Schin if (iswupper(c)) 10634887Schin { 10644887Schin c = towlower(c); 10654887Schin cc = 1; 10664887Schin } 10674887Schin else if (iswlower(c)) 10684887Schin { 10694887Schin c = towupper(c); 10704887Schin cc = 1; 10714887Schin } 10724887Schin } 10738462SApril.Chin@Sun.COM if (cc > 0) 10744887Schin { 10758462SApril.Chin@Sun.COM cc = -1; 1076*12068SRoger.Faulkner@Oracle.COM k += mbconv((char*)k, c); 10774887Schin } 10788462SApril.Chin@Sun.COM else 10798462SApril.Chin@Sun.COM for (e = k + bw; k < e; *k++ = *s++); 10804887Schin } 10814887Schin *k = 0; 10824887Schin mbxfrm(ce->beg, key, COLL_KEY_MAX); 10834887Schin if (ep) 10844887Schin { 10854887Schin k = key; 10864887Schin c = mbchar(k); 10874887Schin if (iswupper(c)) 10884887Schin bt = COLL_range_uc; 10894887Schin else if (iswlower(c)) 10904887Schin bt = COLL_range_lc; 10914887Schin else 10924887Schin bt = COLL_range; 10934887Schin k = key; 10944887Schin if (ew == 1) 10954887Schin { 10964887Schin c = ec; 10974887Schin if (ic) 10984887Schin { 10998462SApril.Chin@Sun.COM if (isupper(c)) 11008462SApril.Chin@Sun.COM { 11018462SApril.Chin@Sun.COM c = tolower(c); 11028462SApril.Chin@Sun.COM cc = -1; 11038462SApril.Chin@Sun.COM } 11048462SApril.Chin@Sun.COM else if (islower(c)) 11058462SApril.Chin@Sun.COM { 11068462SApril.Chin@Sun.COM c = toupper(c); 11078462SApril.Chin@Sun.COM cc = -1; 11088462SApril.Chin@Sun.COM } 11098462SApril.Chin@Sun.COM } 11108462SApril.Chin@Sun.COM *k++ = c; 11118462SApril.Chin@Sun.COM } 11128462SApril.Chin@Sun.COM else if (ew < COLL_KEY_MAX) 11138462SApril.Chin@Sun.COM { 11148462SApril.Chin@Sun.COM s = (char*)ep; 11158462SApril.Chin@Sun.COM if (ic) 11168462SApril.Chin@Sun.COM { 11178462SApril.Chin@Sun.COM c = mbchar(s); 11184887Schin if (iswupper(c)) 11194887Schin { 11204887Schin c = towlower(c); 11214887Schin cc = 1; 11224887Schin } 11234887Schin else if (iswlower(c)) 11244887Schin { 11254887Schin c = towupper(c); 11264887Schin cc = 1; 11274887Schin } 11284887Schin } 11298462SApril.Chin@Sun.COM if (cc > 0) 11304887Schin { 11318462SApril.Chin@Sun.COM cc = -1; 1132*12068SRoger.Faulkner@Oracle.COM k += mbconv((char*)k, c); 11334887Schin } 11348462SApril.Chin@Sun.COM else 11358462SApril.Chin@Sun.COM for (e = k + ew; k < e; *k++ = *s++); 11364887Schin } 11374887Schin *k = 0; 11384887Schin mbxfrm(ce->end, key, COLL_KEY_MAX); 11394887Schin k = key; 11404887Schin c = mbchar(k); 11414887Schin if (iswupper(c)) 11424887Schin et = COLL_range_uc; 11434887Schin else if (iswlower(c)) 11444887Schin et = COLL_range_lc; 11454887Schin else 11464887Schin et = COLL_range; 11474887Schin ce->typ = bt == et ? bt : COLL_range; 11484887Schin } 11494887Schin else 11504887Schin ce->typ = COLL_char; 11514887Schin ce++; 11524887Schin if (!ic || !cc) 11534887Schin break; 11544887Schin ic = 0; 11554887Schin } 11564887Schin return ce; 11574887Schin } 11584887Schin 11594887Schin static Rex_t* 11604887Schin bra(Cenv_t* env) 11614887Schin { 11624887Schin Rex_t* e; 11634887Schin int c; 11644887Schin int i; 11654887Schin int w; 11664887Schin int neg; 11674887Schin int last; 11684887Schin int inrange; 11694887Schin int complicated; 11704887Schin int collate; 11714887Schin int elements; 11724887Schin unsigned char* first; 11734887Schin unsigned char* start; 11744887Schin unsigned char* begin; 11754887Schin unsigned char* s; 11764887Schin regclass_t f; 11774887Schin unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 11784887Schin #if _PACKAGE_ast 11794887Schin int ic; 11804887Schin char mbc[COLL_KEY_MAX + 1]; 11814887Schin #endif 11824887Schin 11834887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 11844887Schin return 0; 11854887Schin collate = complicated = elements = 0; 11864887Schin if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 11874887Schin { 11884887Schin env->cursor++; 11894887Schin neg = 1; 11904887Schin } 11914887Schin else 11924887Schin neg = 0; 11938462SApril.Chin@Sun.COM first = env->cursor; 11948462SApril.Chin@Sun.COM start = first + MBSIZE(first); 11954887Schin if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 11964887Schin goto error; 11974887Schin begin = env->cursor + MBSIZE(env->cursor); 11984887Schin 11994887Schin /* 12004887Schin * inrange: 0=no, 1=possibly, 2=definitely 12014887Schin */ 12024887Schin 12034887Schin inrange = 0; 12044887Schin for (;;) 12054887Schin { 1206*12068SRoger.Faulkner@Oracle.COM if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE)) 12074887Schin goto error; 12084887Schin env->cursor += (w = MBSIZE(env->cursor)); 1209*12068SRoger.Faulkner@Oracle.COM if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) 12104887Schin { 12114887Schin if (*env->cursor) 12124887Schin { 12134887Schin if (*env->cursor == 'n') 12144887Schin { 12154887Schin env->cursor++; 12164887Schin c = '\n'; 12174887Schin } 12184887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 12194887Schin { 12204887Schin env->token.len = 1; 12214887Schin w = magic(env, *env->cursor, 2); 12224887Schin if (env->token.len > 1 || w != T_BAD) 12234887Schin { 12244887Schin if (env->token.len == 1 && (f = classfun(w))) 12254887Schin { 12264887Schin if (inrange > 1) 12274887Schin { 12284887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 12294887Schin goto erange; 12304887Schin inrange = 0; 12314887Schin } 12324887Schin env->cursor++; 12334887Schin for (c = 0; c <= UCHAR_MAX; c++) 12344887Schin if ((*f)(c)) 12354887Schin setadd(e->re.charclass, c); 12364887Schin complicated++; 12374887Schin elements++; 12384887Schin continue; 12394887Schin } 12404887Schin if (env->token.len > 1 || w >= 0 && w < T_META) 12414887Schin { 12424887Schin c = w; 12434887Schin if (c > UCHAR_MAX) 12444887Schin { 12454887Schin if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) 12464887Schin goto erange; 12474887Schin c = UCHAR_MAX; 12484887Schin } 12494887Schin env->cursor += env->token.len; 12504887Schin } 12514887Schin } 12524887Schin } 12534887Schin } 12544887Schin } 12554887Schin else if (c == ']') 12564887Schin { 12574887Schin if (env->cursor == begin) 12584887Schin { 12594887Schin last = c; 12604887Schin inrange = 1; 12614887Schin continue; 12624887Schin } 12634887Schin if (inrange != 0) 12644887Schin { 12654887Schin setadd(e->re.charclass, last); 12664887Schin elements++; 12674887Schin if (inrange == 2) 12684887Schin { 12694887Schin setadd(e->re.charclass, '-'); 12704887Schin elements++; 12714887Schin } 12724887Schin } 12734887Schin break; 12744887Schin } 12754887Schin else if (c == '-') 12764887Schin { 12774887Schin if (!inrange && env->cursor != begin && *env->cursor != ']') 12784887Schin { 12794887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 12804887Schin goto erange; 12814887Schin continue; 12824887Schin } 12834887Schin else if (inrange == 1) 12844887Schin { 12854887Schin inrange = 2; 12864887Schin complicated++; 12874887Schin continue; 12884887Schin } 12894887Schin } 12904887Schin else if (c == '[') 12914887Schin { 12924887Schin switch (*env->cursor) 12934887Schin { 12944887Schin case 0: 12954887Schin goto error; 12964887Schin case ':': 129710898Sroland.mainz@nrubsig.org if (env->regexp) 129810898Sroland.mainz@nrubsig.org goto normal; 12994887Schin if (inrange == 1) 13004887Schin { 13014887Schin setadd(e->re.charclass, last); 13024887Schin elements++; 13034887Schin } 13044887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 13054887Schin { 13064887Schin if (env->cursor == start && (c = *(env->cursor + 1))) 13074887Schin { 13084887Schin s = start = env->cursor + 1; 13094887Schin while (*++s && *s != ':'); 13104887Schin if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 13114887Schin { 13124887Schin if ((i = (s - start)) == 1) 13134887Schin { 13144887Schin switch (c) 13154887Schin { 13164887Schin case '<': 13174887Schin i = REX_WBEG; 13184887Schin break; 13194887Schin case '>': 13204887Schin i = REX_WEND; 13214887Schin break; 13224887Schin default: 13234887Schin i = 0; 13244887Schin break; 13254887Schin } 13264887Schin if (i) 13274887Schin { 13284887Schin env->cursor = s + 3; 13294887Schin drop(env->disc, e); 13304887Schin return node(env, i, 0, 0, 0); 13314887Schin } 13324887Schin } 13334887Schin } 13344887Schin } 13354887Schin env->error = REG_ECTYPE; 13364887Schin goto error; 13374887Schin } 13384887Schin for (c = 0; c <= UCHAR_MAX; c++) 13394887Schin if ((*f)(c)) 13404887Schin setadd(e->re.charclass, c); 13414887Schin inrange = 0; 13424887Schin complicated++; 13434887Schin elements++; 13444887Schin continue; 13454887Schin case '=': 134610898Sroland.mainz@nrubsig.org if (env->regexp) 134710898Sroland.mainz@nrubsig.org goto normal; 13484887Schin if (inrange == 2) 13494887Schin goto erange; 13504887Schin if (inrange == 1) 13514887Schin { 13524887Schin setadd(e->re.charclass, last); 13534887Schin elements++; 13544887Schin } 13554887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 13564887Schin goto ecollate; 13574887Schin if (c > 1) 13584887Schin collate++; 13594887Schin else 13604887Schin setadd(e->re.charclass, buf[0]); 13614887Schin c = buf[0]; 13624887Schin inrange = 0; 13634887Schin complicated++; 13644887Schin elements++; 13654887Schin continue; 13664887Schin case '.': 136710898Sroland.mainz@nrubsig.org if (env->regexp) 136810898Sroland.mainz@nrubsig.org goto normal; 13694887Schin if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 13704887Schin goto ecollate; 13714887Schin if (c > 1) 13724887Schin collate++; 13734887Schin c = buf[0]; 13744887Schin complicated++; 13754887Schin break; 13764887Schin default: 137710898Sroland.mainz@nrubsig.org normal: 13784887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 13794887Schin goto error; 13804887Schin break; 13814887Schin } 13824887Schin } 13834887Schin else if (w > 1) 13844887Schin complicated++; 13854887Schin if (inrange == 2) 13864887Schin { 1387*12068SRoger.Faulkner@Oracle.COM if (last <= c) 13884887Schin { 1389*12068SRoger.Faulkner@Oracle.COM for (i = last; i <= c; i++) 1390*12068SRoger.Faulkner@Oracle.COM setadd(e->re.charclass, i); 1391*12068SRoger.Faulkner@Oracle.COM inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1392*12068SRoger.Faulkner@Oracle.COM elements += 2; 1393*12068SRoger.Faulkner@Oracle.COM } 1394*12068SRoger.Faulkner@Oracle.COM else if (env->type >= SRE) 1395*12068SRoger.Faulkner@Oracle.COM { 13964887Schin setadd(e->re.charclass, last); 13974887Schin setadd(e->re.charclass, c); 1398*12068SRoger.Faulkner@Oracle.COM elements += 2; 1399*12068SRoger.Faulkner@Oracle.COM inrange = 1; 14004887Schin } 1401*12068SRoger.Faulkner@Oracle.COM else if (!(env->flags & REG_LENIENT)) 1402*12068SRoger.Faulkner@Oracle.COM goto erange; 14034887Schin else 1404*12068SRoger.Faulkner@Oracle.COM inrange = 0; 14054887Schin } 14064887Schin else if (inrange == 1) 14074887Schin { 14084887Schin setadd(e->re.charclass, last); 14094887Schin elements++; 14104887Schin } 14114887Schin else 14124887Schin inrange = 1; 14134887Schin last = c; 14144887Schin } 14154887Schin #if _PACKAGE_ast 14164887Schin if (complicated && mbcoll()) 14174887Schin { 14184887Schin Dt_t* dt; 14194887Schin Cchr_t* cc; 14204887Schin Cchr_t* tc; 14214887Schin Cchr_t* xc; 14224887Schin Celt_t* ce; 14234887Schin Cchr_t key; 14244887Schin int rw; 14254887Schin int rc; 14268462SApril.Chin@Sun.COM int wc; 14274887Schin unsigned char* rp; 14284887Schin unsigned char* pp; 14298462SApril.Chin@Sun.COM char* wp; 14308462SApril.Chin@Sun.COM char cb[2][COLL_KEY_MAX+1]; 14314887Schin 14324887Schin static Dtdisc_t disc; 14334887Schin 14344887Schin static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 14354887Schin 14364887Schin if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 14374887Schin { 14384887Schin disc.key = offsetof(Cchr_t, key); 14394887Schin if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) 14404887Schin { 14414887Schin for (i = 0; i < elementsof(primary) - 1; i++, cc++) 14424887Schin { 14434887Schin cc->nam[0] = primary[i]; 14444887Schin mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 14454887Schin dtinsert(dt, cc); 14464887Schin } 14474887Schin for (i = 0; i < elementsof(cc->key); i++) 14484887Schin cc->key[i] = ~0; 14494887Schin dtinsert(dt, cc); 14504887Schin LCINFO(AST_LC_COLLATE)->data = (void*)dt; 14514887Schin } 14524887Schin else 14534887Schin { 14544887Schin if (cc) 14554887Schin free(cc); 14564887Schin drop(env->disc, e); 14574887Schin return 0; 14584887Schin } 14594887Schin } 14604887Schin if (dt) 14614887Schin { 14624887Schin drop(env->disc, e); 14634887Schin if (ic = env->flags & REG_ICASE) 14644887Schin elements *= 2; 14658462SApril.Chin@Sun.COM if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t)))) 14664887Schin return 0; 14674887Schin ce = (Celt_t*)e->re.data; 14684887Schin e->re.collate.invert = neg; 14694887Schin e->re.collate.elements = ce; 14704887Schin env->cursor = first; 14714887Schin inrange = 0; 14724887Schin for (;;) 14734887Schin { 14744887Schin if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 14754887Schin goto error; 14764887Schin pp = env->cursor; 14774887Schin env->cursor += (w = MBSIZE(env->cursor)); 1478*12068SRoger.Faulkner@Oracle.COM if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) 14794887Schin { 14804887Schin if (*env->cursor) 14814887Schin { 14824887Schin if (*env->cursor == 'n') 14834887Schin { 14844887Schin pp = env->cursor++; 14854887Schin c = '\n'; 14864887Schin } 14874887Schin else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 14884887Schin { 14894887Schin env->token.len = 1; 14904887Schin w = magic(env, *env->cursor, 2); 14914887Schin if (env->token.len > 1 || w != T_BAD) 14924887Schin { 14934887Schin if (env->token.len == 1 && (f = classfun(w))) 14944887Schin { 14954887Schin if (inrange > 1) 14964887Schin { 14974887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 14984887Schin goto erange; 14994887Schin inrange = 0; 15004887Schin } 15014887Schin env->cursor++; 15024887Schin ce->fun = f; 15034887Schin ce->typ = COLL_call; 15044887Schin ce++; 15054887Schin continue; 15064887Schin } 15074887Schin if (env->token.len > 1 || w >= 0 && w < T_META) 15084887Schin { 15094887Schin c = w; 1510*12068SRoger.Faulkner@Oracle.COM w = mbconv(mbc, c); 15114887Schin pp = (unsigned char*)mbc; 15124887Schin env->cursor += env->token.len; 15134887Schin } 15144887Schin } 15154887Schin } 15164887Schin } 15174887Schin } 15184887Schin else if (c == ']') 15194887Schin { 15204887Schin if (env->cursor == begin) 15214887Schin { 15224887Schin rp = pp; 15234887Schin rw = w; 15244887Schin inrange = 1; 15254887Schin continue; 15264887Schin } 15274887Schin if (inrange != 0) 15284887Schin { 15294887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15304887Schin if (inrange == 2) 15314887Schin ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 15324887Schin } 15334887Schin break; 15344887Schin } 15354887Schin else if (c == '-') 15364887Schin { 15374887Schin if (!inrange && env->cursor != begin && *env->cursor != ']') 15384887Schin { 15394887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 15404887Schin goto erange; 15414887Schin continue; 15424887Schin } 15434887Schin else if (inrange == 1) 15444887Schin { 15454887Schin inrange = 2; 15464887Schin continue; 15474887Schin } 15484887Schin } 15494887Schin else if (c == '[') 15504887Schin { 15514887Schin switch (*env->cursor) 15524887Schin { 15534887Schin case 0: 15544887Schin goto error; 15554887Schin case ':': 155610898Sroland.mainz@nrubsig.org if (env->regexp) 155710898Sroland.mainz@nrubsig.org goto complicated_normal; 15584887Schin if (inrange == 1) 15594887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15604887Schin if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 15614887Schin { 15624887Schin if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 15634887Schin { 15644887Schin switch (c) 15654887Schin { 15664887Schin case '<': 15674887Schin i = REX_WBEG; 15684887Schin break; 15694887Schin case '>': 15704887Schin i = REX_WEND; 15714887Schin break; 15724887Schin default: 15734887Schin i = 0; 15744887Schin break; 15754887Schin } 15764887Schin if (i) 15774887Schin { 15784887Schin env->cursor += 5; 15794887Schin drop(env->disc, e); 15804887Schin return node(env, i, 0, 0, 0); 15814887Schin } 15824887Schin } 15834887Schin env->error = REG_ECTYPE; 15844887Schin goto error; 15854887Schin } 15864887Schin ce->fun = f; 15874887Schin ce->typ = COLL_call; 15884887Schin ce++; 15894887Schin inrange = 0; 15904887Schin continue; 15914887Schin case '=': 159210898Sroland.mainz@nrubsig.org if (env->regexp) 159310898Sroland.mainz@nrubsig.org goto complicated_normal; 15944887Schin if (inrange == 2) 15954887Schin goto erange; 15964887Schin if (inrange == 1) 15974887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 15988462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange]; 15994887Schin rp = env->cursor + 1; 16008462SApril.Chin@Sun.COM if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 16014887Schin goto ecollate; 16028462SApril.Chin@Sun.COM wp = (char*)pp; 16038462SApril.Chin@Sun.COM wc = mbchar(wp); 16044887Schin c = 0; 16054887Schin if (ic) 16068462SApril.Chin@Sun.COM { 16078462SApril.Chin@Sun.COM if (iswupper(wc)) 16088462SApril.Chin@Sun.COM { 16098462SApril.Chin@Sun.COM wc = towlower(wc); 1610*12068SRoger.Faulkner@Oracle.COM rw = mbconv((char*)pp, wc); 16118462SApril.Chin@Sun.COM c = 'u'; 16128462SApril.Chin@Sun.COM } 16138462SApril.Chin@Sun.COM else if (iswlower(wc)) 16148462SApril.Chin@Sun.COM c = 'l'; 16158462SApril.Chin@Sun.COM } 16164887Schin for (;;) 16174887Schin { 16188462SApril.Chin@Sun.COM mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); 16198462SApril.Chin@Sun.COM if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) 16208462SApril.Chin@Sun.COM goto ecollate; 16218462SApril.Chin@Sun.COM xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; 16228462SApril.Chin@Sun.COM if (c == 'l' || c == 'L' && !(c = 0)) 16234887Schin ce->typ = COLL_range_lc; 16248462SApril.Chin@Sun.COM else if (c == 'u' || c == 'U' && !(c = 0)) 16254887Schin ce->typ = COLL_range_uc; 16264887Schin else 16274887Schin ce->typ = COLL_range; 16284887Schin strcpy((char*)ce->beg, (char*)xc->key); 16294887Schin if (!(cc = (Cchr_t*)dtnext(dt, cc))) 16304887Schin goto ecollate; 16314887Schin if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 16324887Schin cc = tc; 16334887Schin strcpy((char*)ce->end, (char*)cc->key); 16344887Schin ce->max = -1; 16354887Schin ce++; 16364887Schin if (!c) 16374887Schin break; 16388462SApril.Chin@Sun.COM if (c == 'u') 16398462SApril.Chin@Sun.COM { 16408462SApril.Chin@Sun.COM wc = towlower(wc); 16418462SApril.Chin@Sun.COM c = 'L'; 16428462SApril.Chin@Sun.COM } 16438462SApril.Chin@Sun.COM else 16448462SApril.Chin@Sun.COM { 16458462SApril.Chin@Sun.COM wc = towupper(wc); 16468462SApril.Chin@Sun.COM c = 'U'; 16478462SApril.Chin@Sun.COM } 1648*12068SRoger.Faulkner@Oracle.COM rw = mbconv((char*)pp, wc); 16494887Schin } 16504887Schin inrange = 0; 16518462SApril.Chin@Sun.COM c = *pp; 16524887Schin continue; 16534887Schin case '.': 165410898Sroland.mainz@nrubsig.org if (env->regexp) 165510898Sroland.mainz@nrubsig.org goto complicated_normal; 16568462SApril.Chin@Sun.COM pp = (unsigned char*)cb[inrange]; 16578462SApril.Chin@Sun.COM if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 16584887Schin goto ecollate; 16594887Schin c = buf[0]; 16604887Schin break; 16614887Schin default: 166210898Sroland.mainz@nrubsig.org complicated_normal: 16634887Schin if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 16644887Schin goto error; 16654887Schin break; 16664887Schin } 16674887Schin } 16684887Schin if (inrange == 2) 16694887Schin { 16704887Schin ce = col(ce, ic, rp, rw, rc, pp, w, c); 16714887Schin if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 16724887Schin { 16734887Schin if (env->type < SRE && !(env->flags & REG_LENIENT)) 16744887Schin goto erange; 16754887Schin (ce-1)->typ = COLL_char; 16764887Schin strcpy((char*)ce->beg, (char*)(ce-1)->end); 16774887Schin ce->typ = COLL_char; 16784887Schin ce++; 16794887Schin } 16804887Schin inrange = env->type >= SRE || (env->flags & REG_LENIENT); 16814887Schin } 16824887Schin else if (inrange == 1) 16834887Schin ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 16844887Schin else 16854887Schin inrange = 1; 16864887Schin rp = pp; 16874887Schin rw = w; 16884887Schin rc = c; 16894887Schin } 16904887Schin ce->typ = COLL_end; 16914887Schin return e; 16924887Schin } 16934887Schin } 16944887Schin #endif 16954887Schin if (collate) 16964887Schin goto ecollate; 16974887Schin if (env->flags & REG_ICASE) 16984887Schin for (i = 0; i <= UCHAR_MAX; i++) 16994887Schin if (settst(e->re.charclass, i)) 17004887Schin { 17014887Schin if (isupper(i)) 17024887Schin c = tolower(i); 17034887Schin else if (islower(i)) 17044887Schin c = toupper(i); 17054887Schin else 17064887Schin continue; 17074887Schin setadd(e->re.charclass, c); 17084887Schin } 17094887Schin if (neg) 17104887Schin { 17114887Schin for (i = 0; i < elementsof(e->re.charclass->bits); i++) 17124887Schin e->re.charclass->bits[i] ^= ~0; 17134887Schin if (env->explicit >= 0) 17144887Schin setclr(e->re.charclass, env->explicit); 17154887Schin } 17164887Schin return e; 17174887Schin ecollate: 17184887Schin env->error = REG_ECOLLATE; 17194887Schin goto error; 17204887Schin erange: 17214887Schin env->error = REG_ERANGE; 17224887Schin error: 17234887Schin drop(env->disc, e); 17244887Schin if (!env->error) 17254887Schin env->error = REG_EBRACK; 17264887Schin return 0; 17274887Schin } 17284887Schin 17294887Schin static Rex_t* 17304887Schin ccl(Cenv_t* env, int type) 17314887Schin { 17324887Schin int i; 17334887Schin Rex_t* e; 17344887Schin Celt_t* ce; 17354887Schin regclass_t f; 17364887Schin 17374887Schin if (!(f = classfun(type))) 17384887Schin { 17394887Schin env->error = REG_BADESC; 17404887Schin return 0; 17414887Schin } 17424887Schin if (!mbcoll()) 17434887Schin { 17444887Schin if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 17454887Schin return 0; 17464887Schin for (i = 0; i <= UCHAR_MAX; i++) 17474887Schin if ((*f)(i)) 17484887Schin setadd(e->re.charclass, i); 17494887Schin if (env->explicit >= 0) 17504887Schin setclr(e->re.charclass, env->explicit); 17514887Schin } 17524887Schin else 17534887Schin { 17544887Schin if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 17554887Schin return 0; 17564887Schin ce = (Celt_t*)e->re.data; 17574887Schin e->re.collate.invert = 0; 17584887Schin e->re.collate.elements = ce; 17598462SApril.Chin@Sun.COM ce->fun = f; 17604887Schin ce->typ = COLL_call; 17614887Schin ce++; 17624887Schin ce->typ = COLL_end; 17634887Schin } 17644887Schin return e; 17654887Schin } 17664887Schin 17674887Schin static Rex_t* 17684887Schin rep(Cenv_t* env, Rex_t* e, int number, int last) 17694887Schin { 17704887Schin Rex_t* f; 17714887Schin unsigned long m = 0; 17724887Schin unsigned long n = RE_DUP_INF; 17734887Schin int minimal = -1; 17744887Schin 17754887Schin if (!e) 17764887Schin return 0; 17774887Schin switch (token(env)) 17784887Schin { 17794887Schin case T_BANG: 17804887Schin eat(env); 17814887Schin if (!(f = node(env, REX_NEG, m, n, 0))) 17824887Schin { 17834887Schin drop(env->disc, e); 17844887Schin return 0; 17854887Schin } 17864887Schin f->re.group.expr.rex = e; 17874887Schin return f; 17884887Schin case T_QUES: 17894887Schin eat(env); 17904887Schin n = 1; 17914887Schin break; 17924887Schin case T_STAR: 17934887Schin eat(env); 17944887Schin break; 17954887Schin case T_PLUS: 17964887Schin eat(env); 17974887Schin m = 1; 17984887Schin break; 17994887Schin case T_LEFT: 18004887Schin eat(env); 18014887Schin m = env->token.min; 18024887Schin n = env->token.max; 18034887Schin break; 18044887Schin default: 18054887Schin return e; 18064887Schin } 18074887Schin if (env->token.att) 18084887Schin minimal = 1; 18094887Schin else if (env->type < SRE) 18104887Schin switch (token(env)) 18114887Schin { 18124887Schin case T_QUES: 18134887Schin eat(env); 18144887Schin minimal = !(env->flags & REG_MINIMAL); 18154887Schin break; 18164887Schin case T_STAR: /*AST*/ 18174887Schin eat(env); 18184887Schin minimal = !!(env->flags & REG_MINIMAL); 18194887Schin break; 18204887Schin } 18214887Schin switch (e->type) 18224887Schin { 18234887Schin case REX_DOT: 18244887Schin case REX_CLASS: 18254887Schin case REX_COLL_CLASS: 18264887Schin case REX_ONECHAR: 18274887Schin e->lo = m; 18284887Schin e->hi = n; 18294887Schin if (minimal >= 0) 18304887Schin mark(e, minimal); 18314887Schin return e; 18324887Schin #if HUH_2002_08_07 18334887Schin case REX_BEG: 18344887Schin #endif 18354887Schin case REX_BEG_STR: 18364887Schin case REX_END_STR: 18374887Schin case REX_FIN_STR: 18384887Schin case REX_WBEG: 18394887Schin case REX_WEND: 18404887Schin case REX_WORD: 18414887Schin case REX_WORD_NOT: 18424887Schin env->error = REG_BADRPT; 18434887Schin drop(env->disc, e); 18444887Schin return 0; 18454887Schin } 18464887Schin if (m == 1 && n == 1) 18474887Schin { 18484887Schin if (minimal >= 0) 18494887Schin mark(e, minimal); 18504887Schin return e; 18514887Schin } 18524887Schin if (!(f = node(env, REX_REP, m, n, 0))) 18534887Schin { 18544887Schin drop(env->disc, e); 18554887Schin return 0; 18564887Schin } 18574887Schin f->re.group.expr.rex = e; 18584887Schin f->re.group.number = number; 18594887Schin f->re.group.last = last; 18604887Schin if (minimal >= 0) 18614887Schin mark(f, minimal); 18624887Schin if (m <= n && n) 18634887Schin { 18644887Schin for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 18654887Schin if (e && e->type == REX_NEG) 18664887Schin f->type = REX_GROUP; 18674887Schin } 18684887Schin return f; 18694887Schin } 18704887Schin 18714887Schin static int 18724887Schin isstring(Rex_t* e) 18734887Schin { 18744887Schin switch (e->type) 18754887Schin { 18764887Schin case REX_ONECHAR: 18774887Schin return e->lo == 1 && e->hi == 1; 18784887Schin case REX_STRING: 18794887Schin return 1; 18804887Schin } 18814887Schin return 0; 18824887Schin } 18834887Schin 18844887Schin static Trie_node_t* 18854887Schin trienode(Cenv_t* env, int c) 18864887Schin { 18874887Schin Trie_node_t* t; 18884887Schin 18894887Schin if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 18904887Schin { 18914887Schin memset(t, 0, sizeof(Trie_node_t)); 18924887Schin t->c = c; 18934887Schin } 18944887Schin return t; 18954887Schin } 18964887Schin 18974887Schin static int 18984887Schin insert(Cenv_t* env, Rex_t* f, Rex_t* g) 18994887Schin { 19004887Schin unsigned char* s; 19014887Schin unsigned char* e; 19024887Schin Trie_node_t* t; 19034887Schin int len; 19044887Schin unsigned char tmp[2]; 19054887Schin 19064887Schin switch (f->type) 19074887Schin { 19084887Schin case REX_ONECHAR: 19094887Schin *(s = tmp) = f->re.onechar; 19104887Schin e = s + 1; 19114887Schin break; 19124887Schin case REX_STRING: 19134887Schin s = f->re.string.base; 19144887Schin e = s + f->re.string.size; 19154887Schin break; 19164887Schin default: 19174887Schin return 1; 19184887Schin } 19194887Schin if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 19204887Schin return 1; 19214887Schin for (len = 1;;) 19224887Schin { 19234887Schin if (t->c == *s) 19244887Schin { 19254887Schin if (++s >= e) 19264887Schin break; 19274887Schin if (!t->son && !(t->son = trienode(env, *s))) 19284887Schin return 1; 19294887Schin t = t->son; 19304887Schin len++; 19314887Schin } 19324887Schin else 19334887Schin { 19344887Schin if (!t->sib && !(t->sib = trienode(env, *s))) 19354887Schin return 1; 19364887Schin t = t->sib; 19374887Schin } 19384887Schin } 19394887Schin if (g->re.trie.min > len) 19404887Schin g->re.trie.min = len; 19414887Schin if (g->re.trie.max < len) 19424887Schin g->re.trie.max = len; 19434887Schin t->end = 1; 19444887Schin return 0; 19454887Schin } 19464887Schin 19474887Schin /* 19484887Schin * trie() tries to combine nontrivial e and f into a REX_TRIE 19494887Schin * unless 0 is returned, e and f are deleted as far as possible 19504887Schin * also called by regcomb 19514887Schin */ 19524887Schin 19534887Schin static Rex_t* 19544887Schin trie(Cenv_t* env, Rex_t* e, Rex_t* f) 19554887Schin { 19564887Schin Rex_t* g; 19574887Schin 19584887Schin if (e->next || f->next || !isstring(e) || e->flags != f->flags) 19594887Schin return 0; 19604887Schin if (isstring(f)) 19614887Schin { 19624887Schin if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 19634887Schin return 0; 19644887Schin g->re.trie.min = INT_MAX; 19654887Schin if (insert(env, f, g)) 19664887Schin goto nospace; 19674887Schin drop(env->disc, f); 19684887Schin } 19694887Schin else if (f->type != REX_TRIE) 19704887Schin return 0; 19714887Schin else 19724887Schin g = f; 19734887Schin if (insert(env, e, g)) 19744887Schin goto nospace; 19754887Schin drop(env->disc, e); 19764887Schin return g; 19774887Schin nospace: 19784887Schin if (g != f) 19794887Schin drop(env->disc, g); 19804887Schin return 0; 19814887Schin } 19824887Schin 19834887Schin static Rex_t* alt(Cenv_t*, int, int); 19844887Schin 19854887Schin static int 19864887Schin chr(register Cenv_t* env, int* escaped) 19874887Schin { 19884887Schin unsigned char* p; 19894887Schin int c; 19904887Schin 19914887Schin *escaped = 0; 19924887Schin if (!(c = *env->cursor)) 19934887Schin return -1; 19944887Schin env->cursor++; 19954887Schin if (c == '\\') 19964887Schin { 19974887Schin if (env->flags & REG_SHELL_ESCAPED) 19984887Schin return c; 19994887Schin if (!(c = *(env->cursor + 1)) || c == env->terminator) 20004887Schin { 20014887Schin if (env->flags & REG_LENIENT) 20024887Schin return c ? c : '\\'; 20034887Schin env->error = REG_EESCAPE; 20044887Schin return -1; 20054887Schin } 20064887Schin p = env->cursor; 20074887Schin c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 20084887Schin *escaped = env->cursor - p; 20094887Schin } 20104887Schin return c; 20114887Schin } 20124887Schin 20134887Schin /* 20144887Schin * open the perly gates 20154887Schin */ 20164887Schin 20174887Schin static Rex_t* 20184887Schin grp(Cenv_t* env, int parno) 20194887Schin { 20204887Schin Rex_t* e; 20214887Schin Rex_t* f; 20224887Schin int c; 20234887Schin int i; 20244887Schin int n; 20254887Schin int x; 20264887Schin int esc; 20274887Schin int typ; 20284887Schin int beg; 20294887Schin unsigned char* p; 20304887Schin 20314887Schin beg = env->pattern == env->cursor - env->token.len; 20324887Schin if (!(c = env->token.lex) && (c = *env->cursor)) 20334887Schin env->cursor++; 20344887Schin env->token.len = 0; 20354887Schin env->parnest++; 20364887Schin typ = -1; 20374887Schin switch (c) 20384887Schin { 20394887Schin case '-': 20404887Schin case '+': 20414887Schin case 'a': 20424887Schin case 'g': 20434887Schin case 'i': 20444887Schin case 'l': 20454887Schin case 'm': 20464887Schin case 'p': 20474887Schin case 'r': 20484887Schin case 's': 20494887Schin case 'x': 20504887Schin case 'A': 20514887Schin case 'B': 20524887Schin case 'E': 20534887Schin case 'F': 20544887Schin case 'G': 20554887Schin case 'I': 20564887Schin case 'K': 20578462SApril.Chin@Sun.COM case 'L': 20588462SApril.Chin@Sun.COM case 'M': /* glob(3) */ 20598462SApril.Chin@Sun.COM case 'N': /* glob(3) */ 20608462SApril.Chin@Sun.COM case 'O': /* glob(3) */ 2061*12068SRoger.Faulkner@Oracle.COM case 'P': 20628462SApril.Chin@Sun.COM case 'R': /* pcre */ 20634887Schin case 'S': 20644887Schin case 'U': /* pcre */ 20654887Schin case 'X': /* pcre */ 20664887Schin x = REX_GROUP; 20674887Schin i = 1; 20684887Schin env->token.push = 1; 20694887Schin for (;;) 20704887Schin { 20714887Schin switch (c) 20724887Schin { 20738462SApril.Chin@Sun.COM case ')': 20748462SApril.Chin@Sun.COM if (!(env->flags & REG_LITERAL)) 20758462SApril.Chin@Sun.COM { 20768462SApril.Chin@Sun.COM env->error = REG_BADRPT; 20778462SApril.Chin@Sun.COM return 0; 20788462SApril.Chin@Sun.COM } 20798462SApril.Chin@Sun.COM /*FALLTHROUGH*/ 20804887Schin case 0: 20814887Schin case T_CLOSE: 20824887Schin x = 0; 20834887Schin goto done; 20844887Schin case ':': 20854887Schin eat(env); 20864887Schin if (token(env) == T_CLOSE) 20874887Schin x = 0; 20884887Schin goto done; 20894887Schin case '-': 20904887Schin i = 0; 20914887Schin break; 20924887Schin case '+': 20934887Schin i = 1; 20944887Schin break; 20954887Schin case 'a': 20964887Schin if (i) 20974887Schin env->flags |= (REG_LEFT|REG_RIGHT); 20984887Schin else 20994887Schin env->flags &= ~(REG_LEFT|REG_RIGHT); 21004887Schin break; 21014887Schin case 'g': 21024887Schin if (i) 21034887Schin env->flags &= ~REG_MINIMAL; 21044887Schin else 21054887Schin env->flags |= REG_MINIMAL; 21064887Schin break; 21074887Schin case 'i': 21084887Schin if (i) 21094887Schin env->flags |= REG_ICASE; 21104887Schin else 21114887Schin env->flags &= ~REG_ICASE; 21124887Schin break; 21134887Schin case 'l': 21144887Schin if (i) 21154887Schin env->flags |= REG_LEFT; 21164887Schin else 21174887Schin env->flags &= ~REG_LEFT; 21184887Schin break; 21194887Schin case 'm': 21204887Schin if (i) 21214887Schin env->flags |= REG_NEWLINE; 21224887Schin else 21234887Schin env->flags &= ~REG_NEWLINE; 21244887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 21254887Schin break; 21264887Schin case 'p': 21274887Schin if (i) 21284887Schin env->flags &= ~REG_LENIENT; 21294887Schin else 21304887Schin env->flags |= REG_LENIENT; 21314887Schin break; 21324887Schin case 'r': 21334887Schin if (i) 21344887Schin env->flags |= REG_RIGHT; 21354887Schin else 21364887Schin env->flags &= ~REG_RIGHT; 21374887Schin break; 21384887Schin case 's': 21394887Schin if (i) 21404887Schin env->flags |= REG_SPAN; 21414887Schin else 21424887Schin env->flags &= ~REG_SPAN; 21434887Schin env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 21444887Schin break; 21454887Schin case 'x': 21464887Schin if (i) 21474887Schin env->flags |= REG_COMMENT; 21484887Schin else 21494887Schin env->flags &= ~REG_COMMENT; 21504887Schin break; 21514887Schin case 'A': 2152*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 21534887Schin env->flags |= REG_AUGMENTED|REG_EXTENDED; 21544887Schin typ = ARE; 21554887Schin break; 21564887Schin case 'B': 21574887Schin case 'G': 2158*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 21594887Schin typ = BRE; 21604887Schin break; 21614887Schin case 'E': 2162*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 21634887Schin env->flags |= REG_EXTENDED; 21644887Schin typ = ERE; 21654887Schin break; 21664887Schin case 'F': 21674887Schin case 'L': 2168*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 21694887Schin env->flags |= REG_LITERAL; 21704887Schin typ = ERE; 21714887Schin break; 21724887Schin case 'K': 2173*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 21744887Schin env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 21754887Schin typ = KRE; 21764887Schin break; 21774887Schin case 'M': 21784887Schin /* used by caller to disable glob(3) GLOB_BRACE */ 21794887Schin break; 21804887Schin case 'N': 21814887Schin /* used by caller to disable glob(3) GLOB_NOCHECK */ 21824887Schin break; 21838462SApril.Chin@Sun.COM case 'O': 21844887Schin /* used by caller to disable glob(3) GLOB_STARSTAR */ 21854887Schin break; 2186*12068SRoger.Faulkner@Oracle.COM case 'P': 2187*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2188*12068SRoger.Faulkner@Oracle.COM env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE; 2189*12068SRoger.Faulkner@Oracle.COM typ = ERE; 2190*12068SRoger.Faulkner@Oracle.COM break; 21914887Schin case 'S': 2192*12068SRoger.Faulkner@Oracle.COM env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 21934887Schin env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 21944887Schin typ = SRE; 21954887Schin break; 21964887Schin case 'U': /* PCRE_UNGREEDY */ 21974887Schin if (i) 21984887Schin env->flags |= REG_MINIMAL; 21994887Schin else 22004887Schin env->flags &= ~REG_MINIMAL; 22014887Schin break; 22024887Schin case 'X': /* PCRE_EXTRA */ 22034887Schin break; 22044887Schin default: 22054887Schin env->error = REG_BADRPT; 22064887Schin return 0; 22074887Schin } 22084887Schin eat(env); 22094887Schin c = token(env); 22104887Schin } 22114887Schin done: 22124887Schin break; 22134887Schin case ':': 22144887Schin x = REX_GROUP; 22154887Schin break; 22164887Schin case '=': 22174887Schin x = REX_GROUP_AHEAD; 22184887Schin break; 22194887Schin case '!': 22204887Schin x = REX_GROUP_AHEAD_NOT; 22214887Schin break; 22224887Schin case '<': 22234887Schin switch (token(env)) 22244887Schin { 22254887Schin case '=': 22264887Schin x = REX_GROUP_BEHIND; 22274887Schin break; 22284887Schin case '!': 22294887Schin case T_BANG: 22304887Schin x = REX_GROUP_BEHIND_NOT; 22314887Schin break; 22324887Schin default: 22334887Schin env->error = REG_BADRPT; 22344887Schin return 0; 22354887Schin } 22364887Schin eat(env); 22374887Schin break; 22384887Schin case '>': 22394887Schin x = REX_GROUP_CUT; 22404887Schin break; 22414887Schin case '%': 22424887Schin case T_PERCENT: 22434887Schin e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 22444887Schin e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 22454887Schin n = 1; 22464887Schin for (;;) 22474887Schin { 22484887Schin switch (i = chr(env, &esc)) 22494887Schin { 22504887Schin case -1: 22514887Schin case 0: 22524887Schin invalid: 22534887Schin env->cursor -= esc + 1; 22544887Schin env->error = REG_EPAREN; 22554887Schin return 0; 22564887Schin case 'D': 22574887Schin x = REX_NEST_delimiter; 22584887Schin /*FALLTHROUGH*/ 22594887Schin delimiter: 22604887Schin if ((i = chr(env, &esc)) < 0) 22614887Schin goto invalid; 22624887Schin if (e->re.nest.type[i] & ~x) 22634887Schin goto invalid; 22644887Schin e->re.nest.type[i] = x; 22654887Schin continue; 22664887Schin case 'E': 22674887Schin x = REX_NEST_escape; 22684887Schin goto delimiter; 22694887Schin case 'L': 22704887Schin x = REX_NEST_literal; 22714887Schin goto quote; 22724887Schin case 'O': 22734887Schin switch (i = chr(env, &esc)) 22744887Schin { 22754887Schin case 'T': 22764887Schin e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 22774887Schin break; 22784887Schin default: 22794887Schin goto invalid; 22804887Schin } 22814887Schin continue; 22824887Schin case 'Q': 22834887Schin x = REX_NEST_quote; 22844887Schin /*FALLTHROUGH*/ 22854887Schin quote: 22864887Schin if ((i = chr(env, &esc)) < 0) 22874887Schin goto invalid; 22884887Schin if (e->re.nest.type[i] & ~x) 22894887Schin goto invalid; 22904887Schin e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 22914887Schin continue; 22924887Schin case 'S': 22934887Schin x = REX_NEST_separator; 22944887Schin goto delimiter; 22954887Schin case 'T': 22964887Schin x = REX_NEST_terminator; 22974887Schin goto delimiter; 22984887Schin case '|': 22994887Schin case '&': 23004887Schin if (!esc) 23014887Schin goto invalid; 23024887Schin goto nesting; 23034887Schin case '(': 23044887Schin if (!esc) 23054887Schin n++; 23064887Schin goto nesting; 23074887Schin case ')': 23084887Schin if (!esc && !--n) 23094887Schin break; 23104887Schin goto nesting; 23114887Schin default: 23124887Schin nesting: 23134887Schin if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 23144887Schin goto invalid; 23154887Schin e->re.nest.type[i] = REX_NEST_open; 23164887Schin if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 23174887Schin goto invalid; 23184887Schin if (!esc) 23194887Schin { 23204887Schin if (x == ')' && !--n) 23214887Schin goto invalid; 23224887Schin else if (x == '(') 23234887Schin n++; 23244887Schin } 23254887Schin e->re.nest.type[x] |= REX_NEST_close; 23264887Schin e->re.nest.type[i] |= x << REX_NEST_SHIFT; 23274887Schin continue; 23284887Schin } 23294887Schin break; 23304887Schin } 23314887Schin env->parnest--; 23324887Schin if (c == T_PERCENT) 23334887Schin for (n = 0; n < 2; n++) 23344887Schin { 23354887Schin parno = ++env->parno; 23364887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 23374887Schin { 23384887Schin drop(env->disc, e); 23394887Schin return 0; 23404887Schin } 23414887Schin if (parno < elementsof(env->paren)) 23424887Schin env->paren[parno] = f; 23434887Schin f->re.group.back = 0; 23444887Schin f->re.group.number = parno; 23454887Schin f->re.group.expr.rex = e; 23464887Schin e = f; 23474887Schin } 23484887Schin return e; 23494887Schin case '(': 23504887Schin c = 0; 23514887Schin if (isdigit(*env->cursor)) 23524887Schin { 23534887Schin f = 0; 23544887Schin do 23554887Schin { 23564887Schin if (c > (INT_MAX / 10)) 23574887Schin { 23584887Schin env->error = REG_BADRPT; 23594887Schin return 0; 23604887Schin } 23614887Schin c = c * 10 + (*env->cursor++ - '0'); 23624887Schin } while (isdigit(*env->cursor)); 23634887Schin if (*env->cursor++ != ')') 23644887Schin { 23654887Schin env->error = REG_BADRPT; 23664887Schin return 0; 23674887Schin } 23684887Schin if (c && env->type >= SRE) 23694887Schin c = c * 2 - 1; 23704887Schin if (!c || c > env->parno || !env->paren[c]) 23714887Schin { 23724887Schin if (!(env->flags & REG_LENIENT)) 23734887Schin { 23744887Schin env->error = REG_ESUBREG; 23754887Schin return 0; 23764887Schin } 23774887Schin if (c) 23784887Schin c = -1; 23794887Schin } 23804887Schin } 23814887Schin else 23824887Schin { 23834887Schin if (env->type < SRE && *env->cursor++ != '?') 23844887Schin { 23854887Schin env->error = REG_BADRPT; 23864887Schin return 0; 23874887Schin } 23884887Schin if (!(f = grp(env, parno + 1)) && env->error) 23894887Schin return 0; 23904887Schin } 23914887Schin if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 23924887Schin { 23934887Schin drop(env->disc, f); 23944887Schin return 0; 23954887Schin } 23964887Schin e->re.group.size = c; 23974887Schin e->re.group.expr.binary.left = f; 23984887Schin if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 23994887Schin { 24004887Schin drop(env->disc, e); 24014887Schin return 0; 24024887Schin } 24034887Schin if (token(env) != T_CLOSE) 24044887Schin { 24054887Schin env->error = REG_EPAREN; 24064887Schin return 0; 24074887Schin } 24084887Schin eat(env); 24094887Schin env->parnest--; 24104887Schin return rep(env, e, parno, parno); 24114887Schin case '{': 24124887Schin p = env->cursor; 24134887Schin n = 1; 24144887Schin while (c = *env->cursor) 24154887Schin { 24164887Schin if (c == '\\' && *(env->cursor + 1)) 24174887Schin env->cursor++; 24184887Schin else if (c == '{') 24194887Schin n++; 24204887Schin else if (c == '}' && !--n) 24214887Schin break; 24224887Schin else if (c == env->delimiter || c == env->terminator) 24234887Schin break; 24244887Schin env->cursor++; 24254887Schin } 24264887Schin if (c != '}') 24274887Schin { 24284887Schin env->error = REG_EBRACE; 24294887Schin return 0; 24304887Schin } 24314887Schin if (*++env->cursor != ')') 24324887Schin { 24334887Schin env->error = REG_EPAREN; 24344887Schin return 0; 24354887Schin } 24364887Schin env->cursor++; 24374887Schin env->parnest--; 24384887Schin if (env->disc->re_version < REG_VERSION_EXEC) 24394887Schin { 24404887Schin env->error = REG_BADRPT; 24414887Schin return 0; 24424887Schin } 24434887Schin if (!env->disc->re_execf) 24444887Schin return 0; 24454887Schin if (!(e = node(env, REX_EXEC, 0, 0, 0))) 24464887Schin return 0; 24474887Schin e->re.exec.text = (const char*)p; 24484887Schin e->re.exec.size = env->cursor - p - 2; 24494887Schin if (!env->disc->re_compf) 24504887Schin e->re.exec.data = 0; 24514887Schin else 24524887Schin e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 24534887Schin return e; 24544887Schin case '0': case '1': case '2': case '3': case '4': 24554887Schin case '5': case '6': case '7': case '8': case '9': 24564887Schin c -= '0'; 24574887Schin while (isdigit(*env->cursor)) 24584887Schin { 24594887Schin if (c > (INT_MAX / 10)) 24604887Schin { 24614887Schin env->error = REG_ESUBREG; 24624887Schin return 0; 24634887Schin } 24644887Schin c = c * 10 + *env->cursor++ - '0'; 24654887Schin } 24664887Schin if (*env->cursor == ')') 24674887Schin { 24684887Schin env->cursor++; 24694887Schin env->parnest--; 24704887Schin env->token.len = 1; 24714887Schin if (c > env->parno || !env->paren[c]) 24724887Schin { 24734887Schin env->error = REG_ESUBREG; 24744887Schin return 0; 24754887Schin } 24764887Schin env->paren[c]->re.group.back = 1; 24774887Schin return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 24784887Schin } 24794887Schin /*FALLTHROUGH*/ 24804887Schin default: 24814887Schin env->error = REG_BADRPT; 24824887Schin return 0; 24834887Schin } 24844887Schin if (x && !(e = alt(env, parno, 0))) 24854887Schin return 0; 24864887Schin c = token(env); 24874887Schin env->parnest--; 24888462SApril.Chin@Sun.COM if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) 24894887Schin { 24904887Schin env->error = REG_EPAREN; 24914887Schin return 0; 24924887Schin } 24934887Schin eat(env); 24944887Schin if (typ >= 0) 24954887Schin { 24964887Schin if (beg) 24974887Schin env->pattern = env->cursor; 24984887Schin env->type = typ; 24994887Schin } 25004887Schin if (!x) 25014887Schin return 0; 25024887Schin if (!(f = node(env, x, 0, 0, 0))) 25034887Schin { 25044887Schin drop(env->disc, e); 25054887Schin return 0; 25064887Schin } 25074887Schin f->re.group.expr.rex = e; 25084887Schin if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 25094887Schin { 25104887Schin if (stats(env, e)) 25114887Schin { 25124887Schin drop(env->disc, f); 25134887Schin if (!env->error) 25144887Schin env->error = REG_ECOUNT; 25154887Schin return 0; 25164887Schin } 25174887Schin f->re.group.size = env->stats.m; 25184887Schin memset(&env->stats, 0, sizeof(env->stats)); 25194887Schin } 25204887Schin switch (x) 25214887Schin { 25224887Schin case REX_GROUP: 25234887Schin case REX_GROUP_CUT: 25244887Schin f = rep(env, f, parno, env->parno); 25254887Schin break; 25264887Schin } 25274887Schin return f; 25284887Schin } 25294887Schin 25304887Schin static Rex_t* 25314887Schin seq(Cenv_t* env) 25324887Schin { 25334887Schin Rex_t* e; 25344887Schin Rex_t* f; 25354887Schin Token_t tok; 25364887Schin int c; 25374887Schin int i; 25384887Schin int n; 25394887Schin int x; 25404887Schin int parno; 25414887Schin int type; 25424887Schin regflags_t flags; 25434887Schin unsigned char* s; 25444887Schin unsigned char* p; 25454887Schin unsigned char* t; 25464887Schin unsigned char* u; 25474887Schin unsigned char buf[256]; 25484887Schin 25494887Schin for (;;) 25504887Schin { 25514887Schin s = buf; 25524887Schin while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 25534887Schin { 25544887Schin x = c; 25554887Schin p = env->cursor; 25564887Schin if (c >= 0) 25574887Schin { 25584887Schin n = 1; 25594887Schin *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 25604887Schin } 25618462SApril.Chin@Sun.COM else if (c == C_ESC || (env->flags & REG_ICASE)) 25624887Schin { 25638462SApril.Chin@Sun.COM c = (c == C_ESC) ? env->token.lex : mbchar(p); 25648462SApril.Chin@Sun.COM if (env->flags & REG_ICASE) 25658462SApril.Chin@Sun.COM c = towupper(c); 25668462SApril.Chin@Sun.COM if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) 25678462SApril.Chin@Sun.COM break; 2568*12068SRoger.Faulkner@Oracle.COM if ((n = mbconv((char*)s, c)) < 0) 25698462SApril.Chin@Sun.COM *s++ = c; 25708462SApril.Chin@Sun.COM else if (n) 25718462SApril.Chin@Sun.COM s += n; 25728462SApril.Chin@Sun.COM else 25734887Schin { 25744887Schin n = 1; 25754887Schin *s++ = 0; 25764887Schin } 25774887Schin } 25784887Schin else 25794887Schin { 25804887Schin n = env->token.len - env->token.esc; 25814887Schin for (t = p, u = s + n; s < u; *s++ = *t++); 25824887Schin } 25834887Schin eat(env); 25844887Schin } 25854887Schin if (c == T_BAD) 25864887Schin return 0; 25874887Schin if (s > buf) 25884887Schin switch (c) 25894887Schin { 25904887Schin case T_STAR: 25914887Schin case T_PLUS: 25924887Schin case T_LEFT: 25934887Schin case T_QUES: 25944887Schin case T_BANG: 25954887Schin if ((s -= n) == buf) 25964887Schin e = 0; 25974887Schin else 25984887Schin { 25994887Schin i = s - buf; 26004887Schin if (!(e = node(env, REX_STRING, 0, 0, i))) 26014887Schin return 0; 26024887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 26034887Schin e->re.string.size = i; 26044887Schin } 26054887Schin if (x >= 0) 26064887Schin { 26074887Schin if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 26084887Schin { 26094887Schin drop(env->disc, e); 26104887Schin return 0; 26114887Schin } 26124887Schin f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 26134887Schin } 26144887Schin else 26154887Schin { 26168462SApril.Chin@Sun.COM if (!(f = node(env, REX_STRING, 0, 0, n))) 26174887Schin return 0; 26188462SApril.Chin@Sun.COM memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); 26198462SApril.Chin@Sun.COM f->re.string.size = n; 26204887Schin } 26214887Schin if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 26224887Schin { 26234887Schin drop(env->disc, e); 26244887Schin return 0; 26254887Schin } 26264887Schin if (e) 26274887Schin f = cat(env, e, f); 26284887Schin return f; 26294887Schin default: 26304887Schin c = s - buf; 26314887Schin if (!(e = node(env, REX_STRING, 0, 0, c))) 26324887Schin return 0; 26334887Schin memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 26344887Schin e->re.string.size = c; 26354887Schin return cat(env, e, seq(env)); 26364887Schin } 26374887Schin else if (c > T_BACK) 26384887Schin { 26394887Schin eat(env); 26404887Schin c -= T_BACK; 26414887Schin if (c > env->parno || !env->paren[c]) 26424887Schin { 26434887Schin env->error = REG_ESUBREG; 26444887Schin return 0; 26454887Schin } 26464887Schin env->paren[c]->re.group.back = 1; 26474887Schin e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 26484887Schin } 26494887Schin else 26504887Schin switch (c) 26514887Schin { 26524887Schin case T_AND: 26534887Schin case T_CLOSE: 26544887Schin case T_BAR: 26554887Schin case T_END: 26564887Schin return node(env, REX_NULL, 0, 0, 0); 26574887Schin case T_DOLL: 26584887Schin eat(env); 26594887Schin e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 26604887Schin break; 26614887Schin case T_CFLX: 26624887Schin eat(env); 26634887Schin if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 26644887Schin e = rep(env, e, 0, 0); 26654887Schin break; 26664887Schin case T_OPEN: 26674887Schin tok = env->token; 26684887Schin eat(env); 26694887Schin flags = env->flags; 26704887Schin type = env->type; 26714887Schin if (env->token.att) 26724887Schin env->flags |= REG_MINIMAL; 26734887Schin env->parnest++; 26744887Schin if (env->type == KRE) 26754887Schin ++env->parno; 26764887Schin parno = ++env->parno; 26774887Schin if (!(e = alt(env, parno + 1, 0))) 26784887Schin break; 26794887Schin if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) 26804887Schin { 26814887Schin drop(env->disc, e); 26824887Schin env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 26834887Schin return 0; 26844887Schin } 26854887Schin if (token(env) != T_CLOSE) 26864887Schin { 26874887Schin drop(env->disc, e); 26884887Schin env->error = REG_EPAREN; 26894887Schin return 0; 26904887Schin } 26914887Schin env->parnest--; 26924887Schin eat(env); 26934887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 26944887Schin { 26954887Schin drop(env->disc, e); 26964887Schin return 0; 26974887Schin } 26984887Schin if (parno < elementsof(env->paren)) 26994887Schin env->paren[parno] = f; 27004887Schin f->re.group.back = 0; 27014887Schin f->re.group.number = parno; 27024887Schin f->re.group.expr.rex = e; 27034887Schin if (tok.lex) 27044887Schin { 27054887Schin tok.push = 1; 27064887Schin env->token = tok; 27074887Schin } 27084887Schin if (!(e = rep(env, f, parno, env->parno))) 27094887Schin return 0; 27104887Schin if (env->type == KRE) 27114887Schin { 27124887Schin if (!(f = node(env, REX_GROUP, 0, 0, 0))) 27134887Schin { 27144887Schin drop(env->disc, e); 27154887Schin return 0; 27164887Schin } 27174887Schin if (--parno < elementsof(env->paren)) 27184887Schin env->paren[parno] = f; 27194887Schin f->re.group.back = 0; 27204887Schin f->re.group.number = parno; 27214887Schin f->re.group.expr.rex = e; 27224887Schin e = f; 27234887Schin } 27244887Schin env->flags = flags; 27254887Schin env->type = type; 27264887Schin break; 27274887Schin case T_GROUP: 27284887Schin p = env->cursor; 27294887Schin eat(env); 27304887Schin flags = env->flags; 27314887Schin type = env->type; 27324887Schin if (!(e = grp(env, env->parno + 1))) 27334887Schin { 27344887Schin if (env->error) 27354887Schin return 0; 27364887Schin if (env->literal == env->pattern && env->literal == p) 27374887Schin env->literal = env->cursor; 27384887Schin continue; 27394887Schin } 27404887Schin env->flags = flags; 27414887Schin env->type = type; 27424887Schin break; 27434887Schin case T_BRA: 27444887Schin eat(env); 27454887Schin if (e = bra(env)) 27464887Schin e = rep(env, e, 0, 0); 27474887Schin break; 27484887Schin case T_ALNUM: 27494887Schin case T_ALNUM_NOT: 27504887Schin case T_DIGIT: 27514887Schin case T_DIGIT_NOT: 27524887Schin case T_SPACE: 27534887Schin case T_SPACE_NOT: 27544887Schin eat(env); 27554887Schin if (e = ccl(env, c)) 27564887Schin e = rep(env, e, 0, 0); 27574887Schin break; 27584887Schin case T_LT: 27594887Schin eat(env); 27604887Schin e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 27614887Schin break; 27624887Schin case T_GT: 27634887Schin eat(env); 27644887Schin e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 27654887Schin break; 27664887Schin case T_DOT: 27674887Schin eat(env); 27684887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 27694887Schin break; 27704887Schin case T_DOTSTAR: 27714887Schin eat(env); 27724887Schin env->token.lex = T_STAR; 27734887Schin env->token.push = 1; 27744887Schin e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 27754887Schin break; 27764887Schin case T_SLASHPLUS: 27774887Schin eat(env); 27784887Schin env->token.lex = T_PLUS; 27794887Schin env->token.push = 1; 27804887Schin if (e = node(env, REX_ONECHAR, 1, 1, 0)) 27814887Schin { 27824887Schin e->re.onechar = '/'; 27834887Schin e = rep(env, e, 0, 0); 27844887Schin } 27854887Schin break; 27864887Schin case T_WORD: 27874887Schin eat(env); 27884887Schin e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 27894887Schin break; 27904887Schin case T_WORD_NOT: 27914887Schin eat(env); 27924887Schin e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 27934887Schin break; 27944887Schin case T_BEG_STR: 27954887Schin eat(env); 27964887Schin e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 27974887Schin break; 27984887Schin case T_END_STR: 27994887Schin eat(env); 28004887Schin e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 28014887Schin break; 28024887Schin case T_FIN_STR: 28034887Schin eat(env); 28044887Schin e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 28054887Schin break; 28064887Schin default: 28074887Schin env->error = REG_BADRPT; 28084887Schin return 0; 28094887Schin } 28104887Schin if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 28114887Schin e = cat(env, e, seq(env)); 28124887Schin return e; 28134887Schin } 28144887Schin } 28154887Schin 28164887Schin static Rex_t* 28174887Schin con(Cenv_t* env) 28184887Schin { 28194887Schin Rex_t* e; 28204887Schin Rex_t* f; 28214887Schin Rex_t* g; 28224887Schin 28234887Schin if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 28244887Schin return e; 28254887Schin eat(env); 28264887Schin if (!(f = con(env))) 28274887Schin { 28284887Schin drop(env->disc, e); 28294887Schin return 0; 28304887Schin } 28314887Schin if (!(g = node(env, REX_CONJ, 0, 0, 0))) 28324887Schin { 28334887Schin drop(env->disc, e); 28344887Schin drop(env->disc, f); 28354887Schin return 0; 28364887Schin } 28374887Schin g->re.group.expr.binary.left = e; 28384887Schin g->re.group.expr.binary.right = f; 28394887Schin return g; 28404887Schin } 28414887Schin 28424887Schin static Rex_t* 28434887Schin alt(Cenv_t* env, int number, int cond) 28444887Schin { 28454887Schin Rex_t* e; 28464887Schin Rex_t* f; 28474887Schin Rex_t* g; 28484887Schin 28494887Schin if (!(e = con(env))) 28504887Schin return 0; 28514887Schin else if (token(env) != T_BAR) 28524887Schin { 28534887Schin if (!cond) 28544887Schin return e; 28554887Schin f = 0; 28564887Schin if (e->type == REX_NULL) 28574887Schin goto bad; 28584887Schin } 28594887Schin else 28604887Schin { 28614887Schin eat(env); 28624887Schin if (!(f = alt(env, number, 0))) 28634887Schin { 28644887Schin drop(env->disc, e); 28654887Schin return 0; 28664887Schin } 28674887Schin if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) 28684887Schin goto bad; 28694887Schin if (!cond && (g = trie(env, e, f))) 28704887Schin return g; 28714887Schin } 28724887Schin if (!(g = node(env, REX_ALT, 0, 0, 0))) 28734887Schin { 28744887Schin env->error = REG_ESPACE; 28754887Schin goto bad; 28764887Schin } 28774887Schin g->re.group.number = number; 28784887Schin g->re.group.last = env->parno; 28794887Schin g->re.group.expr.binary.left = e; 28804887Schin g->re.group.expr.binary.right = f; 28814887Schin return g; 28824887Schin bad: 28834887Schin drop(env->disc, e); 28844887Schin drop(env->disc, f); 28854887Schin if (!env->error) 28864887Schin env->error = REG_ENULL; 28874887Schin return 0; 28884887Schin } 28894887Schin 28904887Schin /* 28914887Schin * add v to REX_BM tables 28924887Schin */ 28934887Schin 28944887Schin static void 28954887Schin bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 28964887Schin { 28974887Schin int c; 28984887Schin int m; 28994887Schin size_t z; 29004887Schin 29014887Schin for (m = 0; m < n; m++) 29024887Schin { 29034887Schin if (!(z = n - m - 1)) 29044887Schin z = HIT; 29054887Schin c = v[m]; 29064887Schin a->re.bm.mask[m][c] |= b; 29074887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 29084887Schin a->re.bm.skip[c] = z; 29094887Schin if (a->flags & REG_ICASE) 29104887Schin { 29114887Schin if (isupper(c)) 29124887Schin c = tolower(c); 29134887Schin else if (islower(c)) 29144887Schin c = toupper(c); 29154887Schin else 29164887Schin continue; 29174887Schin a->re.bm.mask[m][c] |= b; 29184887Schin if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 29194887Schin a->re.bm.skip[c] = z; 29204887Schin } 29214887Schin } 29224887Schin } 29234887Schin 29244887Schin /* 29254887Schin * set up BM table from trie 29264887Schin */ 29274887Schin 29284887Schin static int 29294887Schin bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 29304887Schin { 29314887Schin do 29324887Schin { 29334887Schin v[m] = x->c; 29344887Schin if (m >= (n - 1)) 29354887Schin { 29364887Schin bmstr(env, a, v, n, b); 29374887Schin if (!(b <<= 1)) 29384887Schin { 29394887Schin b = 1; 29404887Schin a->re.bm.complete = 0; 29414887Schin } 29424887Schin else if (x->son) 29434887Schin a->re.bm.complete = 0; 29444887Schin } 29454887Schin else if (x->son) 29464887Schin b = bmtrie(env, a, v, x->son, n, m + 1, b); 29474887Schin } while (x = x->sib); 29484887Schin return b; 29494887Schin } 29504887Schin 29514887Schin /* 29524887Schin * rewrite the expression tree for some special cases 29534887Schin * 1. it is a null expression - illegal 29544887Schin * 2. max length fixed string found -- use BM algorithm 29554887Schin * 3. it begins with an unanchored string - use KMP algorithm 29564887Schin * 0 returned on success 29574887Schin */ 29584887Schin 29594887Schin static int 29604887Schin special(Cenv_t* env, regex_t* p) 29614887Schin { 29624887Schin Rex_t* a; 29634887Schin Rex_t* e; 29644887Schin Rex_t* t; 29654887Schin Rex_t* x; 29664887Schin Rex_t* y; 29674887Schin unsigned char* s; 29684887Schin int* f; 29694887Schin int n; 29704887Schin int m; 29714887Schin int k; 29724887Schin 29734887Schin DEBUG_INIT(); 29744887Schin if (e = p->env->rex) 29754887Schin { 29764887Schin if ((x = env->stats.x) && x->re.string.size < 3) 29774887Schin x = 0; 29784887Schin if ((t = env->stats.y) && t->re.trie.min < 3) 29794887Schin t = 0; 29804887Schin if (x && t) 29814887Schin { 29824887Schin if (x->re.string.size >= t->re.trie.min) 29834887Schin t = 0; 29844887Schin else 29854887Schin x = 0; 29864887Schin } 29878462SApril.Chin@Sun.COM if (x || t) 29884887Schin { 29894887Schin Bm_mask_t** mask; 29904887Schin Bm_mask_t* h; 29914887Schin unsigned char* v; 29924887Schin size_t* q; 29934887Schin unsigned long l; 29944887Schin int i; 29954887Schin int j; 29964887Schin 29974887Schin if (x) 29984887Schin { 29994887Schin y = x; 30004887Schin n = m = x->re.string.size; 30014887Schin l = env->stats.l; 30024887Schin } 30034887Schin else 30044887Schin { 30054887Schin y = t; 30064887Schin n = t->re.trie.min; 30074887Schin m = t->re.trie.max; 30084887Schin l = env->stats.k; 30094887Schin } 30104887Schin if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 30114887Schin return 1; 30124887Schin if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 30134887Schin { 30144887Schin alloc(env->disc, q, 0); 30154887Schin return 1; 30164887Schin } 30174887Schin a->flags = y->flags; 30184887Schin a->map = y->map; 30194887Schin a->re.bm.size = n; 30204887Schin a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 30214887Schin a->re.bm.left = l - 1; 30224887Schin a->re.bm.right = env->stats.m - l - n; 3023*12068SRoger.Faulkner@Oracle.COM a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 30244887Schin h = (Bm_mask_t*)&a->re.bm.mask[n]; 30254887Schin a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 30264887Schin a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 30274887Schin for (m = 0; m <= UCHAR_MAX; m++) 30284887Schin a->re.bm.skip[m] = n; 30294887Schin a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 30304887Schin for (i = 1; i <= n; i++) 30314887Schin a->re.bm.fail[i] = 2 * n - i; 30324887Schin mask = a->re.bm.mask; 30334887Schin for (m = 0; m < n; m++) 30344887Schin { 30354887Schin mask[m] = h; 30364887Schin h += UCHAR_MAX + 1; 30374887Schin } 30384887Schin if (x) 30394887Schin bmstr(env, a, x->re.string.base, n, 1); 30404887Schin else 30414887Schin { 30424887Schin v = (unsigned char*)q; 30434887Schin memset(v, 0, n); 30444887Schin m = 1; 30454887Schin for (i = 0; i <= UCHAR_MAX; i++) 30464887Schin if (t->re.trie.root[i]) 30474887Schin m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 30484887Schin } 30494887Schin mask--; 30504887Schin memset(q, 0, n * sizeof(*q)); 30514887Schin for (k = (j = n) + 1; j > 0; j--, k--) 30524887Schin { 30534887Schin DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 30544887Schin for (q[j] = k; k <= n; k = q[k]) 30554887Schin { 30564887Schin for (m = 0; m <= UCHAR_MAX; m++) 30574887Schin if (mask[k][m] == mask[j][m]) 30584887Schin { 30594887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 30604887Schin goto cut; 30614887Schin } 30624887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 30634887Schin if (a->re.bm.fail[k] > n - j) 30644887Schin a->re.bm.fail[k] = n - j; 30654887Schin } 30664887Schin cut: ; 30674887Schin } 30684887Schin for (i = 1; i <= n; i++) 30694887Schin if (a->re.bm.fail[i] > n + k - i) 30704887Schin { 30714887Schin DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 30724887Schin a->re.bm.fail[i] = n + k - i; 30734887Schin } 30744887Schin #if _AST_REGEX_DEBUG 30754887Schin if (DEBUG_TEST(0x0020,(1),(0))) 30764887Schin { 30774887Schin sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 30784887Schin for (m = 0; m < n; m++) 30794887Schin for (i = 1; i <= UCHAR_MAX; i++) 30804887Schin if (a->re.bm.mask[m][i]) 30814887Schin sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 30824887Schin for (i = ' '; i <= UCHAR_MAX; i++) 30834887Schin if (a->re.bm.skip[i] >= HIT) 30844887Schin sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 30854887Schin else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 30864887Schin sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 30874887Schin for (j = 31; j >= 0; j--) 30884887Schin { 30894887Schin sfprintf(sfstderr, " "); 30904887Schin next: 30914887Schin for (m = 0; m < n; m++) 30924887Schin { 30934887Schin for (i = 0040; i < 0177; i++) 30944887Schin if (a->re.bm.mask[m][i] & (1 << j)) 30954887Schin { 30964887Schin sfprintf(sfstderr, " %c", i); 30974887Schin break; 30984887Schin } 30994887Schin if (i >= 0177) 31004887Schin { 31014887Schin if (j > 0) 31024887Schin { 31034887Schin j--; 31044887Schin goto next; 31054887Schin } 31064887Schin sfprintf(sfstderr, " ?"); 31074887Schin } 31084887Schin } 31094887Schin sfprintf(sfstderr, "\n"); 31104887Schin } 31114887Schin sfprintf(sfstderr, "FAIL: "); 31124887Schin for (m = 1; m <= n; m++) 31134887Schin sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 31144887Schin sfprintf(sfstderr, "\n"); 31154887Schin } 31164887Schin #endif 31174887Schin alloc(env->disc, q, 0); 31184887Schin a->next = e; 31194887Schin p->env->rex = a; 31204887Schin return 0; 31214887Schin } 31224887Schin switch (e->type) 31234887Schin { 31244887Schin case REX_BEG: 31254887Schin if (env->flags & REG_NEWLINE) 31264887Schin return 0; 31274887Schin break; 31284887Schin case REX_GROUP: 31294887Schin if (env->stats.b) 31304887Schin return 0; 31314887Schin e = e->re.group.expr.rex; 31324887Schin if (e->type != REX_DOT) 31334887Schin return 0; 31344887Schin /*FALLTHROUGH*/ 31354887Schin case REX_DOT: 31364887Schin if (e->lo == 0 && e->hi == RE_DUP_INF) 31374887Schin break; 31384887Schin return 0; 31394887Schin case REX_NULL: 31404887Schin if (env->flags & REG_NULL) 31414887Schin break; 31424887Schin env->error = REG_ENULL; 31434887Schin return 1; 31444887Schin case REX_STRING: 31458462SApril.Chin@Sun.COM if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) 31464887Schin return 0; 31474887Schin s = e->re.string.base; 31484887Schin n = e->re.string.size; 31494887Schin if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 31504887Schin return 1; 31514887Schin a->flags = e->flags; 31524887Schin a->map = e->map; 31534887Schin f = a->re.string.fail; 31544887Schin memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 31554887Schin s = a->re.string.base; 31564887Schin a->re.string.size = n; 31574887Schin f[0] = m = -1; 31584887Schin for (k = 1; k < n; k++) 31594887Schin { 31604887Schin while (m >= 0 && s[m+1] != s[k]) 31614887Schin m = f[m]; 31624887Schin if (s[m+1] == s[k]) 31634887Schin m++; 31644887Schin f[k] = m; 31654887Schin } 31664887Schin a->next = e->next; 31674887Schin p->env->rex = a; 31684887Schin e->next = 0; 31694887Schin drop(env->disc, e); 31704887Schin break; 31714887Schin default: 31724887Schin return 0; 31734887Schin } 31744887Schin } 31754887Schin p->env->once = 1; 31764887Schin return 0; 31774887Schin } 31784887Schin 31794887Schin int 31804887Schin regcomp(regex_t* p, const char* pattern, regflags_t flags) 31814887Schin { 31824887Schin Rex_t* e; 31834887Schin Rex_t* f; 31844887Schin regdisc_t* disc; 31854887Schin unsigned char* fold; 31864887Schin int i; 31874887Schin Cenv_t env; 31884887Schin 31894887Schin if (!p) 31904887Schin return REG_BADPAT; 31914887Schin if (flags & REG_DISCIPLINE) 31924887Schin { 31934887Schin flags &= ~REG_DISCIPLINE; 31944887Schin disc = p->re_disc; 31954887Schin } 31964887Schin else 31974887Schin disc = &state.disc; 31984887Schin if (!disc->re_errorlevel) 31994887Schin disc->re_errorlevel = 2; 32004887Schin p->env = 0; 32014887Schin if (!pattern) 32024887Schin return fatal(disc, REG_BADPAT, pattern); 32034887Schin if (!state.initialized) 32044887Schin { 32054887Schin state.initialized = 1; 32064887Schin for (i = 0; i < elementsof(state.escape); i++) 32074887Schin state.magic[state.escape[i].key] = state.escape[i].val; 32084887Schin } 32094887Schin if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 32104887Schin { 32114887Schin if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 32124887Schin return fatal(disc, REG_ESPACE, pattern); 32134887Schin for (i = 0; i <= UCHAR_MAX; i++) 32144887Schin fold[i] = toupper(i); 32154887Schin LCINFO(AST_LC_CTYPE)->data = (void*)fold; 32164887Schin } 32174887Schin again: 32184887Schin if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 32194887Schin return fatal(disc, REG_ESPACE, pattern); 32204887Schin memset(p->env, 0, sizeof(*p->env)); 32214887Schin memset(&env, 0, sizeof(env)); 32224887Schin env.regex = p; 32234887Schin env.flags = flags; 32244887Schin env.disc = p->env->disc = disc; 32254887Schin if (env.flags & REG_AUGMENTED) 32264887Schin env.flags |= REG_EXTENDED; 32274887Schin env.mappeddot = '.'; 32284887Schin env.mappednewline = '\n'; 32294887Schin env.mappedslash = '/'; 32304887Schin if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 32314887Schin { 32324887Schin env.map = disc->re_map; 32334887Schin env.MAP = p->env->fold; 32344887Schin for (i = 0; i <= UCHAR_MAX; i++) 32354887Schin { 32364887Schin env.MAP[i] = fold[env.map[i]]; 32374887Schin if (env.map[i] == '.') 32384887Schin env.mappeddot = i; 32394887Schin if (env.map[i] == '\n') 32404887Schin env.mappednewline = i; 32414887Schin if (env.map[i] == '/') 32424887Schin env.mappedslash = i; 32434887Schin } 32444887Schin } 32454887Schin else 32464887Schin env.MAP = fold; 32474887Schin env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 32484887Schin env.explicit = -1; 32494887Schin if (env.flags & REG_SHELL) 32504887Schin { 32514887Schin if (env.flags & REG_SHELL_PATH) 32524887Schin env.explicit = env.mappedslash; 32534887Schin env.flags |= REG_LENIENT|REG_NULL; 32544887Schin env.type = env.type == BRE ? SRE : KRE; 32554887Schin } 32564887Schin if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 32574887Schin env.explicit = env.mappednewline; 32584887Schin p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 32594887Schin env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 326010898Sroland.mainz@nrubsig.org env.regexp = !!(env.flags & REG_REGEXP); 32614887Schin env.token.lex = 0; 32624887Schin env.token.push = 0; 32634887Schin if (env.flags & REG_DELIMITED) 32644887Schin { 32654887Schin switch (env.delimiter = *pattern++) 32664887Schin { 32674887Schin case 0: 32684887Schin case '\\': 32694887Schin case '\n': 32704887Schin case '\r': 32714887Schin env.error = REG_EDELIM; 32724887Schin goto bad; 32734887Schin } 32744887Schin env.terminator = '\n'; 32754887Schin } 32764887Schin env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 32774887Schin if (!(p->env->rex = alt(&env, 1, 0))) 32784887Schin goto bad; 32794887Schin if (env.parnest) 32804887Schin { 32814887Schin env.error = REG_EPAREN; 32824887Schin goto bad; 32834887Schin } 32844887Schin p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); 32854887Schin if (env.flags & REG_LEFT) 32864887Schin { 32874887Schin if (p->env->rex->type != REX_BEG) 32884887Schin { 32894887Schin if (p->env->rex->type == REX_ALT) 32904887Schin env.flags &= ~REG_FIRST; 32914887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0))) 32924887Schin { 32934887Schin regfree(p); 32944887Schin return fatal(disc, REG_ESPACE, pattern); 32954887Schin } 32964887Schin e->next = p->env->rex; 32974887Schin p->env->rex = e; 32984887Schin p->env->once = 1; 32994887Schin } 33004887Schin p->env->stats.re_flags |= REG_LEFT; 33014887Schin } 33024887Schin for (e = p->env->rex; e->next; e = e->next); 33034887Schin p->env->done.type = REX_DONE; 33044887Schin p->env->done.flags = e->flags; 33054887Schin if (env.flags & REG_RIGHT) 33064887Schin { 33074887Schin if (e->type != REX_END) 33084887Schin { 33094887Schin if (p->env->rex->type == REX_ALT) 33104887Schin env.flags &= ~REG_FIRST; 33114887Schin if (!(f = node(&env, REX_END, 0, 0, 0))) 33124887Schin { 33134887Schin regfree(p); 33144887Schin return fatal(disc, REG_ESPACE, pattern); 33154887Schin } 33164887Schin f->flags = e->flags; 33174887Schin f->map = e->map; 33184887Schin e->next = f; 33194887Schin } 33204887Schin p->env->stats.re_flags |= REG_RIGHT; 33214887Schin } 33224887Schin if (stats(&env, p->env->rex)) 33234887Schin { 33244887Schin if (!env.error) 33254887Schin env.error = REG_ECOUNT; 33264887Schin goto bad; 33274887Schin } 33284887Schin if (env.stats.b) 33294887Schin p->env->hard = p->env->separate = 1; 33304887Schin else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 33314887Schin p->env->hard = 1; 33324887Schin if (p->env->hard || env.stats.c || env.stats.i) 33334887Schin p->env->stats.re_min = p->env->stats.re_max = -1; 33344887Schin else 33354887Schin { 33364887Schin if (!(p->env->stats.re_min = env.stats.m)) 33374887Schin p->env->stats.re_min = -1; 33384887Schin if (!(p->env->stats.re_max = env.stats.n)) 33394887Schin p->env->stats.re_max = -1; 33404887Schin } 33414887Schin if (special(&env, p)) 33424887Schin goto bad; 33434887Schin serialize(&env, p->env->rex, 1); 33444887Schin p->re_nsub = env.stats.p; 33454887Schin if (env.type == KRE) 33464887Schin p->re_nsub /= 2; 33474887Schin if (env.flags & REG_DELIMITED) 33484887Schin { 33494887Schin p->re_npat = env.cursor - env.pattern + 1; 33504887Schin if (*env.cursor == env.delimiter) 33514887Schin p->re_npat++; 33524887Schin else if (env.flags & REG_MUSTDELIM) 33534887Schin { 33544887Schin env.error = REG_EDELIM; 33554887Schin goto bad; 33564887Schin } 33574887Schin else 33584887Schin env.flags &= ~REG_DELIMITED; 33594887Schin } 33604887Schin p->env->explicit = env.explicit; 33614887Schin p->env->flags = env.flags & REG_COMP; 33624887Schin p->env->min = env.stats.m; 33634887Schin p->env->nsub = env.stats.p + env.stats.u; 33644887Schin p->env->refs = 1; 33654887Schin return 0; 33664887Schin bad: 33674887Schin regfree(p); 33684887Schin if (!env.error) 33694887Schin env.error = REG_ESPACE; 33704887Schin if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 33714887Schin { 33724887Schin flags |= REG_LITERAL; 33734887Schin pattern = (const char*)env.literal; 33744887Schin goto again; 33754887Schin } 33764887Schin return fatal(disc, env.error, pattern); 33774887Schin } 33784887Schin 33794887Schin /* 33804887Schin * regcomp() on sized pattern 33814887Schin * the lazy way around adding and checking env.end 33824887Schin */ 33834887Schin 33844887Schin int 33854887Schin regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 33864887Schin { 33874887Schin char* s; 33884887Schin int r; 33894887Schin 33904887Schin if (!(s = malloc(size + 1))) 33914887Schin return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 33924887Schin memcpy(s, pattern, size); 33934887Schin s[size] = 0; 33944887Schin r = regcomp(p, s, flags); 33954887Schin free(s); 33964887Schin return r; 33974887Schin } 33984887Schin 33994887Schin /* 34004887Schin * combine two compiled regular expressions if possible, 34014887Schin * replacing first with the combination and freeing second. 34024887Schin * return 0 on success. 34034887Schin * the only combinations handled are building a Trie 34044887Schin * from String|Kmp|Trie and String|Kmp 34054887Schin */ 34064887Schin 34074887Schin int 34084887Schin regcomb(regex_t* p, regex_t* q) 34094887Schin { 34104887Schin Rex_t* e = p->env->rex; 34114887Schin Rex_t* f = q->env->rex; 34124887Schin Rex_t* g; 3413*12068SRoger.Faulkner@Oracle.COM Rex_t* h; 34144887Schin Cenv_t env; 34154887Schin 34164887Schin if (!e || !f) 34174887Schin return fatal(p->env->disc, REG_BADPAT, NiL); 34184887Schin if (p->env->separate || q->env->separate) 34194887Schin return REG_ESUBREG; 34204887Schin memset(&env, 0, sizeof(env)); 34214887Schin env.disc = p->env->disc; 34224887Schin if (e->type == REX_BM) 34234887Schin { 34244887Schin p->env->rex = e->next; 34254887Schin e->next = 0; 34264887Schin drop(env.disc, e); 34274887Schin e = p->env->rex; 34284887Schin } 34294887Schin if (f->type == REX_BM) 34304887Schin { 34314887Schin q->env->rex = f->next; 34324887Schin f->next = 0; 34334887Schin drop(env.disc, f); 34344887Schin f = q->env->rex; 34354887Schin } 34364887Schin if (e->type == REX_BEG && f->type == REX_BEG) 34374887Schin { 34384887Schin p->env->flags |= REG_LEFT; 34394887Schin p->env->rex = e->next; 34404887Schin e->next = 0; 34414887Schin drop(env.disc, e); 34424887Schin e = p->env->rex; 34434887Schin q->env->rex = f->next; 34444887Schin f->next = 0; 34454887Schin drop(env.disc, f); 34464887Schin f = q->env->rex; 34474887Schin } 3448*12068SRoger.Faulkner@Oracle.COM for (g = e; g->next; g = g->next); 3449*12068SRoger.Faulkner@Oracle.COM for (h = f; h->next; h = h->next); 3450*12068SRoger.Faulkner@Oracle.COM if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END) 34514887Schin { 34524887Schin p->env->flags |= REG_RIGHT; 3453*12068SRoger.Faulkner@Oracle.COM drop(env.disc, g->next); 3454*12068SRoger.Faulkner@Oracle.COM g->next = 0; 3455*12068SRoger.Faulkner@Oracle.COM drop(env.disc, h->next); 3456*12068SRoger.Faulkner@Oracle.COM h->next = 0; 34574887Schin } 34584887Schin if (!(g = trie(&env, f, e))) 34594887Schin return fatal(p->env->disc, REG_BADPAT, NiL); 34604887Schin p->env->rex = g; 34614887Schin if (!q->env->once) 34624887Schin p->env->once = 0; 34634887Schin q->env->rex = 0; 34644887Schin if (p->env->flags & REG_LEFT) 34654887Schin { 34664887Schin if (!(e = node(&env, REX_BEG, 0, 0, 0))) 34674887Schin { 34684887Schin regfree(p); 34694887Schin return fatal(p->env->disc, REG_ESPACE, NiL); 34704887Schin } 34714887Schin e->next = p->env->rex; 34724887Schin p->env->rex = e; 34734887Schin p->env->once = 1; 34744887Schin } 34754887Schin if (p->env->flags & REG_RIGHT) 34764887Schin { 34774887Schin for (f = p->env->rex; f->next; f = f->next); 34784887Schin if (f->type != REX_END) 34794887Schin { 34804887Schin if (!(e = node(&env, REX_END, 0, 0, 0))) 34814887Schin { 34824887Schin regfree(p); 34834887Schin return fatal(p->env->disc, REG_ESPACE, NiL); 34844887Schin } 34854887Schin f->next = e; 34864887Schin } 34874887Schin } 34884887Schin env.explicit = p->env->explicit; 34894887Schin env.flags = p->env->flags; 34904887Schin env.disc = p->env->disc; 34914887Schin if (stats(&env, p->env->rex)) 34924887Schin { 34934887Schin regfree(p); 34944887Schin return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 34954887Schin } 34964887Schin if (special(&env, p)) 34974887Schin { 34984887Schin regfree(p); 34994887Schin return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 35004887Schin } 35014887Schin p->env->min = g->re.trie.min; 35024887Schin return 0; 35034887Schin } 35044887Schin 35054887Schin /* 35064887Schin * copy a reference of p into q 35074887Schin * p and q may then have separate regsubcomp() instantiations 35084887Schin */ 35094887Schin 35104887Schin int 35114887Schin regdup(regex_t* p, regex_t* q) 35124887Schin { 35134887Schin if (!p || !q) 35144887Schin return REG_BADPAT; 35154887Schin *q = *p; 35164887Schin p->env->refs++; 35174887Schin q->re_sub = 0; 35184887Schin return 0; 35194887Schin } 3520