1*4887Schin /*********************************************************************** 2*4887Schin * * 3*4887Schin * This software is part of the ast package * 4*4887Schin * Copyright (c) 1985-2007 AT&T Knowledge Ventures * 5*4887Schin * and is licensed under the * 6*4887Schin * Common Public License, Version 1.0 * 7*4887Schin * by AT&T Knowledge Ventures * 8*4887Schin * * 9*4887Schin * A copy of the License is available at * 10*4887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 11*4887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12*4887Schin * * 13*4887Schin * Information and Software Systems Research * 14*4887Schin * AT&T Research * 15*4887Schin * Florham Park NJ * 16*4887Schin * * 17*4887Schin * Glenn Fowler <gsf@research.att.com> * 18*4887Schin * David Korn <dgk@research.att.com> * 19*4887Schin * Phong Vo <kpv@research.att.com> * 20*4887Schin * * 21*4887Schin ***********************************************************************/ 22*4887Schin #pragma prototyped 23*4887Schin 24*4887Schin /* 25*4887Schin * posix regex decompiler 26*4887Schin */ 27*4887Schin 28*4887Schin #include "reglib.h" 29*4887Schin 30*4887Schin #undef ismeta 31*4887Schin #define ismeta(c,t,e,d) (state.magic[c] && state.magic[c][(t)+(e)] >= T_META || (c) == (d)) 32*4887Schin #define meta(f,c,t,e,d) do { if (ismeta(c,t,e,d)) sfputc(f, '\\'); sfputc(f, c); } while (0) 33*4887Schin 34*4887Schin static void 35*4887Schin detrie(Trie_node_t* x, Sfio_t* sp, char* b, char* p, char* e, int delimiter) 36*4887Schin { 37*4887Schin register Trie_node_t* y; 38*4887Schin char* o; 39*4887Schin int k; 40*4887Schin 41*4887Schin o = p; 42*4887Schin k = 1; 43*4887Schin do 44*4887Schin { 45*4887Schin if (k) 46*4887Schin { 47*4887Schin o = p; 48*4887Schin if (p < e) 49*4887Schin *p++ = x->c; 50*4887Schin } 51*4887Schin sfputc(sp, x->c); 52*4887Schin for (y = x->sib; y; y = y->sib) 53*4887Schin { 54*4887Schin sfputc(sp, '|'); 55*4887Schin sfputc(sp, '<'); 56*4887Schin sfwrite(sp, b, p - b); 57*4887Schin sfputc(sp, '>'); 58*4887Schin detrie(y, sp, b, p, e, delimiter); 59*4887Schin } 60*4887Schin if (x->end && x->son) 61*4887Schin { 62*4887Schin sfputc(sp, '|'); 63*4887Schin sfputc(sp, '{'); 64*4887Schin sfwrite(sp, b, p - b); 65*4887Schin sfputc(sp, '}'); 66*4887Schin p = o; 67*4887Schin } 68*4887Schin } while (x = x->son); 69*4887Schin } 70*4887Schin 71*4887Schin static int 72*4887Schin decomp(register Rex_t* e, Sfio_t* sp, int type, int delimiter, regflags_t flags) 73*4887Schin { 74*4887Schin Rex_t* q; 75*4887Schin unsigned char* s; 76*4887Schin unsigned char* t; 77*4887Schin int c; 78*4887Schin int d; 79*4887Schin int cb; 80*4887Schin int cd; 81*4887Schin int cr; 82*4887Schin int ib; 83*4887Schin int ie; 84*4887Schin int nb; 85*4887Schin int ne; 86*4887Schin unsigned char ic[2*UCHAR_MAX]; 87*4887Schin unsigned char nc[2*UCHAR_MAX]; 88*4887Schin 89*4887Schin do 90*4887Schin { 91*4887Schin switch (e->type) 92*4887Schin { 93*4887Schin case REX_ALT: 94*4887Schin if (decomp(e->re.group.expr.binary.left, sp, type, delimiter, flags)) 95*4887Schin return 1; 96*4887Schin sfputc(sp, '|'); 97*4887Schin if (e->re.group.expr.binary.right && decomp(e->re.group.expr.binary.right, sp, type, delimiter, flags)) 98*4887Schin return 1; 99*4887Schin break; 100*4887Schin case REX_BACK: 101*4887Schin sfprintf(sp, "\\%d", e->lo); 102*4887Schin break; 103*4887Schin case REX_BEG: 104*4887Schin if (type < SRE) 105*4887Schin sfputc(sp, '^'); 106*4887Schin break; 107*4887Schin case REX_END: 108*4887Schin if (type < SRE) 109*4887Schin sfputc(sp, '$'); 110*4887Schin break; 111*4887Schin case REX_WBEG: 112*4887Schin meta(sp, '<', type, 1, delimiter); 113*4887Schin break; 114*4887Schin case REX_WEND: 115*4887Schin meta(sp, '<', type, 1, delimiter); 116*4887Schin break; 117*4887Schin case REX_WORD: 118*4887Schin sfprintf(sp, "\\w"); 119*4887Schin break; 120*4887Schin case REX_CLASS: 121*4887Schin case REX_COLL_CLASS: 122*4887Schin case REX_ONECHAR: 123*4887Schin case REX_DOT: 124*4887Schin case REX_REP: 125*4887Schin if (type >= SRE) 126*4887Schin { 127*4887Schin c = ')'; 128*4887Schin if (e->hi == RE_DUP_INF) 129*4887Schin { 130*4887Schin if (!e->lo) 131*4887Schin sfputc(sp, '*'); 132*4887Schin else if (e->lo == 1) 133*4887Schin sfputc(sp, '+'); 134*4887Schin else 135*4887Schin sfprintf(sp, "{%d,}", e->lo); 136*4887Schin } 137*4887Schin else if (e->hi != 1) 138*4887Schin sfprintf(sp, "{%d,%d}", e->lo, e->hi); 139*4887Schin else if (e->lo == 0) 140*4887Schin sfputc(sp, '?'); 141*4887Schin else 142*4887Schin c = 0; 143*4887Schin } 144*4887Schin switch (e->type) 145*4887Schin { 146*4887Schin case REX_REP: 147*4887Schin if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) 148*4887Schin return 1; 149*4887Schin break; 150*4887Schin case REX_CLASS: 151*4887Schin sfputc(sp, '['); 152*4887Schin nb = ne = ib = ie = -2; 153*4887Schin cb = cd = cr = 0; 154*4887Schin s = nc; 155*4887Schin t = ic; 156*4887Schin for (c = 0; c <= UCHAR_MAX; c++) 157*4887Schin if (settst(e->re.charclass, c)) 158*4887Schin { 159*4887Schin if (c == ']') 160*4887Schin cb = 1; 161*4887Schin else if (c == '-') 162*4887Schin cr = 1; 163*4887Schin else if (c == delimiter) 164*4887Schin cd = 1; 165*4887Schin else if (nb < 0) 166*4887Schin ne = nb = c; 167*4887Schin else if (ne == (c - 1)) 168*4887Schin ne = c; 169*4887Schin else 170*4887Schin { 171*4887Schin if (ne == nb) 172*4887Schin *s++ = ne; 173*4887Schin else 174*4887Schin { 175*4887Schin *s++ = nb; 176*4887Schin *s++ = '-'; 177*4887Schin *s++ = ne; 178*4887Schin } 179*4887Schin ne = nb = c; 180*4887Schin } 181*4887Schin } 182*4887Schin else 183*4887Schin { 184*4887Schin if (c == ']') 185*4887Schin cb = -1; 186*4887Schin else if (c == '-') 187*4887Schin cr = -1; 188*4887Schin else if (c == delimiter) 189*4887Schin cd = -1; 190*4887Schin else if (ib < 0) 191*4887Schin ie = ib = c; 192*4887Schin else if (ie == (c - 1)) 193*4887Schin ie = c; 194*4887Schin else 195*4887Schin { 196*4887Schin if (ie == ib) 197*4887Schin *t++ = ie; 198*4887Schin else 199*4887Schin { 200*4887Schin *t++ = ib; 201*4887Schin *t++ = '-'; 202*4887Schin *t++ = ie; 203*4887Schin } 204*4887Schin ie = ib = c; 205*4887Schin } 206*4887Schin } 207*4887Schin if (nb >= 0) 208*4887Schin { 209*4887Schin *s++ = nb; 210*4887Schin if (ne != nb) 211*4887Schin { 212*4887Schin *s++ = '-'; 213*4887Schin *s++ = ne; 214*4887Schin } 215*4887Schin } 216*4887Schin if (ib >= 0) 217*4887Schin { 218*4887Schin *t++ = ib; 219*4887Schin if (ie != ib) 220*4887Schin { 221*4887Schin *t++ = '-'; 222*4887Schin *t++ = ie; 223*4887Schin } 224*4887Schin } 225*4887Schin if ((t - ic + 1) < (s - nc + (nc[0] == '^'))) 226*4887Schin { 227*4887Schin sfputc(sp, '^'); 228*4887Schin if (cb < 0) 229*4887Schin sfputc(sp, ']'); 230*4887Schin if (cr < 0) 231*4887Schin sfputc(sp, '-'); 232*4887Schin if (cd < 0) 233*4887Schin { 234*4887Schin if (flags & REG_ESCAPE) 235*4887Schin sfputc(sp, '\\'); 236*4887Schin sfputc(sp, delimiter); 237*4887Schin } 238*4887Schin sfwrite(sp, ic, t - ic); 239*4887Schin } 240*4887Schin else 241*4887Schin { 242*4887Schin if (cb > 0) 243*4887Schin sfputc(sp, ']'); 244*4887Schin if (cr > 0) 245*4887Schin sfputc(sp, '-'); 246*4887Schin if (cd > 0) 247*4887Schin { 248*4887Schin if (flags & REG_ESCAPE) 249*4887Schin sfputc(sp, '\\'); 250*4887Schin sfputc(sp, delimiter); 251*4887Schin } 252*4887Schin if (nc[0] == '^') 253*4887Schin { 254*4887Schin sfwrite(sp, nc + 1, s - nc - 1); 255*4887Schin sfputc(sp, '^'); 256*4887Schin } 257*4887Schin else 258*4887Schin sfwrite(sp, nc, s - nc); 259*4887Schin } 260*4887Schin sfputc(sp, ']'); 261*4887Schin break; 262*4887Schin case REX_COLL_CLASS: 263*4887Schin break; 264*4887Schin case REX_ONECHAR: 265*4887Schin meta(sp, e->re.onechar, type, 0, delimiter); 266*4887Schin break; 267*4887Schin case REX_DOT: 268*4887Schin sfputc(sp, '.'); 269*4887Schin break; 270*4887Schin } 271*4887Schin if (type < SRE) 272*4887Schin { 273*4887Schin if (e->hi == RE_DUP_INF) 274*4887Schin { 275*4887Schin if (!e->lo) 276*4887Schin sfputc(sp, '*'); 277*4887Schin else if (e->lo == 1 && ismeta('+', type, 0, delimiter)) 278*4887Schin meta(sp, '+', type, 1, delimiter); 279*4887Schin else 280*4887Schin { 281*4887Schin meta(sp, '{', type, 1, delimiter); 282*4887Schin sfprintf(sp, "%d,", e->lo); 283*4887Schin meta(sp, '}', type, 1, delimiter); 284*4887Schin } 285*4887Schin } 286*4887Schin else if (e->hi != 1 || e->lo == 0 && !ismeta('?', type, 0, delimiter)) 287*4887Schin { 288*4887Schin meta(sp, '{', type, 1, delimiter); 289*4887Schin sfprintf(sp, "%d,%d", e->lo, e->hi); 290*4887Schin meta(sp, '}', type, 1, delimiter); 291*4887Schin } 292*4887Schin else if (e->lo == 0) 293*4887Schin meta(sp, '?', type, 1, delimiter); 294*4887Schin } 295*4887Schin else if (c) 296*4887Schin sfputc(sp, c); 297*4887Schin break; 298*4887Schin case REX_STRING: 299*4887Schin case REX_KMP: 300*4887Schin t = (s = e->re.string.base) + e->re.string.size; 301*4887Schin while (s < t) 302*4887Schin { 303*4887Schin c = *s++; 304*4887Schin meta(sp, c, type, 0, delimiter); 305*4887Schin } 306*4887Schin break; 307*4887Schin case REX_TRIE: 308*4887Schin ib = 0; 309*4887Schin for (c = 0; c <= UCHAR_MAX; c++) 310*4887Schin if (e->re.trie.root[c]) 311*4887Schin { 312*4887Schin char pfx[1024]; 313*4887Schin 314*4887Schin if (ib) 315*4887Schin sfputc(sp, '|'); 316*4887Schin else 317*4887Schin ib = 1; 318*4887Schin detrie(e->re.trie.root[c], sp, pfx, pfx, &pfx[sizeof(pfx)], delimiter); 319*4887Schin } 320*4887Schin break; 321*4887Schin case REX_NEG: 322*4887Schin if (type >= SRE) 323*4887Schin sfprintf(sp, "!("); 324*4887Schin if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) 325*4887Schin return 1; 326*4887Schin if (type >= SRE) 327*4887Schin sfputc(sp, ')'); 328*4887Schin else 329*4887Schin sfputc(sp, '!'); 330*4887Schin break; 331*4887Schin case REX_CONJ: 332*4887Schin if (decomp(e->re.group.expr.binary.left, sp, type, delimiter, flags)) 333*4887Schin return 1; 334*4887Schin sfputc(sp, '&'); 335*4887Schin if (decomp(e->re.group.expr.binary.right, sp, type, delimiter, flags)) 336*4887Schin return 1; 337*4887Schin break; 338*4887Schin case REX_GROUP: 339*4887Schin if (type >= SRE) 340*4887Schin sfputc(sp, '@'); 341*4887Schin meta(sp, '(', type, 1, delimiter); 342*4887Schin if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) 343*4887Schin return 1; 344*4887Schin meta(sp, ')', type, 1, delimiter); 345*4887Schin break; 346*4887Schin case REX_GROUP_AHEAD: 347*4887Schin case REX_GROUP_AHEAD_NOT: 348*4887Schin case REX_GROUP_BEHIND: 349*4887Schin case REX_GROUP_BEHIND_NOT: 350*4887Schin meta(sp, '(', type, 1, delimiter); 351*4887Schin sfputc(sp, '?'); 352*4887Schin if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) 353*4887Schin return 1; 354*4887Schin meta(sp, ')', type, 1, delimiter); 355*4887Schin break; 356*4887Schin case REX_GROUP_COND: 357*4887Schin meta(sp, '(', type, 1, delimiter); 358*4887Schin sfputc(sp, '?'); 359*4887Schin if (e->re.group.expr.binary.left && decomp(e->re.group.expr.binary.left, sp, type, delimiter, flags)) 360*4887Schin return 1; 361*4887Schin if (q = e->re.group.expr.binary.right) 362*4887Schin { 363*4887Schin sfputc(sp, ':'); 364*4887Schin if (q->re.group.expr.binary.left && decomp(q->re.group.expr.binary.left, sp, type, delimiter, flags)) 365*4887Schin return 1; 366*4887Schin sfputc(sp, ':'); 367*4887Schin if (q->re.group.expr.binary.right && decomp(q->re.group.expr.binary.right, sp, type, delimiter, flags)) 368*4887Schin return 1; 369*4887Schin } 370*4887Schin meta(sp, ')', type, 1, delimiter); 371*4887Schin break; 372*4887Schin case REX_GROUP_CUT: 373*4887Schin meta(sp, '(', type, 1, delimiter); 374*4887Schin sfputc(sp, '?'); 375*4887Schin if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) 376*4887Schin return 1; 377*4887Schin meta(sp, ')', type, 1, delimiter); 378*4887Schin break; 379*4887Schin case REX_BM: 380*4887Schin break; 381*4887Schin default: 382*4887Schin sfprintf(sp, "<ERROR:REX_%d>", e->type); 383*4887Schin break; 384*4887Schin } 385*4887Schin } while (e = e->next); 386*4887Schin return 0; 387*4887Schin } 388*4887Schin 389*4887Schin /* 390*4887Schin * reconstruct pattern from compiled re p into sp 391*4887Schin */ 392*4887Schin 393*4887Schin size_t 394*4887Schin regdecomp(regex_t* p, regflags_t flags, char* buf, size_t n) 395*4887Schin { 396*4887Schin Sfio_t* sp; 397*4887Schin char* s; 398*4887Schin int type; 399*4887Schin int delimiter; 400*4887Schin size_t r; 401*4887Schin 402*4887Schin if (!(sp = sfstropen())) 403*4887Schin return 0; 404*4887Schin if (flags < 0) 405*4887Schin flags = p->env->flags; 406*4887Schin switch (flags & (REG_AUGMENTED|REG_EXTENDED|REG_SHELL)) 407*4887Schin { 408*4887Schin case 0: 409*4887Schin type = BRE; 410*4887Schin break; 411*4887Schin case REG_AUGMENTED: 412*4887Schin case REG_AUGMENTED|REG_EXTENDED: 413*4887Schin type = ARE; 414*4887Schin break; 415*4887Schin case REG_EXTENDED: 416*4887Schin type = ERE; 417*4887Schin break; 418*4887Schin case REG_SHELL: 419*4887Schin type = SRE; 420*4887Schin break; 421*4887Schin default: 422*4887Schin type = KRE; 423*4887Schin break; 424*4887Schin } 425*4887Schin if (flags & REG_DELIMITED) 426*4887Schin { 427*4887Schin delimiter = '/'; 428*4887Schin sfputc(sp, delimiter); 429*4887Schin } 430*4887Schin else 431*4887Schin delimiter = 0; 432*4887Schin if (decomp(p->env->rex, sp, type, delimiter, flags)) 433*4887Schin r = 0; 434*4887Schin else 435*4887Schin { 436*4887Schin if (delimiter) 437*4887Schin sfputc(sp, delimiter); 438*4887Schin if ((r = sfstrtell(sp) + 1) <= n) 439*4887Schin { 440*4887Schin if (!(s = sfstruse(sp))) 441*4887Schin r = 0; 442*4887Schin else 443*4887Schin memcpy(buf, s, r); 444*4887Schin } 445*4887Schin } 446*4887Schin sfstrclose(sp); 447*4887Schin return r; 448*4887Schin } 449