1*8659Smckusick static char sccsid[] = "@(#)regexp.c 4.1 (Berkeley) 10/19/82"; 2*8659Smckusick 3*8659Smckusick #include <ctype.h> 4*8659Smckusick 5*8659Smckusick typedef int boolean; 6*8659Smckusick #define TRUE 1 7*8659Smckusick #define FALSE 0 8*8659Smckusick #define NIL 0 9*8659Smckusick 10*8659Smckusick boolean l_onecase; /* true if upper and lower equivalent */ 11*8659Smckusick 12*8659Smckusick #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 13*8659Smckusick 14*8659Smckusick /* STRNCMP - like strncmp except that we convert the 15*8659Smckusick * first string to lower case before comparing 16*8659Smckusick * if l_onecase is set. 17*8659Smckusick */ 18*8659Smckusick 19*8659Smckusick STRNCMP(s1, s2, len) 20*8659Smckusick register char *s1,*s2; 21*8659Smckusick register int len; 22*8659Smckusick { 23*8659Smckusick if (l_onecase) { 24*8659Smckusick do 25*8659Smckusick if (*s2 - makelower(*s1)) 26*8659Smckusick return (*s2 - makelower(*s1)); 27*8659Smckusick else { 28*8659Smckusick s2++; 29*8659Smckusick s1++; 30*8659Smckusick } 31*8659Smckusick while (--len); 32*8659Smckusick } else { 33*8659Smckusick do 34*8659Smckusick if (*s2 - *s1) 35*8659Smckusick return (*s2 - *s1); 36*8659Smckusick else { 37*8659Smckusick s2++; 38*8659Smckusick s1++; 39*8659Smckusick } 40*8659Smckusick while (--len); 41*8659Smckusick } 42*8659Smckusick return(0); 43*8659Smckusick } 44*8659Smckusick 45*8659Smckusick /* The following routine converts an irregular expression to 46*8659Smckusick * internal format. 47*8659Smckusick * 48*8659Smckusick * Either meta symbols (\a \d or \p) or character strings or 49*8659Smckusick * operations ( alternation or perenthesizing ) can be 50*8659Smckusick * specified. Each starts with a descriptor byte. The descriptor 51*8659Smckusick * byte has STR set for strings, META set for meta symbols 52*8659Smckusick * and OPER set for operations. 53*8659Smckusick * The descriptor byte can also have the OPT bit set if the object 54*8659Smckusick * defined is optional. Also ALT can be set to indicate an alternation. 55*8659Smckusick * 56*8659Smckusick * For metasymbols the byte following the descriptor byte identities 57*8659Smckusick * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 58*8659Smckusick * strings the byte after the descriptor is a character count for 59*8659Smckusick * the string: 60*8659Smckusick * 61*8659Smckusick * meta symbols := descriptor 62*8659Smckusick * symbol 63*8659Smckusick * 64*8659Smckusick * strings := descriptor 65*8659Smckusick * character count 66*8659Smckusick * the string 67*8659Smckusick * 68*8659Smckusick * operatins := descriptor 69*8659Smckusick * symbol 70*8659Smckusick * character count 71*8659Smckusick */ 72*8659Smckusick 73*8659Smckusick /* 74*8659Smckusick * handy macros for accessing parts of match blocks 75*8659Smckusick */ 76*8659Smckusick #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 77*8659Smckusick #define MNEXT(A) (A+2) /* character following a metasymbol block */ 78*8659Smckusick 79*8659Smckusick #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 80*8659Smckusick #define OCNT(A) (*(A+2)) /* character count */ 81*8659Smckusick #define ONEXT(A) (A+3) /* next character after the operation */ 82*8659Smckusick #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 83*8659Smckusick 84*8659Smckusick #define SCNT(A) (*(A+1)) /* byte count of a string */ 85*8659Smckusick #define SSTR(A) (A+2) /* address of the string */ 86*8659Smckusick #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 87*8659Smckusick 88*8659Smckusick /* 89*8659Smckusick * bit flags in the descriptor 90*8659Smckusick */ 91*8659Smckusick #define OPT 1 92*8659Smckusick #define STR 2 93*8659Smckusick #define META 4 94*8659Smckusick #define ALT 8 95*8659Smckusick #define OPER 16 96*8659Smckusick 97*8659Smckusick char *ure; /* pointer current position in unconverted exp */ 98*8659Smckusick char *ccre; /* pointer to current position in converted exp*/ 99*8659Smckusick char *malloc(); 100*8659Smckusick 101*8659Smckusick char * 102*8659Smckusick convexp(re) 103*8659Smckusick char *re; /* unconverted irregular expression */ 104*8659Smckusick { 105*8659Smckusick register char *cre; /* pointer to converted regular expression */ 106*8659Smckusick 107*8659Smckusick /* allocate room for the converted expression */ 108*8659Smckusick if (re == NIL) 109*8659Smckusick return (NIL); 110*8659Smckusick if (*re == '\0') 111*8659Smckusick return (NIL); 112*8659Smckusick cre = malloc (4 * strlen(re) + 3); 113*8659Smckusick ccre = cre; 114*8659Smckusick ure = re; 115*8659Smckusick 116*8659Smckusick /* start the conversion with a \a */ 117*8659Smckusick *cre = META | OPT; 118*8659Smckusick MSYM(cre) = 'a'; 119*8659Smckusick ccre = MNEXT(cre); 120*8659Smckusick 121*8659Smckusick /* start the conversion (its recursive) */ 122*8659Smckusick expconv (); 123*8659Smckusick *ccre = 0; 124*8659Smckusick return (cre); 125*8659Smckusick } 126*8659Smckusick 127*8659Smckusick expconv() 128*8659Smckusick { 129*8659Smckusick register char *cs; /* pointer to current symbol in converted exp */ 130*8659Smckusick register char c; /* character being processed */ 131*8659Smckusick register char *acs; /* pinter to last alternate */ 132*8659Smckusick register int temp; 133*8659Smckusick 134*8659Smckusick /* let the conversion begin */ 135*8659Smckusick acs = NIL; 136*8659Smckusick while (*ure != NIL) { 137*8659Smckusick switch (c = *ure++) { 138*8659Smckusick 139*8659Smckusick case '\\': 140*8659Smckusick switch (c = *ure++) { 141*8659Smckusick 142*8659Smckusick /* escaped characters are just characters */ 143*8659Smckusick default: 144*8659Smckusick if ((*cs & STR) == 0) { 145*8659Smckusick cs = ccre; 146*8659Smckusick *cs = STR; 147*8659Smckusick SCNT(cs) = 1; 148*8659Smckusick ccre += 2; 149*8659Smckusick } else 150*8659Smckusick SCNT(cs)++; 151*8659Smckusick *ccre++ = c; 152*8659Smckusick break; 153*8659Smckusick 154*8659Smckusick /* normal(?) metacharacters */ 155*8659Smckusick case 'a': 156*8659Smckusick case 'd': 157*8659Smckusick case 'e': 158*8659Smckusick case 'p': 159*8659Smckusick if (acs != NIL && acs != cs) { 160*8659Smckusick do { 161*8659Smckusick temp = OCNT(acs); 162*8659Smckusick OCNT(acs) = ccre - acs; 163*8659Smckusick acs -= temp; 164*8659Smckusick } while (temp != 0); 165*8659Smckusick acs = NIL; 166*8659Smckusick } 167*8659Smckusick cs = ccre; 168*8659Smckusick *cs = META; 169*8659Smckusick MSYM(cs) = c; 170*8659Smckusick ccre = MNEXT(cs); 171*8659Smckusick break; 172*8659Smckusick } 173*8659Smckusick break; 174*8659Smckusick 175*8659Smckusick /* just put the symbol in */ 176*8659Smckusick case '^': 177*8659Smckusick case '$': 178*8659Smckusick if (acs != NIL && acs != cs) { 179*8659Smckusick do { 180*8659Smckusick temp = OCNT(acs); 181*8659Smckusick OCNT(acs) = ccre - acs; 182*8659Smckusick acs -= temp; 183*8659Smckusick } while (temp != 0); 184*8659Smckusick acs = NIL; 185*8659Smckusick } 186*8659Smckusick cs = ccre; 187*8659Smckusick *cs = META; 188*8659Smckusick MSYM(cs) = c; 189*8659Smckusick ccre = MNEXT(cs); 190*8659Smckusick break; 191*8659Smckusick 192*8659Smckusick /* mark the last match sequence as optional */ 193*8659Smckusick case '?': 194*8659Smckusick *cs = *cs | OPT; 195*8659Smckusick break; 196*8659Smckusick 197*8659Smckusick /* recurse and define a subexpression */ 198*8659Smckusick case '(': 199*8659Smckusick if (acs != NIL && acs != cs) { 200*8659Smckusick do { 201*8659Smckusick temp = OCNT(acs); 202*8659Smckusick OCNT(acs) = ccre - acs; 203*8659Smckusick acs -= temp; 204*8659Smckusick } while (temp != 0); 205*8659Smckusick acs = NIL; 206*8659Smckusick } 207*8659Smckusick cs = ccre; 208*8659Smckusick *cs = OPER; 209*8659Smckusick OSYM(cs) = '('; 210*8659Smckusick ccre = ONEXT(cs); 211*8659Smckusick expconv (); 212*8659Smckusick OCNT(cs) = ccre - cs; /* offset to next symbol */ 213*8659Smckusick break; 214*8659Smckusick 215*8659Smckusick /* return from a recursion */ 216*8659Smckusick case ')': 217*8659Smckusick if (acs != NIL) { 218*8659Smckusick do { 219*8659Smckusick temp = OCNT(acs); 220*8659Smckusick OCNT(acs) = ccre - acs; 221*8659Smckusick acs -= temp; 222*8659Smckusick } while (temp != 0); 223*8659Smckusick acs = NIL; 224*8659Smckusick } 225*8659Smckusick cs = ccre; 226*8659Smckusick *cs = META; 227*8659Smckusick MSYM(cs) = c; 228*8659Smckusick ccre = MNEXT(cs); 229*8659Smckusick return; 230*8659Smckusick 231*8659Smckusick /* mark the last match sequence as having an alternate */ 232*8659Smckusick /* the third byte will contain an offset to jump over the */ 233*8659Smckusick /* alternate match in case the first did not fail */ 234*8659Smckusick case '|': 235*8659Smckusick if (acs != NIL && acs != cs) 236*8659Smckusick OCNT(ccre) = ccre - acs; /* make a back pointer */ 237*8659Smckusick else 238*8659Smckusick OCNT(ccre) = 0; 239*8659Smckusick *cs |= ALT; 240*8659Smckusick cs = ccre; 241*8659Smckusick *cs = OPER; 242*8659Smckusick OSYM(cs) = '|'; 243*8659Smckusick ccre = ONEXT(cs); 244*8659Smckusick acs = cs; /* remember that the pointer is to be filles */ 245*8659Smckusick break; 246*8659Smckusick 247*8659Smckusick /* if its not a metasymbol just build a scharacter string */ 248*8659Smckusick default: 249*8659Smckusick if ((*cs & STR) == 0) { 250*8659Smckusick cs = ccre; 251*8659Smckusick *cs = STR; 252*8659Smckusick SCNT(cs) = 1; 253*8659Smckusick ccre = SSTR(cs); 254*8659Smckusick } else 255*8659Smckusick SCNT(cs)++; 256*8659Smckusick *ccre++ = c; 257*8659Smckusick break; 258*8659Smckusick } 259*8659Smckusick } 260*8659Smckusick if (acs != NIL) { 261*8659Smckusick do { 262*8659Smckusick temp = OCNT(acs); 263*8659Smckusick OCNT(acs) = ccre - acs; 264*8659Smckusick acs -= temp; 265*8659Smckusick } while (temp != 0); 266*8659Smckusick acs = NIL; 267*8659Smckusick } 268*8659Smckusick return; 269*8659Smckusick } 270*8659Smckusick /* end of convertre */ 271*8659Smckusick 272*8659Smckusick 273*8659Smckusick /* 274*8659Smckusick * The following routine recognises an irregular expresion 275*8659Smckusick * with the following special characters: 276*8659Smckusick * 277*8659Smckusick * \? - means last match was optional 278*8659Smckusick * \a - matches any number of characters 279*8659Smckusick * \d - matches any number of spaces and tabs 280*8659Smckusick * \p - matches any number of alphanumeric 281*8659Smckusick * characters. The 282*8659Smckusick * characters matched will be copied into 283*8659Smckusick * the area pointed to by 'name'. 284*8659Smckusick * \| - alternation 285*8659Smckusick * \( \) - grouping used mostly for alternation and 286*8659Smckusick * optionality 287*8659Smckusick * 288*8659Smckusick * The irregular expression must be translated to internal form 289*8659Smckusick * prior to calling this routine 290*8659Smckusick * 291*8659Smckusick * The value returned is the pointer to the first non \a 292*8659Smckusick * character matched. 293*8659Smckusick */ 294*8659Smckusick 295*8659Smckusick boolean _escaped; /* true if we are currently _escaped */ 296*8659Smckusick char *_start; /* start of string */ 297*8659Smckusick 298*8659Smckusick char * 299*8659Smckusick expmatch (s, re, mstring) 300*8659Smckusick register char *s; /* string to check for a match in */ 301*8659Smckusick register char *re; /* a converted irregular expression */ 302*8659Smckusick register char *mstring; /* where to put whatever matches a \p */ 303*8659Smckusick { 304*8659Smckusick register char *cs; /* the current symbol */ 305*8659Smckusick register char *ptr,*s1; /* temporary pointer */ 306*8659Smckusick boolean matched; /* a temporary boolean */ 307*8659Smckusick 308*8659Smckusick /* initial conditions */ 309*8659Smckusick if (re == NIL) 310*8659Smckusick return (NIL); 311*8659Smckusick cs = re; 312*8659Smckusick matched = FALSE; 313*8659Smckusick 314*8659Smckusick /* loop till expression string is exhausted (or at least pretty tired) */ 315*8659Smckusick while (*cs) { 316*8659Smckusick switch (*cs & (OPER | STR | META)) { 317*8659Smckusick 318*8659Smckusick /* try to match a string */ 319*8659Smckusick case STR: 320*8659Smckusick matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 321*8659Smckusick if (matched) { 322*8659Smckusick 323*8659Smckusick /* hoorah it matches */ 324*8659Smckusick s += SCNT(cs); 325*8659Smckusick cs = SNEXT(cs); 326*8659Smckusick } else if (*cs & ALT) { 327*8659Smckusick 328*8659Smckusick /* alternation, skip to next expression */ 329*8659Smckusick cs = SNEXT(cs); 330*8659Smckusick } else if (*cs & OPT) { 331*8659Smckusick 332*8659Smckusick /* the match is optional */ 333*8659Smckusick cs = SNEXT(cs); 334*8659Smckusick matched = 1; /* indicate a successful match */ 335*8659Smckusick } else { 336*8659Smckusick 337*8659Smckusick /* no match, error return */ 338*8659Smckusick return (NIL); 339*8659Smckusick } 340*8659Smckusick break; 341*8659Smckusick 342*8659Smckusick /* an operator, do something fancy */ 343*8659Smckusick case OPER: 344*8659Smckusick switch (OSYM(cs)) { 345*8659Smckusick 346*8659Smckusick /* this is an alternation */ 347*8659Smckusick case '|': 348*8659Smckusick if (matched) 349*8659Smckusick 350*8659Smckusick /* last thing in the alternation was a match, skip ahead */ 351*8659Smckusick cs = OPTR(cs); 352*8659Smckusick else 353*8659Smckusick 354*8659Smckusick /* no match, keep trying */ 355*8659Smckusick cs = ONEXT(cs); 356*8659Smckusick break; 357*8659Smckusick 358*8659Smckusick /* this is a grouping, recurse */ 359*8659Smckusick case '(': 360*8659Smckusick ptr = expmatch (s, ONEXT(cs), mstring); 361*8659Smckusick if (ptr != NIL) { 362*8659Smckusick 363*8659Smckusick /* the subexpression matched */ 364*8659Smckusick matched = 1; 365*8659Smckusick s = ptr; 366*8659Smckusick } else if (*cs & ALT) { 367*8659Smckusick 368*8659Smckusick /* alternation, skip to next expression */ 369*8659Smckusick matched = 0; 370*8659Smckusick } else if (*cs & OPT) { 371*8659Smckusick 372*8659Smckusick /* the match is optional */ 373*8659Smckusick matched = 1; /* indicate a successful match */ 374*8659Smckusick } else { 375*8659Smckusick 376*8659Smckusick /* no match, error return */ 377*8659Smckusick return (NIL); 378*8659Smckusick } 379*8659Smckusick cs = OPTR(cs); 380*8659Smckusick break; 381*8659Smckusick } 382*8659Smckusick break; 383*8659Smckusick 384*8659Smckusick /* try to match a metasymbol */ 385*8659Smckusick case META: 386*8659Smckusick switch (MSYM(cs)) { 387*8659Smckusick 388*8659Smckusick /* try to match anything and remember what was matched */ 389*8659Smckusick case 'p': 390*8659Smckusick /* 391*8659Smckusick * This is really the same as trying the match the 392*8659Smckusick * remaining parts of the expression to any subset 393*8659Smckusick * of the string. 394*8659Smckusick */ 395*8659Smckusick s1 = s; 396*8659Smckusick do { 397*8659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 398*8659Smckusick if (ptr != NIL && s1 != s) { 399*8659Smckusick 400*8659Smckusick /* we have a match, remember the match */ 401*8659Smckusick strncpy (mstring, s, s1 - s); 402*8659Smckusick mstring[s1 - s] = '\0'; 403*8659Smckusick return (ptr); 404*8659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 405*8659Smckusick 406*8659Smckusick /* it was aoptional so no match is ok */ 407*8659Smckusick return (ptr); 408*8659Smckusick } else if (ptr != NIL) { 409*8659Smckusick 410*8659Smckusick /* not optional and we still matched */ 411*8659Smckusick return (NIL); 412*8659Smckusick } 413*8659Smckusick if (!isalnum(*s1) && *s1 != '_') 414*8659Smckusick return (NIL); 415*8659Smckusick if (*s1 == '\\') 416*8659Smckusick _escaped = _escaped ? FALSE : TRUE; 417*8659Smckusick else 418*8659Smckusick _escaped = FALSE; 419*8659Smckusick } while (*s1++); 420*8659Smckusick return (NIL); 421*8659Smckusick 422*8659Smckusick /* try to match anything */ 423*8659Smckusick case 'a': 424*8659Smckusick /* 425*8659Smckusick * This is really the same as trying the match the 426*8659Smckusick * remaining parts of the expression to any subset 427*8659Smckusick * of the string. 428*8659Smckusick */ 429*8659Smckusick s1 = s; 430*8659Smckusick do { 431*8659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 432*8659Smckusick if (ptr != NIL && s1 != s) { 433*8659Smckusick 434*8659Smckusick /* we have a match */ 435*8659Smckusick return (ptr); 436*8659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 437*8659Smckusick 438*8659Smckusick /* it was aoptional so no match is ok */ 439*8659Smckusick return (ptr); 440*8659Smckusick } else if (ptr != NIL) { 441*8659Smckusick 442*8659Smckusick /* not optional and we still matched */ 443*8659Smckusick return (NIL); 444*8659Smckusick } 445*8659Smckusick if (*s1 == '\\') 446*8659Smckusick _escaped = _escaped ? FALSE : TRUE; 447*8659Smckusick else 448*8659Smckusick _escaped = FALSE; 449*8659Smckusick } while (*s1++); 450*8659Smckusick return (NIL); 451*8659Smckusick 452*8659Smckusick /* fail if we are currently _escaped */ 453*8659Smckusick case 'e': 454*8659Smckusick if (_escaped) 455*8659Smckusick return(NIL); 456*8659Smckusick cs = MNEXT(cs); 457*8659Smckusick break; 458*8659Smckusick 459*8659Smckusick /* match any number of tabs and spaces */ 460*8659Smckusick case 'd': 461*8659Smckusick ptr = s; 462*8659Smckusick while (*s == ' ' || *s == '\t') 463*8659Smckusick s++; 464*8659Smckusick if (s != ptr || s == _start) { 465*8659Smckusick 466*8659Smckusick /* match, be happy */ 467*8659Smckusick matched = 1; 468*8659Smckusick cs = MNEXT(cs); 469*8659Smckusick } else if (*s == '\n' || *s == '\0') { 470*8659Smckusick 471*8659Smckusick /* match, be happy */ 472*8659Smckusick matched = 1; 473*8659Smckusick cs = MNEXT(cs); 474*8659Smckusick } else if (*cs & ALT) { 475*8659Smckusick 476*8659Smckusick /* try the next part */ 477*8659Smckusick matched = 0; 478*8659Smckusick cs = MNEXT(cs); 479*8659Smckusick } else if (*cs & OPT) { 480*8659Smckusick 481*8659Smckusick /* doesn't matter */ 482*8659Smckusick matched = 1; 483*8659Smckusick cs = MNEXT(cs); 484*8659Smckusick } else 485*8659Smckusick 486*8659Smckusick /* no match, error return */ 487*8659Smckusick return (NIL); 488*8659Smckusick break; 489*8659Smckusick 490*8659Smckusick /* check for end of line */ 491*8659Smckusick case '$': 492*8659Smckusick if (*s == '\0' || *s == '\n') { 493*8659Smckusick 494*8659Smckusick /* match, be happy */ 495*8659Smckusick s++; 496*8659Smckusick matched = 1; 497*8659Smckusick cs = MNEXT(cs); 498*8659Smckusick } else if (*cs & ALT) { 499*8659Smckusick 500*8659Smckusick /* try the next part */ 501*8659Smckusick matched = 0; 502*8659Smckusick cs = MNEXT(cs); 503*8659Smckusick } else if (*cs & OPT) { 504*8659Smckusick 505*8659Smckusick /* doesn't matter */ 506*8659Smckusick matched = 1; 507*8659Smckusick cs = MNEXT(cs); 508*8659Smckusick } else 509*8659Smckusick 510*8659Smckusick /* no match, error return */ 511*8659Smckusick return (NIL); 512*8659Smckusick break; 513*8659Smckusick 514*8659Smckusick /* check for start of line */ 515*8659Smckusick case '^': 516*8659Smckusick if (s == _start) { 517*8659Smckusick 518*8659Smckusick /* match, be happy */ 519*8659Smckusick matched = 1; 520*8659Smckusick cs = MNEXT(cs); 521*8659Smckusick } else if (*cs & ALT) { 522*8659Smckusick 523*8659Smckusick /* try the next part */ 524*8659Smckusick matched = 0; 525*8659Smckusick cs = MNEXT(cs); 526*8659Smckusick } else if (*cs & OPT) { 527*8659Smckusick 528*8659Smckusick /* doesn't matter */ 529*8659Smckusick matched = 1; 530*8659Smckusick cs = MNEXT(cs); 531*8659Smckusick } else 532*8659Smckusick 533*8659Smckusick /* no match, error return */ 534*8659Smckusick return (NIL); 535*8659Smckusick break; 536*8659Smckusick 537*8659Smckusick /* end of a subexpression, return success */ 538*8659Smckusick case ')': 539*8659Smckusick return (s); 540*8659Smckusick } 541*8659Smckusick break; 542*8659Smckusick } 543*8659Smckusick } 544*8659Smckusick return (s); 545*8659Smckusick } 546