1*1977Swnj /* @(#)regex.c 4.1 (Berkeley) 12/21/80 */ 2*1977Swnj # 3*1977Swnj 4*1977Swnj /* 5*1977Swnj * routines to do regular expression matching 6*1977Swnj * 7*1977Swnj * Entry points: 8*1977Swnj * 9*1977Swnj * re_comp(s) 10*1977Swnj * char *s; 11*1977Swnj * ... returns 0 if the string s was compiled successfully, 12*1977Swnj * a pointer to an error message otherwise. 13*1977Swnj * If passed 0 or a null string returns without changing 14*1977Swnj * the currently compiled re (see note 11 below). 15*1977Swnj * 16*1977Swnj * re_exec(s) 17*1977Swnj * char *s; 18*1977Swnj * ... returns 1 if the string s matches the last compiled regular 19*1977Swnj * expression, 20*1977Swnj * 0 if the string s failed to match the last compiled 21*1977Swnj * regular expression, and 22*1977Swnj * -1 if the compiled regular expression was invalid 23*1977Swnj * (indicating an internal error). 24*1977Swnj * 25*1977Swnj * The strings passed to both re_comp and re_exec may have trailing or 26*1977Swnj * embedded newline characters; they are terminated by nulls. 27*1977Swnj * 28*1977Swnj * The identity of the author of these routines is lost in antiquity; 29*1977Swnj * this is essentially the same as the re code in the original V6 ed. 30*1977Swnj * 31*1977Swnj * The regular expressions recognized are described below. This description 32*1977Swnj * is essentially the same as that for ed. 33*1977Swnj * 34*1977Swnj * A regular expression specifies a set of strings of characters. 35*1977Swnj * A member of this set of strings is said to be matched by 36*1977Swnj * the regular expression. In the following specification for 37*1977Swnj * regular expressions the word `character' means any character but NUL. 38*1977Swnj * 39*1977Swnj * 1. Any character except a special character matches itself. 40*1977Swnj * Special characters are the regular expression delimiter plus 41*1977Swnj * \ [ . and sometimes ^ * $. 42*1977Swnj * 2. A . matches any character. 43*1977Swnj * 3. A \ followed by any character except a digit or ( ) 44*1977Swnj * matches that character. 45*1977Swnj * 4. A nonempty string s bracketed [s] (or [^s]) matches any 46*1977Swnj * character in (or not in) s. In s, \ has no special meaning, 47*1977Swnj * and ] may only appear as the first letter. A substring 48*1977Swnj * a-b, with a and b in ascending ASCII order, stands for 49*1977Swnj * the inclusive range of ASCII characters. 50*1977Swnj * 5. A regular expression of form 1-4 followed by * matches a 51*1977Swnj * sequence of 0 or more matches of the regular expression. 52*1977Swnj * 6. A regular expression, x, of form 1-8, bracketed \(x\) 53*1977Swnj * matches what x matches. 54*1977Swnj * 7. A \ followed by a digit n matches a copy of the string that the 55*1977Swnj * bracketed regular expression beginning with the nth \( matched. 56*1977Swnj * 8. A regular expression of form 1-8, x, followed by a regular 57*1977Swnj * expression of form 1-7, y matches a match for x followed by 58*1977Swnj * a match for y, with the x match being as long as possible 59*1977Swnj * while still permitting a y match. 60*1977Swnj * 9. A regular expression of form 1-8 preceded by ^ (or followed 61*1977Swnj * by $), is constrained to matches that begin at the left 62*1977Swnj * (or end at the right) end of a line. 63*1977Swnj * 10. A regular expression of form 1-9 picks out the longest among 64*1977Swnj * the leftmost matches in a line. 65*1977Swnj * 11. An empty regular expression stands for a copy of the last 66*1977Swnj * regular expression encountered. 67*1977Swnj */ 68*1977Swnj 69*1977Swnj /* 70*1977Swnj * constants for re's 71*1977Swnj */ 72*1977Swnj #define CBRA 1 73*1977Swnj #define CCHR 2 74*1977Swnj #define CDOT 4 75*1977Swnj #define CCL 6 76*1977Swnj #define NCCL 8 77*1977Swnj #define CDOL 10 78*1977Swnj #define CEOF 11 79*1977Swnj #define CKET 12 80*1977Swnj #define CBACK 18 81*1977Swnj 82*1977Swnj #define CSTAR 01 83*1977Swnj 84*1977Swnj #define ESIZE 512 85*1977Swnj #define NBRA 9 86*1977Swnj 87*1977Swnj static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 88*1977Swnj static char circf; 89*1977Swnj 90*1977Swnj /* 91*1977Swnj * compile the regular expression argument into a dfa 92*1977Swnj */ 93*1977Swnj char * 94*1977Swnj re_comp(sp) 95*1977Swnj register char *sp; 96*1977Swnj { 97*1977Swnj register int c; 98*1977Swnj register char *ep = expbuf; 99*1977Swnj int cclcnt, numbra = 0; 100*1977Swnj char *lastep = 0; 101*1977Swnj char bracket[NBRA]; 102*1977Swnj char *bracketp = &bracket[0]; 103*1977Swnj static char *retoolong = "Regular expression too long"; 104*1977Swnj 105*1977Swnj #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 106*1977Swnj 107*1977Swnj if (sp == 0 || *sp == '\0') { 108*1977Swnj if (*ep == 0) 109*1977Swnj return("No previous regular expression"); 110*1977Swnj return(0); 111*1977Swnj } 112*1977Swnj if (*sp == '^') { 113*1977Swnj circf = 1; 114*1977Swnj sp++; 115*1977Swnj } 116*1977Swnj else 117*1977Swnj circf = 0; 118*1977Swnj for (;;) { 119*1977Swnj if (ep >= &expbuf[ESIZE]) 120*1977Swnj comerr(retoolong); 121*1977Swnj if ((c = *sp++) == '\0') { 122*1977Swnj if (bracketp != bracket) 123*1977Swnj comerr("unmatched \\("); 124*1977Swnj *ep++ = CEOF; 125*1977Swnj *ep++ = 0; 126*1977Swnj return(0); 127*1977Swnj } 128*1977Swnj if (c != '*') 129*1977Swnj lastep = ep; 130*1977Swnj switch (c) { 131*1977Swnj 132*1977Swnj case '.': 133*1977Swnj *ep++ = CDOT; 134*1977Swnj continue; 135*1977Swnj 136*1977Swnj case '*': 137*1977Swnj if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 138*1977Swnj goto defchar; 139*1977Swnj *lastep |= CSTAR; 140*1977Swnj continue; 141*1977Swnj 142*1977Swnj case '$': 143*1977Swnj if (*sp != '\0') 144*1977Swnj goto defchar; 145*1977Swnj *ep++ = CDOL; 146*1977Swnj continue; 147*1977Swnj 148*1977Swnj case '[': 149*1977Swnj *ep++ = CCL; 150*1977Swnj *ep++ = 0; 151*1977Swnj cclcnt = 1; 152*1977Swnj if ((c = *sp++) == '^') { 153*1977Swnj c = *sp++; 154*1977Swnj ep[-2] = NCCL; 155*1977Swnj } 156*1977Swnj do { 157*1977Swnj if (c == '\0') 158*1977Swnj comerr("missing ]"); 159*1977Swnj if (c == '-' && ep [-1] != 0) { 160*1977Swnj if ((c = *sp++) == ']') { 161*1977Swnj *ep++ = '-'; 162*1977Swnj cclcnt++; 163*1977Swnj break; 164*1977Swnj } 165*1977Swnj while (ep[-1] < c) { 166*1977Swnj *ep = ep[-1] + 1; 167*1977Swnj ep++; 168*1977Swnj cclcnt++; 169*1977Swnj if (ep >= &expbuf[ESIZE]) 170*1977Swnj comerr(retoolong); 171*1977Swnj } 172*1977Swnj } 173*1977Swnj *ep++ = c; 174*1977Swnj cclcnt++; 175*1977Swnj if (ep >= &expbuf[ESIZE]) 176*1977Swnj comerr(retoolong); 177*1977Swnj } while ((c = *sp++) != ']'); 178*1977Swnj lastep[1] = cclcnt; 179*1977Swnj continue; 180*1977Swnj 181*1977Swnj case '\\': 182*1977Swnj if ((c = *sp++) == '(') { 183*1977Swnj if (numbra >= NBRA) 184*1977Swnj comerr("too many \\(\\) pairs"); 185*1977Swnj *bracketp++ = numbra; 186*1977Swnj *ep++ = CBRA; 187*1977Swnj *ep++ = numbra++; 188*1977Swnj continue; 189*1977Swnj } 190*1977Swnj if (c == ')') { 191*1977Swnj if (bracketp <= bracket) 192*1977Swnj comerr("unmatched \\)"); 193*1977Swnj *ep++ = CKET; 194*1977Swnj *ep++ = *--bracketp; 195*1977Swnj continue; 196*1977Swnj } 197*1977Swnj if (c >= '1' && c < ('1' + NBRA)) { 198*1977Swnj *ep++ = CBACK; 199*1977Swnj *ep++ = c - '1'; 200*1977Swnj continue; 201*1977Swnj } 202*1977Swnj *ep++ = CCHR; 203*1977Swnj *ep++ = c; 204*1977Swnj continue; 205*1977Swnj 206*1977Swnj defchar: 207*1977Swnj default: 208*1977Swnj *ep++ = CCHR; 209*1977Swnj *ep++ = c; 210*1977Swnj } 211*1977Swnj } 212*1977Swnj } 213*1977Swnj 214*1977Swnj /* 215*1977Swnj * match the argument string against the compiled re 216*1977Swnj */ 217*1977Swnj int 218*1977Swnj re_exec(p1) 219*1977Swnj register char *p1; 220*1977Swnj { 221*1977Swnj register char *p2 = expbuf; 222*1977Swnj register int c; 223*1977Swnj int rv; 224*1977Swnj 225*1977Swnj for (c = 0; c < NBRA; c++) { 226*1977Swnj braslist[c] = 0; 227*1977Swnj braelist[c] = 0; 228*1977Swnj } 229*1977Swnj if (circf) 230*1977Swnj return((advance(p1, p2))); 231*1977Swnj /* 232*1977Swnj * fast check for first character 233*1977Swnj */ 234*1977Swnj if (*p2 == CCHR) { 235*1977Swnj c = p2[1]; 236*1977Swnj do { 237*1977Swnj if (*p1 != c) 238*1977Swnj continue; 239*1977Swnj if (rv = advance(p1, p2)) 240*1977Swnj return(rv); 241*1977Swnj } while (*p1++); 242*1977Swnj return(0); 243*1977Swnj } 244*1977Swnj /* 245*1977Swnj * regular algorithm 246*1977Swnj */ 247*1977Swnj do 248*1977Swnj if (rv = advance(p1, p2)) 249*1977Swnj return(rv); 250*1977Swnj while (*p1++); 251*1977Swnj return(0); 252*1977Swnj } 253*1977Swnj 254*1977Swnj /* 255*1977Swnj * try to match the next thing in the dfa 256*1977Swnj */ 257*1977Swnj static int 258*1977Swnj advance(lp, ep) 259*1977Swnj register char *lp, *ep; 260*1977Swnj { 261*1977Swnj register char *curlp; 262*1977Swnj int ct, i; 263*1977Swnj int rv; 264*1977Swnj 265*1977Swnj for (;;) 266*1977Swnj switch (*ep++) { 267*1977Swnj 268*1977Swnj case CCHR: 269*1977Swnj if (*ep++ == *lp++) 270*1977Swnj continue; 271*1977Swnj return(0); 272*1977Swnj 273*1977Swnj case CDOT: 274*1977Swnj if (*lp++) 275*1977Swnj continue; 276*1977Swnj return(0); 277*1977Swnj 278*1977Swnj case CDOL: 279*1977Swnj if (*lp == '\0') 280*1977Swnj continue; 281*1977Swnj return(0); 282*1977Swnj 283*1977Swnj case CEOF: 284*1977Swnj return(1); 285*1977Swnj 286*1977Swnj case CCL: 287*1977Swnj if (cclass(ep, *lp++, 1)) { 288*1977Swnj ep += *ep; 289*1977Swnj continue; 290*1977Swnj } 291*1977Swnj return(0); 292*1977Swnj 293*1977Swnj case NCCL: 294*1977Swnj if (cclass(ep, *lp++, 0)) { 295*1977Swnj ep += *ep; 296*1977Swnj continue; 297*1977Swnj } 298*1977Swnj return(0); 299*1977Swnj 300*1977Swnj case CBRA: 301*1977Swnj braslist[*ep++] = lp; 302*1977Swnj continue; 303*1977Swnj 304*1977Swnj case CKET: 305*1977Swnj braelist[*ep++] = lp; 306*1977Swnj continue; 307*1977Swnj 308*1977Swnj case CBACK: 309*1977Swnj if (braelist[i = *ep++] == 0) 310*1977Swnj return(-1); 311*1977Swnj if (backref(i, lp)) { 312*1977Swnj lp += braelist[i] - braslist[i]; 313*1977Swnj continue; 314*1977Swnj } 315*1977Swnj return(0); 316*1977Swnj 317*1977Swnj case CBACK|CSTAR: 318*1977Swnj if (braelist[i = *ep++] == 0) 319*1977Swnj return(-1); 320*1977Swnj curlp = lp; 321*1977Swnj ct = braelist[i] - braslist[i]; 322*1977Swnj while (backref(i, lp)) 323*1977Swnj lp += ct; 324*1977Swnj while (lp >= curlp) { 325*1977Swnj if (rv = advance(lp, ep)) 326*1977Swnj return(rv); 327*1977Swnj lp -= ct; 328*1977Swnj } 329*1977Swnj continue; 330*1977Swnj 331*1977Swnj case CDOT|CSTAR: 332*1977Swnj curlp = lp; 333*1977Swnj while (*lp++) 334*1977Swnj ; 335*1977Swnj goto star; 336*1977Swnj 337*1977Swnj case CCHR|CSTAR: 338*1977Swnj curlp = lp; 339*1977Swnj while (*lp++ == *ep) 340*1977Swnj ; 341*1977Swnj ep++; 342*1977Swnj goto star; 343*1977Swnj 344*1977Swnj case CCL|CSTAR: 345*1977Swnj case NCCL|CSTAR: 346*1977Swnj curlp = lp; 347*1977Swnj while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 348*1977Swnj ; 349*1977Swnj ep += *ep; 350*1977Swnj goto star; 351*1977Swnj 352*1977Swnj star: 353*1977Swnj do { 354*1977Swnj lp--; 355*1977Swnj if (rv = advance(lp, ep)) 356*1977Swnj return(rv); 357*1977Swnj } while (lp > curlp); 358*1977Swnj return(0); 359*1977Swnj 360*1977Swnj default: 361*1977Swnj return(-1); 362*1977Swnj } 363*1977Swnj } 364*1977Swnj 365*1977Swnj backref(i, lp) 366*1977Swnj register int i; 367*1977Swnj register char *lp; 368*1977Swnj { 369*1977Swnj register char *bp; 370*1977Swnj 371*1977Swnj bp = braslist[i]; 372*1977Swnj while (*bp++ == *lp++) 373*1977Swnj if (bp >= braelist[i]) 374*1977Swnj return(1); 375*1977Swnj return(0); 376*1977Swnj } 377*1977Swnj 378*1977Swnj int 379*1977Swnj cclass(set, c, af) 380*1977Swnj register char *set, c; 381*1977Swnj int af; 382*1977Swnj { 383*1977Swnj register int n; 384*1977Swnj 385*1977Swnj if (c == 0) 386*1977Swnj return(0); 387*1977Swnj n = *set++; 388*1977Swnj while (--n) 389*1977Swnj if (*set++ == c) 390*1977Swnj return(af); 391*1977Swnj return(! af); 392*1977Swnj } 393