1 static char sccsid[] = "@(#)regexp.c 4.2 (Berkeley) 12/11/84"; 2 3 #include <ctype.h> 4 5 typedef int boolean; 6 #define TRUE 1 7 #define FALSE 0 8 #define NIL 0 9 10 boolean l_onecase; /* true if upper and lower equivalent */ 11 12 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 13 14 /* STRNCMP - like strncmp except that we convert the 15 * first string to lower case before comparing 16 * if l_onecase is set. 17 */ 18 19 STRNCMP(s1, s2, len) 20 register char *s1,*s2; 21 register int len; 22 { 23 if (l_onecase) { 24 do 25 if (*s2 - makelower(*s1)) 26 return (*s2 - makelower(*s1)); 27 else { 28 s2++; 29 s1++; 30 } 31 while (--len); 32 } else { 33 do 34 if (*s2 - *s1) 35 return (*s2 - *s1); 36 else { 37 s2++; 38 s1++; 39 } 40 while (--len); 41 } 42 return(0); 43 } 44 45 /* The following routine converts an irregular expression to 46 * internal format. 47 * 48 * Either meta symbols (\a \d or \p) or character strings or 49 * operations ( alternation or perenthesizing ) can be 50 * specified. Each starts with a descriptor byte. The descriptor 51 * byte has STR set for strings, META set for meta symbols 52 * and OPER set for operations. 53 * The descriptor byte can also have the OPT bit set if the object 54 * defined is optional. Also ALT can be set to indicate an alternation. 55 * 56 * For metasymbols the byte following the descriptor byte identities 57 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 58 * strings the byte after the descriptor is a character count for 59 * the string: 60 * 61 * meta symbols := descriptor 62 * symbol 63 * 64 * strings := descriptor 65 * character count 66 * the string 67 * 68 * operatins := descriptor 69 * symbol 70 * character count 71 */ 72 73 /* 74 * handy macros for accessing parts of match blocks 75 */ 76 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 77 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 78 79 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 80 #define OCNT(A) (*(A+2)) /* character count */ 81 #define ONEXT(A) (A+3) /* next character after the operation */ 82 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 83 84 #define SCNT(A) (*(A+1)) /* byte count of a string */ 85 #define SSTR(A) (A+2) /* address of the string */ 86 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 87 88 /* 89 * bit flags in the descriptor 90 */ 91 #define OPT 1 92 #define STR 2 93 #define META 4 94 #define ALT 8 95 #define OPER 16 96 97 char *ure; /* pointer current position in unconverted exp */ 98 char *ccre; /* pointer to current position in converted exp*/ 99 char *malloc(); 100 101 char * 102 convexp(re) 103 char *re; /* unconverted irregular expression */ 104 { 105 register char *cre; /* pointer to converted regular expression */ 106 107 /* allocate room for the converted expression */ 108 if (re == NIL) 109 return (NIL); 110 if (*re == '\0') 111 return (NIL); 112 cre = malloc (4 * strlen(re) + 3); 113 ccre = cre; 114 ure = re; 115 116 /* start the conversion with a \a */ 117 *cre = META | OPT; 118 MSYM(cre) = 'a'; 119 ccre = MNEXT(cre); 120 121 /* start the conversion (its recursive) */ 122 expconv (); 123 *ccre = 0; 124 return (cre); 125 } 126 127 expconv() 128 { 129 register char *cs; /* pointer to current symbol in converted exp */ 130 register char c; /* character being processed */ 131 register char *acs; /* pinter to last alternate */ 132 register int temp; 133 134 /* let the conversion begin */ 135 acs = NIL; 136 cs = NIL; 137 while (*ure != NIL) { 138 switch (c = *ure++) { 139 140 case '\\': 141 switch (c = *ure++) { 142 143 /* escaped characters are just characters */ 144 default: 145 if (cs == NIL || (*cs & STR) == 0) { 146 cs = ccre; 147 *cs = STR; 148 SCNT(cs) = 1; 149 ccre += 2; 150 } else 151 SCNT(cs)++; 152 *ccre++ = c; 153 break; 154 155 /* normal(?) metacharacters */ 156 case 'a': 157 case 'd': 158 case 'e': 159 case 'p': 160 if (acs != NIL && acs != cs) { 161 do { 162 temp = OCNT(acs); 163 OCNT(acs) = ccre - acs; 164 acs -= temp; 165 } while (temp != 0); 166 acs = NIL; 167 } 168 cs = ccre; 169 *cs = META; 170 MSYM(cs) = c; 171 ccre = MNEXT(cs); 172 break; 173 } 174 break; 175 176 /* just put the symbol in */ 177 case '^': 178 case '$': 179 if (acs != NIL && acs != cs) { 180 do { 181 temp = OCNT(acs); 182 OCNT(acs) = ccre - acs; 183 acs -= temp; 184 } while (temp != 0); 185 acs = NIL; 186 } 187 cs = ccre; 188 *cs = META; 189 MSYM(cs) = c; 190 ccre = MNEXT(cs); 191 break; 192 193 /* mark the last match sequence as optional */ 194 case '?': 195 if (cs) 196 *cs = *cs | OPT; 197 break; 198 199 /* recurse and define a subexpression */ 200 case '(': 201 if (acs != NIL && acs != cs) { 202 do { 203 temp = OCNT(acs); 204 OCNT(acs) = ccre - acs; 205 acs -= temp; 206 } while (temp != 0); 207 acs = NIL; 208 } 209 cs = ccre; 210 *cs = OPER; 211 OSYM(cs) = '('; 212 ccre = ONEXT(cs); 213 expconv (); 214 OCNT(cs) = ccre - cs; /* offset to next symbol */ 215 break; 216 217 /* return from a recursion */ 218 case ')': 219 if (acs != NIL) { 220 do { 221 temp = OCNT(acs); 222 OCNT(acs) = ccre - acs; 223 acs -= temp; 224 } while (temp != 0); 225 acs = NIL; 226 } 227 cs = ccre; 228 *cs = META; 229 MSYM(cs) = c; 230 ccre = MNEXT(cs); 231 return; 232 233 /* mark the last match sequence as having an alternate */ 234 /* the third byte will contain an offset to jump over the */ 235 /* alternate match in case the first did not fail */ 236 case '|': 237 if (acs != NIL && acs != cs) 238 OCNT(ccre) = ccre - acs; /* make a back pointer */ 239 else 240 OCNT(ccre) = 0; 241 *cs |= ALT; 242 cs = ccre; 243 *cs = OPER; 244 OSYM(cs) = '|'; 245 ccre = ONEXT(cs); 246 acs = cs; /* remember that the pointer is to be filles */ 247 break; 248 249 /* if its not a metasymbol just build a scharacter string */ 250 default: 251 if (cs == NIL || (*cs & STR) == 0) { 252 cs = ccre; 253 *cs = STR; 254 SCNT(cs) = 1; 255 ccre = SSTR(cs); 256 } else 257 SCNT(cs)++; 258 *ccre++ = c; 259 break; 260 } 261 } 262 if (acs != NIL) { 263 do { 264 temp = OCNT(acs); 265 OCNT(acs) = ccre - acs; 266 acs -= temp; 267 } while (temp != 0); 268 acs = NIL; 269 } 270 return; 271 } 272 /* end of convertre */ 273 274 275 /* 276 * The following routine recognises an irregular expresion 277 * with the following special characters: 278 * 279 * \? - means last match was optional 280 * \a - matches any number of characters 281 * \d - matches any number of spaces and tabs 282 * \p - matches any number of alphanumeric 283 * characters. The 284 * characters matched will be copied into 285 * the area pointed to by 'name'. 286 * \| - alternation 287 * \( \) - grouping used mostly for alternation and 288 * optionality 289 * 290 * The irregular expression must be translated to internal form 291 * prior to calling this routine 292 * 293 * The value returned is the pointer to the first non \a 294 * character matched. 295 */ 296 297 boolean _escaped; /* true if we are currently _escaped */ 298 char *_start; /* start of string */ 299 300 char * 301 expmatch (s, re, mstring) 302 register char *s; /* string to check for a match in */ 303 register char *re; /* a converted irregular expression */ 304 register char *mstring; /* where to put whatever matches a \p */ 305 { 306 register char *cs; /* the current symbol */ 307 register char *ptr,*s1; /* temporary pointer */ 308 boolean matched; /* a temporary boolean */ 309 310 /* initial conditions */ 311 if (re == NIL) 312 return (NIL); 313 cs = re; 314 matched = FALSE; 315 316 /* loop till expression string is exhausted (or at least pretty tired) */ 317 while (*cs) { 318 switch (*cs & (OPER | STR | META)) { 319 320 /* try to match a string */ 321 case STR: 322 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 323 if (matched) { 324 325 /* hoorah it matches */ 326 s += SCNT(cs); 327 cs = SNEXT(cs); 328 } else if (*cs & ALT) { 329 330 /* alternation, skip to next expression */ 331 cs = SNEXT(cs); 332 } else if (*cs & OPT) { 333 334 /* the match is optional */ 335 cs = SNEXT(cs); 336 matched = 1; /* indicate a successful match */ 337 } else { 338 339 /* no match, error return */ 340 return (NIL); 341 } 342 break; 343 344 /* an operator, do something fancy */ 345 case OPER: 346 switch (OSYM(cs)) { 347 348 /* this is an alternation */ 349 case '|': 350 if (matched) 351 352 /* last thing in the alternation was a match, skip ahead */ 353 cs = OPTR(cs); 354 else 355 356 /* no match, keep trying */ 357 cs = ONEXT(cs); 358 break; 359 360 /* this is a grouping, recurse */ 361 case '(': 362 ptr = expmatch (s, ONEXT(cs), mstring); 363 if (ptr != NIL) { 364 365 /* the subexpression matched */ 366 matched = 1; 367 s = ptr; 368 } else if (*cs & ALT) { 369 370 /* alternation, skip to next expression */ 371 matched = 0; 372 } else if (*cs & OPT) { 373 374 /* the match is optional */ 375 matched = 1; /* indicate a successful match */ 376 } else { 377 378 /* no match, error return */ 379 return (NIL); 380 } 381 cs = OPTR(cs); 382 break; 383 } 384 break; 385 386 /* try to match a metasymbol */ 387 case META: 388 switch (MSYM(cs)) { 389 390 /* try to match anything and remember what was matched */ 391 case 'p': 392 /* 393 * This is really the same as trying the match the 394 * remaining parts of the expression to any subset 395 * of the string. 396 */ 397 s1 = s; 398 do { 399 ptr = expmatch (s1, MNEXT(cs), mstring); 400 if (ptr != NIL && s1 != s) { 401 402 /* we have a match, remember the match */ 403 strncpy (mstring, s, s1 - s); 404 mstring[s1 - s] = '\0'; 405 return (ptr); 406 } else if (ptr != NIL && (*cs & OPT)) { 407 408 /* it was aoptional so no match is ok */ 409 return (ptr); 410 } else if (ptr != NIL) { 411 412 /* not optional and we still matched */ 413 return (NIL); 414 } 415 if (!isalnum(*s1) && *s1 != '_') 416 return (NIL); 417 if (*s1 == '\\') 418 _escaped = _escaped ? FALSE : TRUE; 419 else 420 _escaped = FALSE; 421 } while (*s1++); 422 return (NIL); 423 424 /* try to match anything */ 425 case 'a': 426 /* 427 * This is really the same as trying the match the 428 * remaining parts of the expression to any subset 429 * of the string. 430 */ 431 s1 = s; 432 do { 433 ptr = expmatch (s1, MNEXT(cs), mstring); 434 if (ptr != NIL && s1 != s) { 435 436 /* we have a match */ 437 return (ptr); 438 } else if (ptr != NIL && (*cs & OPT)) { 439 440 /* it was aoptional so no match is ok */ 441 return (ptr); 442 } else if (ptr != NIL) { 443 444 /* not optional and we still matched */ 445 return (NIL); 446 } 447 if (*s1 == '\\') 448 _escaped = _escaped ? FALSE : TRUE; 449 else 450 _escaped = FALSE; 451 } while (*s1++); 452 return (NIL); 453 454 /* fail if we are currently _escaped */ 455 case 'e': 456 if (_escaped) 457 return(NIL); 458 cs = MNEXT(cs); 459 break; 460 461 /* match any number of tabs and spaces */ 462 case 'd': 463 ptr = s; 464 while (*s == ' ' || *s == '\t') 465 s++; 466 if (s != ptr || s == _start) { 467 468 /* match, be happy */ 469 matched = 1; 470 cs = MNEXT(cs); 471 } else if (*s == '\n' || *s == '\0') { 472 473 /* match, be happy */ 474 matched = 1; 475 cs = MNEXT(cs); 476 } else if (*cs & ALT) { 477 478 /* try the next part */ 479 matched = 0; 480 cs = MNEXT(cs); 481 } else if (*cs & OPT) { 482 483 /* doesn't matter */ 484 matched = 1; 485 cs = MNEXT(cs); 486 } else 487 488 /* no match, error return */ 489 return (NIL); 490 break; 491 492 /* check for end of line */ 493 case '$': 494 if (*s == '\0' || *s == '\n') { 495 496 /* match, be happy */ 497 s++; 498 matched = 1; 499 cs = MNEXT(cs); 500 } else if (*cs & ALT) { 501 502 /* try the next part */ 503 matched = 0; 504 cs = MNEXT(cs); 505 } else if (*cs & OPT) { 506 507 /* doesn't matter */ 508 matched = 1; 509 cs = MNEXT(cs); 510 } else 511 512 /* no match, error return */ 513 return (NIL); 514 break; 515 516 /* check for start of line */ 517 case '^': 518 if (s == _start) { 519 520 /* match, be happy */ 521 matched = 1; 522 cs = MNEXT(cs); 523 } else if (*cs & ALT) { 524 525 /* try the next part */ 526 matched = 0; 527 cs = MNEXT(cs); 528 } else if (*cs & OPT) { 529 530 /* doesn't matter */ 531 matched = 1; 532 cs = MNEXT(cs); 533 } else 534 535 /* no match, error return */ 536 return (NIL); 537 break; 538 539 /* end of a subexpression, return success */ 540 case ')': 541 return (s); 542 } 543 break; 544 } 545 } 546 return (s); 547 } 548