1 /* 2 * Copyright (c) 1980 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 #ifndef lint 35 static char sccsid[] = "@(#)regexp.c 5.3 (Berkeley) 6/1/90"; 36 #endif /* not lint */ 37 38 #include <ctype.h> 39 40 typedef int boolean; 41 #define TRUE 1 42 #define FALSE 0 43 #define NIL 0 44 45 boolean l_onecase; /* true if upper and lower equivalent */ 46 47 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 48 49 /* STRNCMP - like strncmp except that we convert the 50 * first string to lower case before comparing 51 * if l_onecase is set. 52 */ 53 54 STRNCMP(s1, s2, len) 55 register char *s1,*s2; 56 register int len; 57 { 58 if (l_onecase) { 59 do 60 if (*s2 - makelower(*s1)) 61 return (*s2 - makelower(*s1)); 62 else { 63 s2++; 64 s1++; 65 } 66 while (--len); 67 } else { 68 do 69 if (*s2 - *s1) 70 return (*s2 - *s1); 71 else { 72 s2++; 73 s1++; 74 } 75 while (--len); 76 } 77 return(0); 78 } 79 80 /* The following routine converts an irregular expression to 81 * internal format. 82 * 83 * Either meta symbols (\a \d or \p) or character strings or 84 * operations ( alternation or perenthesizing ) can be 85 * specified. Each starts with a descriptor byte. The descriptor 86 * byte has STR set for strings, META set for meta symbols 87 * and OPER set for operations. 88 * The descriptor byte can also have the OPT bit set if the object 89 * defined is optional. Also ALT can be set to indicate an alternation. 90 * 91 * For metasymbols the byte following the descriptor byte identities 92 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 93 * strings the byte after the descriptor is a character count for 94 * the string: 95 * 96 * meta symbols := descriptor 97 * symbol 98 * 99 * strings := descriptor 100 * character count 101 * the string 102 * 103 * operatins := descriptor 104 * symbol 105 * character count 106 */ 107 108 /* 109 * handy macros for accessing parts of match blocks 110 */ 111 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 112 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 113 114 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 115 #define OCNT(A) (*(A+2)) /* character count */ 116 #define ONEXT(A) (A+3) /* next character after the operation */ 117 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 118 119 #define SCNT(A) (*(A+1)) /* byte count of a string */ 120 #define SSTR(A) (A+2) /* address of the string */ 121 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 122 123 /* 124 * bit flags in the descriptor 125 */ 126 #define OPT 1 127 #define STR 2 128 #define META 4 129 #define ALT 8 130 #define OPER 16 131 132 char *ure; /* pointer current position in unconverted exp */ 133 char *ccre; /* pointer to current position in converted exp*/ 134 char *malloc(); 135 136 char * 137 convexp(re) 138 char *re; /* unconverted irregular expression */ 139 { 140 register char *cre; /* pointer to converted regular expression */ 141 142 /* allocate room for the converted expression */ 143 if (re == NIL) 144 return (NIL); 145 if (*re == '\0') 146 return (NIL); 147 cre = malloc (4 * strlen(re) + 3); 148 ccre = cre; 149 ure = re; 150 151 /* start the conversion with a \a */ 152 *cre = META | OPT; 153 MSYM(cre) = 'a'; 154 ccre = MNEXT(cre); 155 156 /* start the conversion (its recursive) */ 157 expconv (); 158 *ccre = 0; 159 return (cre); 160 } 161 162 expconv() 163 { 164 register char *cs; /* pointer to current symbol in converted exp */ 165 register char c; /* character being processed */ 166 register char *acs; /* pinter to last alternate */ 167 register int temp; 168 169 /* let the conversion begin */ 170 acs = NIL; 171 cs = NIL; 172 while (*ure != NIL) { 173 switch (c = *ure++) { 174 175 case '\\': 176 switch (c = *ure++) { 177 178 /* escaped characters are just characters */ 179 default: 180 if (cs == NIL || (*cs & STR) == 0) { 181 cs = ccre; 182 *cs = STR; 183 SCNT(cs) = 1; 184 ccre += 2; 185 } else 186 SCNT(cs)++; 187 *ccre++ = c; 188 break; 189 190 /* normal(?) metacharacters */ 191 case 'a': 192 case 'd': 193 case 'e': 194 case 'p': 195 if (acs != NIL && acs != cs) { 196 do { 197 temp = OCNT(acs); 198 OCNT(acs) = ccre - acs; 199 acs -= temp; 200 } while (temp != 0); 201 acs = NIL; 202 } 203 cs = ccre; 204 *cs = META; 205 MSYM(cs) = c; 206 ccre = MNEXT(cs); 207 break; 208 } 209 break; 210 211 /* just put the symbol in */ 212 case '^': 213 case '$': 214 if (acs != NIL && acs != cs) { 215 do { 216 temp = OCNT(acs); 217 OCNT(acs) = ccre - acs; 218 acs -= temp; 219 } while (temp != 0); 220 acs = NIL; 221 } 222 cs = ccre; 223 *cs = META; 224 MSYM(cs) = c; 225 ccre = MNEXT(cs); 226 break; 227 228 /* mark the last match sequence as optional */ 229 case '?': 230 if (cs) 231 *cs = *cs | OPT; 232 break; 233 234 /* recurse and define a subexpression */ 235 case '(': 236 if (acs != NIL && acs != cs) { 237 do { 238 temp = OCNT(acs); 239 OCNT(acs) = ccre - acs; 240 acs -= temp; 241 } while (temp != 0); 242 acs = NIL; 243 } 244 cs = ccre; 245 *cs = OPER; 246 OSYM(cs) = '('; 247 ccre = ONEXT(cs); 248 expconv (); 249 OCNT(cs) = ccre - cs; /* offset to next symbol */ 250 break; 251 252 /* return from a recursion */ 253 case ')': 254 if (acs != NIL) { 255 do { 256 temp = OCNT(acs); 257 OCNT(acs) = ccre - acs; 258 acs -= temp; 259 } while (temp != 0); 260 acs = NIL; 261 } 262 cs = ccre; 263 *cs = META; 264 MSYM(cs) = c; 265 ccre = MNEXT(cs); 266 return; 267 268 /* mark the last match sequence as having an alternate */ 269 /* the third byte will contain an offset to jump over the */ 270 /* alternate match in case the first did not fail */ 271 case '|': 272 if (acs != NIL && acs != cs) 273 OCNT(ccre) = ccre - acs; /* make a back pointer */ 274 else 275 OCNT(ccre) = 0; 276 *cs |= ALT; 277 cs = ccre; 278 *cs = OPER; 279 OSYM(cs) = '|'; 280 ccre = ONEXT(cs); 281 acs = cs; /* remember that the pointer is to be filles */ 282 break; 283 284 /* if its not a metasymbol just build a scharacter string */ 285 default: 286 if (cs == NIL || (*cs & STR) == 0) { 287 cs = ccre; 288 *cs = STR; 289 SCNT(cs) = 1; 290 ccre = SSTR(cs); 291 } else 292 SCNT(cs)++; 293 *ccre++ = c; 294 break; 295 } 296 } 297 if (acs != NIL) { 298 do { 299 temp = OCNT(acs); 300 OCNT(acs) = ccre - acs; 301 acs -= temp; 302 } while (temp != 0); 303 acs = NIL; 304 } 305 return; 306 } 307 /* end of convertre */ 308 309 310 /* 311 * The following routine recognises an irregular expresion 312 * with the following special characters: 313 * 314 * \? - means last match was optional 315 * \a - matches any number of characters 316 * \d - matches any number of spaces and tabs 317 * \p - matches any number of alphanumeric 318 * characters. The 319 * characters matched will be copied into 320 * the area pointed to by 'name'. 321 * \| - alternation 322 * \( \) - grouping used mostly for alternation and 323 * optionality 324 * 325 * The irregular expression must be translated to internal form 326 * prior to calling this routine 327 * 328 * The value returned is the pointer to the first non \a 329 * character matched. 330 */ 331 332 boolean _escaped; /* true if we are currently _escaped */ 333 char *_start; /* start of string */ 334 335 char * 336 expmatch (s, re, mstring) 337 register char *s; /* string to check for a match in */ 338 register char *re; /* a converted irregular expression */ 339 register char *mstring; /* where to put whatever matches a \p */ 340 { 341 register char *cs; /* the current symbol */ 342 register char *ptr,*s1; /* temporary pointer */ 343 boolean matched; /* a temporary boolean */ 344 345 /* initial conditions */ 346 if (re == NIL) 347 return (NIL); 348 cs = re; 349 matched = FALSE; 350 351 /* loop till expression string is exhausted (or at least pretty tired) */ 352 while (*cs) { 353 switch (*cs & (OPER | STR | META)) { 354 355 /* try to match a string */ 356 case STR: 357 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 358 if (matched) { 359 360 /* hoorah it matches */ 361 s += SCNT(cs); 362 cs = SNEXT(cs); 363 } else if (*cs & ALT) { 364 365 /* alternation, skip to next expression */ 366 cs = SNEXT(cs); 367 } else if (*cs & OPT) { 368 369 /* the match is optional */ 370 cs = SNEXT(cs); 371 matched = 1; /* indicate a successful match */ 372 } else { 373 374 /* no match, error return */ 375 return (NIL); 376 } 377 break; 378 379 /* an operator, do something fancy */ 380 case OPER: 381 switch (OSYM(cs)) { 382 383 /* this is an alternation */ 384 case '|': 385 if (matched) 386 387 /* last thing in the alternation was a match, skip ahead */ 388 cs = OPTR(cs); 389 else 390 391 /* no match, keep trying */ 392 cs = ONEXT(cs); 393 break; 394 395 /* this is a grouping, recurse */ 396 case '(': 397 ptr = expmatch (s, ONEXT(cs), mstring); 398 if (ptr != NIL) { 399 400 /* the subexpression matched */ 401 matched = 1; 402 s = ptr; 403 } else if (*cs & ALT) { 404 405 /* alternation, skip to next expression */ 406 matched = 0; 407 } else if (*cs & OPT) { 408 409 /* the match is optional */ 410 matched = 1; /* indicate a successful match */ 411 } else { 412 413 /* no match, error return */ 414 return (NIL); 415 } 416 cs = OPTR(cs); 417 break; 418 } 419 break; 420 421 /* try to match a metasymbol */ 422 case META: 423 switch (MSYM(cs)) { 424 425 /* try to match anything and remember what was matched */ 426 case 'p': 427 /* 428 * This is really the same as trying the match the 429 * remaining parts of the expression to any subset 430 * of the string. 431 */ 432 s1 = s; 433 do { 434 ptr = expmatch (s1, MNEXT(cs), mstring); 435 if (ptr != NIL && s1 != s) { 436 437 /* we have a match, remember the match */ 438 strncpy (mstring, s, s1 - s); 439 mstring[s1 - s] = '\0'; 440 return (ptr); 441 } else if (ptr != NIL && (*cs & OPT)) { 442 443 /* it was aoptional so no match is ok */ 444 return (ptr); 445 } else if (ptr != NIL) { 446 447 /* not optional and we still matched */ 448 return (NIL); 449 } 450 if (!isalnum(*s1) && *s1 != '_') 451 return (NIL); 452 if (*s1 == '\\') 453 _escaped = _escaped ? FALSE : TRUE; 454 else 455 _escaped = FALSE; 456 } while (*s1++); 457 return (NIL); 458 459 /* try to match anything */ 460 case 'a': 461 /* 462 * This is really the same as trying the match the 463 * remaining parts of the expression to any subset 464 * of the string. 465 */ 466 s1 = s; 467 do { 468 ptr = expmatch (s1, MNEXT(cs), mstring); 469 if (ptr != NIL && s1 != s) { 470 471 /* we have a match */ 472 return (ptr); 473 } else if (ptr != NIL && (*cs & OPT)) { 474 475 /* it was aoptional so no match is ok */ 476 return (ptr); 477 } else if (ptr != NIL) { 478 479 /* not optional and we still matched */ 480 return (NIL); 481 } 482 if (*s1 == '\\') 483 _escaped = _escaped ? FALSE : TRUE; 484 else 485 _escaped = FALSE; 486 } while (*s1++); 487 return (NIL); 488 489 /* fail if we are currently _escaped */ 490 case 'e': 491 if (_escaped) 492 return(NIL); 493 cs = MNEXT(cs); 494 break; 495 496 /* match any number of tabs and spaces */ 497 case 'd': 498 ptr = s; 499 while (*s == ' ' || *s == '\t') 500 s++; 501 if (s != ptr || s == _start) { 502 503 /* match, be happy */ 504 matched = 1; 505 cs = MNEXT(cs); 506 } else if (*s == '\n' || *s == '\0') { 507 508 /* match, be happy */ 509 matched = 1; 510 cs = MNEXT(cs); 511 } else if (*cs & ALT) { 512 513 /* try the next part */ 514 matched = 0; 515 cs = MNEXT(cs); 516 } else if (*cs & OPT) { 517 518 /* doesn't matter */ 519 matched = 1; 520 cs = MNEXT(cs); 521 } else 522 523 /* no match, error return */ 524 return (NIL); 525 break; 526 527 /* check for end of line */ 528 case '$': 529 if (*s == '\0' || *s == '\n') { 530 531 /* match, be happy */ 532 s++; 533 matched = 1; 534 cs = MNEXT(cs); 535 } else if (*cs & ALT) { 536 537 /* try the next part */ 538 matched = 0; 539 cs = MNEXT(cs); 540 } else if (*cs & OPT) { 541 542 /* doesn't matter */ 543 matched = 1; 544 cs = MNEXT(cs); 545 } else 546 547 /* no match, error return */ 548 return (NIL); 549 break; 550 551 /* check for start of line */ 552 case '^': 553 if (s == _start) { 554 555 /* match, be happy */ 556 matched = 1; 557 cs = MNEXT(cs); 558 } else if (*cs & ALT) { 559 560 /* try the next part */ 561 matched = 0; 562 cs = MNEXT(cs); 563 } else if (*cs & OPT) { 564 565 /* doesn't matter */ 566 matched = 1; 567 cs = MNEXT(cs); 568 } else 569 570 /* no match, error return */ 571 return (NIL); 572 break; 573 574 /* end of a subexpression, return success */ 575 case ')': 576 return (s); 577 } 578 break; 579 } 580 } 581 return (s); 582 } 583