1 /* 2 * Copyright (c) 1980 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 #ifndef lint 35 /*static char sccsid[] = "from: @(#)regexp.c 5.3 (Berkeley) 6/1/90";*/ 36 static char rcsid[] = "$Id: regexp.c,v 1.2 1993/08/01 18:03:29 mycroft Exp $"; 37 #endif /* not lint */ 38 39 #include <ctype.h> 40 41 typedef int boolean; 42 #define TRUE 1 43 #define FALSE 0 44 #define NIL 0 45 46 boolean l_onecase; /* true if upper and lower equivalent */ 47 48 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 49 50 /* STRNCMP - like strncmp except that we convert the 51 * first string to lower case before comparing 52 * if l_onecase is set. 53 */ 54 55 STRNCMP(s1, s2, len) 56 register char *s1,*s2; 57 register int len; 58 { 59 if (l_onecase) { 60 do 61 if (*s2 - makelower(*s1)) 62 return (*s2 - makelower(*s1)); 63 else { 64 s2++; 65 s1++; 66 } 67 while (--len); 68 } else { 69 do 70 if (*s2 - *s1) 71 return (*s2 - *s1); 72 else { 73 s2++; 74 s1++; 75 } 76 while (--len); 77 } 78 return(0); 79 } 80 81 /* The following routine converts an irregular expression to 82 * internal format. 83 * 84 * Either meta symbols (\a \d or \p) or character strings or 85 * operations ( alternation or perenthesizing ) can be 86 * specified. Each starts with a descriptor byte. The descriptor 87 * byte has STR set for strings, META set for meta symbols 88 * and OPER set for operations. 89 * The descriptor byte can also have the OPT bit set if the object 90 * defined is optional. Also ALT can be set to indicate an alternation. 91 * 92 * For metasymbols the byte following the descriptor byte identities 93 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 94 * strings the byte after the descriptor is a character count for 95 * the string: 96 * 97 * meta symbols := descriptor 98 * symbol 99 * 100 * strings := descriptor 101 * character count 102 * the string 103 * 104 * operatins := descriptor 105 * symbol 106 * character count 107 */ 108 109 /* 110 * handy macros for accessing parts of match blocks 111 */ 112 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 113 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 114 115 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 116 #define OCNT(A) (*(A+2)) /* character count */ 117 #define ONEXT(A) (A+3) /* next character after the operation */ 118 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 119 120 #define SCNT(A) (*(A+1)) /* byte count of a string */ 121 #define SSTR(A) (A+2) /* address of the string */ 122 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 123 124 /* 125 * bit flags in the descriptor 126 */ 127 #define OPT 1 128 #define STR 2 129 #define META 4 130 #define ALT 8 131 #define OPER 16 132 133 char *ure; /* pointer current position in unconverted exp */ 134 char *ccre; /* pointer to current position in converted exp*/ 135 char *malloc(); 136 137 char * 138 convexp(re) 139 char *re; /* unconverted irregular expression */ 140 { 141 register char *cre; /* pointer to converted regular expression */ 142 143 /* allocate room for the converted expression */ 144 if (re == NIL) 145 return (NIL); 146 if (*re == '\0') 147 return (NIL); 148 cre = malloc (4 * strlen(re) + 3); 149 ccre = cre; 150 ure = re; 151 152 /* start the conversion with a \a */ 153 *cre = META | OPT; 154 MSYM(cre) = 'a'; 155 ccre = MNEXT(cre); 156 157 /* start the conversion (its recursive) */ 158 expconv (); 159 *ccre = 0; 160 return (cre); 161 } 162 163 expconv() 164 { 165 register char *cs; /* pointer to current symbol in converted exp */ 166 register char c; /* character being processed */ 167 register char *acs; /* pinter to last alternate */ 168 register int temp; 169 170 /* let the conversion begin */ 171 acs = NIL; 172 cs = NIL; 173 while (*ure != NIL) { 174 switch (c = *ure++) { 175 176 case '\\': 177 switch (c = *ure++) { 178 179 /* escaped characters are just characters */ 180 default: 181 if (cs == NIL || (*cs & STR) == 0) { 182 cs = ccre; 183 *cs = STR; 184 SCNT(cs) = 1; 185 ccre += 2; 186 } else 187 SCNT(cs)++; 188 *ccre++ = c; 189 break; 190 191 /* normal(?) metacharacters */ 192 case 'a': 193 case 'd': 194 case 'e': 195 case 'p': 196 if (acs != NIL && acs != cs) { 197 do { 198 temp = OCNT(acs); 199 OCNT(acs) = ccre - acs; 200 acs -= temp; 201 } while (temp != 0); 202 acs = NIL; 203 } 204 cs = ccre; 205 *cs = META; 206 MSYM(cs) = c; 207 ccre = MNEXT(cs); 208 break; 209 } 210 break; 211 212 /* just put the symbol in */ 213 case '^': 214 case '$': 215 if (acs != NIL && acs != cs) { 216 do { 217 temp = OCNT(acs); 218 OCNT(acs) = ccre - acs; 219 acs -= temp; 220 } while (temp != 0); 221 acs = NIL; 222 } 223 cs = ccre; 224 *cs = META; 225 MSYM(cs) = c; 226 ccre = MNEXT(cs); 227 break; 228 229 /* mark the last match sequence as optional */ 230 case '?': 231 if (cs) 232 *cs = *cs | OPT; 233 break; 234 235 /* recurse and define a subexpression */ 236 case '(': 237 if (acs != NIL && acs != cs) { 238 do { 239 temp = OCNT(acs); 240 OCNT(acs) = ccre - acs; 241 acs -= temp; 242 } while (temp != 0); 243 acs = NIL; 244 } 245 cs = ccre; 246 *cs = OPER; 247 OSYM(cs) = '('; 248 ccre = ONEXT(cs); 249 expconv (); 250 OCNT(cs) = ccre - cs; /* offset to next symbol */ 251 break; 252 253 /* return from a recursion */ 254 case ')': 255 if (acs != NIL) { 256 do { 257 temp = OCNT(acs); 258 OCNT(acs) = ccre - acs; 259 acs -= temp; 260 } while (temp != 0); 261 acs = NIL; 262 } 263 cs = ccre; 264 *cs = META; 265 MSYM(cs) = c; 266 ccre = MNEXT(cs); 267 return; 268 269 /* mark the last match sequence as having an alternate */ 270 /* the third byte will contain an offset to jump over the */ 271 /* alternate match in case the first did not fail */ 272 case '|': 273 if (acs != NIL && acs != cs) 274 OCNT(ccre) = ccre - acs; /* make a back pointer */ 275 else 276 OCNT(ccre) = 0; 277 *cs |= ALT; 278 cs = ccre; 279 *cs = OPER; 280 OSYM(cs) = '|'; 281 ccre = ONEXT(cs); 282 acs = cs; /* remember that the pointer is to be filles */ 283 break; 284 285 /* if its not a metasymbol just build a scharacter string */ 286 default: 287 if (cs == NIL || (*cs & STR) == 0) { 288 cs = ccre; 289 *cs = STR; 290 SCNT(cs) = 1; 291 ccre = SSTR(cs); 292 } else 293 SCNT(cs)++; 294 *ccre++ = c; 295 break; 296 } 297 } 298 if (acs != NIL) { 299 do { 300 temp = OCNT(acs); 301 OCNT(acs) = ccre - acs; 302 acs -= temp; 303 } while (temp != 0); 304 acs = NIL; 305 } 306 return; 307 } 308 /* end of convertre */ 309 310 311 /* 312 * The following routine recognises an irregular expresion 313 * with the following special characters: 314 * 315 * \? - means last match was optional 316 * \a - matches any number of characters 317 * \d - matches any number of spaces and tabs 318 * \p - matches any number of alphanumeric 319 * characters. The 320 * characters matched will be copied into 321 * the area pointed to by 'name'. 322 * \| - alternation 323 * \( \) - grouping used mostly for alternation and 324 * optionality 325 * 326 * The irregular expression must be translated to internal form 327 * prior to calling this routine 328 * 329 * The value returned is the pointer to the first non \a 330 * character matched. 331 */ 332 333 boolean _escaped; /* true if we are currently _escaped */ 334 char *_start; /* start of string */ 335 336 char * 337 expmatch (s, re, mstring) 338 register char *s; /* string to check for a match in */ 339 register char *re; /* a converted irregular expression */ 340 register char *mstring; /* where to put whatever matches a \p */ 341 { 342 register char *cs; /* the current symbol */ 343 register char *ptr,*s1; /* temporary pointer */ 344 boolean matched; /* a temporary boolean */ 345 346 /* initial conditions */ 347 if (re == NIL) 348 return (NIL); 349 cs = re; 350 matched = FALSE; 351 352 /* loop till expression string is exhausted (or at least pretty tired) */ 353 while (*cs) { 354 switch (*cs & (OPER | STR | META)) { 355 356 /* try to match a string */ 357 case STR: 358 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 359 if (matched) { 360 361 /* hoorah it matches */ 362 s += SCNT(cs); 363 cs = SNEXT(cs); 364 } else if (*cs & ALT) { 365 366 /* alternation, skip to next expression */ 367 cs = SNEXT(cs); 368 } else if (*cs & OPT) { 369 370 /* the match is optional */ 371 cs = SNEXT(cs); 372 matched = 1; /* indicate a successful match */ 373 } else { 374 375 /* no match, error return */ 376 return (NIL); 377 } 378 break; 379 380 /* an operator, do something fancy */ 381 case OPER: 382 switch (OSYM(cs)) { 383 384 /* this is an alternation */ 385 case '|': 386 if (matched) 387 388 /* last thing in the alternation was a match, skip ahead */ 389 cs = OPTR(cs); 390 else 391 392 /* no match, keep trying */ 393 cs = ONEXT(cs); 394 break; 395 396 /* this is a grouping, recurse */ 397 case '(': 398 ptr = expmatch (s, ONEXT(cs), mstring); 399 if (ptr != NIL) { 400 401 /* the subexpression matched */ 402 matched = 1; 403 s = ptr; 404 } else if (*cs & ALT) { 405 406 /* alternation, skip to next expression */ 407 matched = 0; 408 } else if (*cs & OPT) { 409 410 /* the match is optional */ 411 matched = 1; /* indicate a successful match */ 412 } else { 413 414 /* no match, error return */ 415 return (NIL); 416 } 417 cs = OPTR(cs); 418 break; 419 } 420 break; 421 422 /* try to match a metasymbol */ 423 case META: 424 switch (MSYM(cs)) { 425 426 /* try to match anything and remember what was matched */ 427 case 'p': 428 /* 429 * This is really the same as trying the match the 430 * remaining parts of the expression to any subset 431 * of the string. 432 */ 433 s1 = s; 434 do { 435 ptr = expmatch (s1, MNEXT(cs), mstring); 436 if (ptr != NIL && s1 != s) { 437 438 /* we have a match, remember the match */ 439 strncpy (mstring, s, s1 - s); 440 mstring[s1 - s] = '\0'; 441 return (ptr); 442 } else if (ptr != NIL && (*cs & OPT)) { 443 444 /* it was aoptional so no match is ok */ 445 return (ptr); 446 } else if (ptr != NIL) { 447 448 /* not optional and we still matched */ 449 return (NIL); 450 } 451 if (!isalnum(*s1) && *s1 != '_') 452 return (NIL); 453 if (*s1 == '\\') 454 _escaped = _escaped ? FALSE : TRUE; 455 else 456 _escaped = FALSE; 457 } while (*s1++); 458 return (NIL); 459 460 /* try to match anything */ 461 case 'a': 462 /* 463 * This is really the same as trying the match the 464 * remaining parts of the expression to any subset 465 * of the string. 466 */ 467 s1 = s; 468 do { 469 ptr = expmatch (s1, MNEXT(cs), mstring); 470 if (ptr != NIL && s1 != s) { 471 472 /* we have a match */ 473 return (ptr); 474 } else if (ptr != NIL && (*cs & OPT)) { 475 476 /* it was aoptional so no match is ok */ 477 return (ptr); 478 } else if (ptr != NIL) { 479 480 /* not optional and we still matched */ 481 return (NIL); 482 } 483 if (*s1 == '\\') 484 _escaped = _escaped ? FALSE : TRUE; 485 else 486 _escaped = FALSE; 487 } while (*s1++); 488 return (NIL); 489 490 /* fail if we are currently _escaped */ 491 case 'e': 492 if (_escaped) 493 return(NIL); 494 cs = MNEXT(cs); 495 break; 496 497 /* match any number of tabs and spaces */ 498 case 'd': 499 ptr = s; 500 while (*s == ' ' || *s == '\t') 501 s++; 502 if (s != ptr || s == _start) { 503 504 /* match, be happy */ 505 matched = 1; 506 cs = MNEXT(cs); 507 } else if (*s == '\n' || *s == '\0') { 508 509 /* match, be happy */ 510 matched = 1; 511 cs = MNEXT(cs); 512 } else if (*cs & ALT) { 513 514 /* try the next part */ 515 matched = 0; 516 cs = MNEXT(cs); 517 } else if (*cs & OPT) { 518 519 /* doesn't matter */ 520 matched = 1; 521 cs = MNEXT(cs); 522 } else 523 524 /* no match, error return */ 525 return (NIL); 526 break; 527 528 /* check for end of line */ 529 case '$': 530 if (*s == '\0' || *s == '\n') { 531 532 /* match, be happy */ 533 s++; 534 matched = 1; 535 cs = MNEXT(cs); 536 } else if (*cs & ALT) { 537 538 /* try the next part */ 539 matched = 0; 540 cs = MNEXT(cs); 541 } else if (*cs & OPT) { 542 543 /* doesn't matter */ 544 matched = 1; 545 cs = MNEXT(cs); 546 } else 547 548 /* no match, error return */ 549 return (NIL); 550 break; 551 552 /* check for start of line */ 553 case '^': 554 if (s == _start) { 555 556 /* match, be happy */ 557 matched = 1; 558 cs = MNEXT(cs); 559 } else if (*cs & ALT) { 560 561 /* try the next part */ 562 matched = 0; 563 cs = MNEXT(cs); 564 } else if (*cs & OPT) { 565 566 /* doesn't matter */ 567 matched = 1; 568 cs = MNEXT(cs); 569 } else 570 571 /* no match, error return */ 572 return (NIL); 573 break; 574 575 /* end of a subexpression, return success */ 576 case ')': 577 return (s); 578 } 579 break; 580 } 581 } 582 return (s); 583 } 584