1 /* $NetBSD: regexp.c,v 1.8 2004/11/04 07:44:47 dsl Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 #ifndef lint 35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\ 36 The Regents of the University of California. All rights reserved.\n"); 37 #endif /* not lint */ 38 39 #ifndef lint 40 #if 0 41 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 42 #endif 43 __RCSID("$NetBSD: regexp.c,v 1.8 2004/11/04 07:44:47 dsl Exp $"); 44 #endif /* not lint */ 45 46 #include <ctype.h> 47 #include <stdlib.h> 48 #include <string.h> 49 #include "extern.h" 50 51 #define FALSE 0 52 #define TRUE !(FALSE) 53 #define NIL 0 54 55 static void expconv __P((void)); 56 57 boolean x_escaped; /* true if we are currently x_escaped */ 58 char *x_start; /* start of string */ 59 boolean l_onecase; /* true if upper and lower equivalent */ 60 61 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c)) 62 63 /* STRNCMP - like strncmp except that we convert the 64 * first string to lower case before comparing 65 * if l_onecase is set. 66 */ 67 68 int 69 STRNCMP(s1, s2, len) 70 char *s1,*s2; 71 int len; 72 { 73 if (l_onecase) { 74 do 75 if (*s2 - makelower(*s1)) 76 return (*s2 - makelower(*s1)); 77 else { 78 s2++; 79 s1++; 80 } 81 while (--len); 82 } else { 83 do 84 if (*s2 - *s1) 85 return (*s2 - *s1); 86 else { 87 s2++; 88 s1++; 89 } 90 while (--len); 91 } 92 return(0); 93 } 94 95 /* The following routine converts an irregular expression to 96 * internal format. 97 * 98 * Either meta symbols (\a \d or \p) or character strings or 99 * operations ( alternation or perenthesizing ) can be 100 * specified. Each starts with a descriptor byte. The descriptor 101 * byte has STR set for strings, META set for meta symbols 102 * and OPER set for operations. 103 * The descriptor byte can also have the OPT bit set if the object 104 * defined is optional. Also ALT can be set to indicate an alternation. 105 * 106 * For metasymbols the byte following the descriptor byte identities 107 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 108 * strings the byte after the descriptor is a character count for 109 * the string: 110 * 111 * meta symbols := descriptor 112 * symbol 113 * 114 * strings := descriptor 115 * character count 116 * the string 117 * 118 * operatins := descriptor 119 * symbol 120 * character count 121 */ 122 123 /* 124 * handy macros for accessing parts of match blocks 125 */ 126 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 127 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 128 129 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 130 #define OCNT(A) (*(A+2)) /* character count */ 131 #define ONEXT(A) (A+3) /* next character after the operation */ 132 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 133 134 #define SCNT(A) (*(A+1)) /* byte count of a string */ 135 #define SSTR(A) (A+2) /* address of the string */ 136 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 137 138 /* 139 * bit flags in the descriptor 140 */ 141 #define OPT 1 142 #define STR 2 143 #define META 4 144 #define ALT 8 145 #define OPER 16 146 147 static char *ccre; /* pointer to current position in converted exp*/ 148 static char *ure; /* pointer current position in unconverted exp */ 149 150 char * 151 convexp(re) 152 char *re; /* unconverted irregular expression */ 153 { 154 char *cre; /* pointer to converted regular expression */ 155 156 /* allocate room for the converted expression */ 157 if (re == NIL) 158 return (NIL); 159 if (*re == '\0') 160 return (NIL); 161 cre = malloc(4 * strlen(re) + 3); 162 ccre = cre; 163 ure = re; 164 165 /* start the conversion with a \a */ 166 *cre = META | OPT; 167 MSYM(cre) = 'a'; 168 ccre = MNEXT(cre); 169 170 /* start the conversion (its recursive) */ 171 expconv(); 172 *ccre = 0; 173 return (cre); 174 } 175 176 static void 177 expconv() 178 { 179 char *cs; /* pointer to current symbol in converted exp */ 180 char c; /* character being processed */ 181 char *acs; /* pinter to last alternate */ 182 int temp; 183 184 /* let the conversion begin */ 185 acs = NIL; 186 cs = NIL; 187 while (*ure != NIL) { 188 switch (c = *ure++) { 189 190 case '\\': 191 switch (c = *ure++) { 192 193 /* escaped characters are just characters */ 194 default: 195 if (cs == NIL || (*cs & STR) == 0) { 196 cs = ccre; 197 *cs = STR; 198 SCNT(cs) = 1; 199 ccre += 2; 200 } else 201 SCNT(cs)++; 202 *ccre++ = c; 203 break; 204 205 /* normal(?) metacharacters */ 206 case 'a': 207 case 'd': 208 case 'e': 209 case 'p': 210 if (acs != NIL && acs != cs) { 211 do { 212 temp = OCNT(acs); 213 OCNT(acs) = ccre - acs; 214 acs -= temp; 215 } while (temp != 0); 216 acs = NIL; 217 } 218 cs = ccre; 219 *cs = META; 220 MSYM(cs) = c; 221 ccre = MNEXT(cs); 222 break; 223 } 224 break; 225 226 /* just put the symbol in */ 227 case '^': 228 case '$': 229 if (acs != NIL && acs != cs) { 230 do { 231 temp = OCNT(acs); 232 OCNT(acs) = ccre - acs; 233 acs -= temp; 234 } while (temp != 0); 235 acs = NIL; 236 } 237 cs = ccre; 238 *cs = META; 239 MSYM(cs) = c; 240 ccre = MNEXT(cs); 241 break; 242 243 /* mark the last match sequence as optional */ 244 case '?': 245 if (cs) 246 *cs = *cs | OPT; 247 break; 248 249 /* recurse and define a subexpression */ 250 case '(': 251 if (acs != NIL && acs != cs) { 252 do { 253 temp = OCNT(acs); 254 OCNT(acs) = ccre - acs; 255 acs -= temp; 256 } while (temp != 0); 257 acs = NIL; 258 } 259 cs = ccre; 260 *cs = OPER; 261 OSYM(cs) = '('; 262 ccre = ONEXT(cs); 263 expconv(); 264 OCNT(cs) = ccre - cs; /* offset to next symbol */ 265 break; 266 267 /* reurn from a recursion */ 268 case ')': 269 if (acs != NIL) { 270 do { 271 temp = OCNT(acs); 272 OCNT(acs) = ccre - acs; 273 acs -= temp; 274 } while (temp != 0); 275 acs = NIL; 276 } 277 cs = ccre; 278 *cs = META; 279 MSYM(cs) = c; 280 ccre = MNEXT(cs); 281 return; 282 283 /* mark the last match sequence as having an alternate */ 284 /* the third byte will contain an offset to jump over the */ 285 /* alternate match in case the first did not fail */ 286 case '|': 287 if (acs != NIL && acs != cs) 288 OCNT(ccre) = ccre - acs; /* make a back pointer */ 289 else 290 OCNT(ccre) = 0; 291 *cs |= ALT; 292 cs = ccre; 293 *cs = OPER; 294 OSYM(cs) = '|'; 295 ccre = ONEXT(cs); 296 acs = cs; /* remember that the pointer is to be filles */ 297 break; 298 299 /* if its not a metasymbol just build a scharacter string */ 300 default: 301 if (cs == NIL || (*cs & STR) == 0) { 302 cs = ccre; 303 *cs = STR; 304 SCNT(cs) = 1; 305 ccre = SSTR(cs); 306 } else 307 SCNT(cs)++; 308 *ccre++ = c; 309 break; 310 } 311 } 312 if (acs != NIL) { 313 do { 314 temp = OCNT(acs); 315 OCNT(acs) = ccre - acs; 316 acs -= temp; 317 } while (temp != 0); 318 acs = NIL; 319 } 320 return; 321 } 322 /* end of convertre */ 323 324 325 /* 326 * The following routine recognises an irregular expresion 327 * with the following special characters: 328 * 329 * \? - means last match was optional 330 * \a - matches any number of characters 331 * \d - matches any number of spaces and tabs 332 * \p - matches any number of alphanumeric 333 * characters. The 334 * characters matched will be copied into 335 * the area pointed to by 'name'. 336 * \| - alternation 337 * \( \) - grouping used mostly for alternation and 338 * optionality 339 * 340 * The irregular expression must be translated to internal form 341 * prior to calling this routine 342 * 343 * The value returned is the pointer to the first non \a 344 * character matched. 345 */ 346 347 char * 348 expmatch(s, re, mstring) 349 char *s; /* string to check for a match in */ 350 char *re; /* a converted irregular expression */ 351 char *mstring; /* where to put whatever matches a \p */ 352 { 353 char *cs; /* the current symbol */ 354 char *ptr,*s1; /* temporary pointer */ 355 boolean matched; /* a temporary boolean */ 356 357 /* initial conditions */ 358 if (re == NIL) 359 return (NIL); 360 cs = re; 361 matched = FALSE; 362 363 /* loop till expression string is exhausted (or at least pretty tired) */ 364 while (*cs) { 365 switch (*cs & (OPER | STR | META)) { 366 367 /* try to match a string */ 368 case STR: 369 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 370 if (matched) { 371 372 /* hoorah it matches */ 373 s += SCNT(cs); 374 cs = SNEXT(cs); 375 } else if (*cs & ALT) { 376 377 /* alternation, skip to next expression */ 378 cs = SNEXT(cs); 379 } else if (*cs & OPT) { 380 381 /* the match is optional */ 382 cs = SNEXT(cs); 383 matched = 1; /* indicate a successful match */ 384 } else { 385 386 /* no match, error return */ 387 return (NIL); 388 } 389 break; 390 391 /* an operator, do something fancy */ 392 case OPER: 393 switch (OSYM(cs)) { 394 395 /* this is an alternation */ 396 case '|': 397 if (matched) 398 399 /* last thing in the alternation was a match, skip ahead */ 400 cs = OPTR(cs); 401 else 402 403 /* no match, keep trying */ 404 cs = ONEXT(cs); 405 break; 406 407 /* this is a grouping, recurse */ 408 case '(': 409 ptr = expmatch(s, ONEXT(cs), mstring); 410 if (ptr != NIL) { 411 412 /* the subexpression matched */ 413 matched = 1; 414 s = ptr; 415 } else if (*cs & ALT) { 416 417 /* alternation, skip to next expression */ 418 matched = 0; 419 } else if (*cs & OPT) { 420 421 /* the match is optional */ 422 matched = 1; /* indicate a successful match */ 423 } else { 424 425 /* no match, error return */ 426 return (NIL); 427 } 428 cs = OPTR(cs); 429 break; 430 } 431 break; 432 433 /* try to match a metasymbol */ 434 case META: 435 switch (MSYM(cs)) { 436 437 /* try to match anything and remember what was matched */ 438 case 'p': 439 /* 440 * This is really the same as trying the match the 441 * remaining parts of the expression to any subset 442 * of the string. 443 */ 444 s1 = s; 445 do { 446 ptr = expmatch(s1, MNEXT(cs), mstring); 447 if (ptr != NIL && s1 != s) { 448 449 /* we have a match, remember the match */ 450 strncpy(mstring, s, s1 - s); 451 mstring[s1 - s] = '\0'; 452 return (ptr); 453 } else if (ptr != NIL && (*cs & OPT)) { 454 455 /* it was aoptional so no match is ok */ 456 return (ptr); 457 } else if (ptr != NIL) { 458 459 /* not optional and we still matched */ 460 return (NIL); 461 } 462 if (!isalnum((unsigned char)*s1) && *s1 != '_') 463 return (NIL); 464 if (*s1 == '\\') 465 x_escaped = x_escaped ? FALSE : TRUE; 466 else 467 x_escaped = FALSE; 468 } while (*s1++); 469 return (NIL); 470 471 /* try to match anything */ 472 case 'a': 473 /* 474 * This is really the same as trying the match the 475 * remaining parts of the expression to any subset 476 * of the string. 477 */ 478 s1 = s; 479 do { 480 ptr = expmatch(s1, MNEXT(cs), mstring); 481 if (ptr != NIL && s1 != s) { 482 483 /* we have a match */ 484 return (ptr); 485 } else if (ptr != NIL && (*cs & OPT)) { 486 487 /* it was aoptional so no match is ok */ 488 return (ptr); 489 } else if (ptr != NIL) { 490 491 /* not optional and we still matched */ 492 return (NIL); 493 } 494 if (*s1 == '\\') 495 x_escaped = x_escaped ? FALSE : TRUE; 496 else 497 x_escaped = FALSE; 498 } while (*s1++); 499 return (NIL); 500 501 /* fail if we are currently x_escaped */ 502 case 'e': 503 if (x_escaped) 504 return(NIL); 505 cs = MNEXT(cs); 506 break; 507 508 /* match any number of tabs and spaces */ 509 case 'd': 510 ptr = s; 511 while (*s == ' ' || *s == '\t') 512 s++; 513 if (s != ptr || s == x_start) { 514 515 /* match, be happy */ 516 matched = 1; 517 cs = MNEXT(cs); 518 } else if (*s == '\n' || *s == '\0') { 519 520 /* match, be happy */ 521 matched = 1; 522 cs = MNEXT(cs); 523 } else if (*cs & ALT) { 524 525 /* try the next part */ 526 matched = 0; 527 cs = MNEXT(cs); 528 } else if (*cs & OPT) { 529 530 /* doesn't matter */ 531 matched = 1; 532 cs = MNEXT(cs); 533 } else 534 535 /* no match, error return */ 536 return (NIL); 537 break; 538 539 /* check for end of line */ 540 case '$': 541 if (*s == '\0' || *s == '\n') { 542 543 /* match, be happy */ 544 s++; 545 matched = 1; 546 cs = MNEXT(cs); 547 } else if (*cs & ALT) { 548 549 /* try the next part */ 550 matched = 0; 551 cs = MNEXT(cs); 552 } else if (*cs & OPT) { 553 554 /* doesn't matter */ 555 matched = 1; 556 cs = MNEXT(cs); 557 } else 558 559 /* no match, error return */ 560 return (NIL); 561 break; 562 563 /* check for start of line */ 564 case '^': 565 if (s == x_start) { 566 567 /* match, be happy */ 568 matched = 1; 569 cs = MNEXT(cs); 570 } else if (*cs & ALT) { 571 572 /* try the next part */ 573 matched = 0; 574 cs = MNEXT(cs); 575 } else if (*cs & OPT) { 576 577 /* doesn't matter */ 578 matched = 1; 579 cs = MNEXT(cs); 580 } else 581 582 /* no match, error return */ 583 return (NIL); 584 break; 585 586 /* end of a subexpression, return success */ 587 case ')': 588 return (s); 589 } 590 break; 591 } 592 } 593 return (s); 594 } 595