1 /* $NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 #ifndef lint 35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\ 36 The Regents of the University of California. All rights reserved."); 37 #endif /* not lint */ 38 39 #ifndef lint 40 #if 0 41 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 42 #endif 43 __RCSID("$NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $"); 44 #endif /* not lint */ 45 46 #include <assert.h> 47 #include <ctype.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include "extern.h" 51 52 #define FALSE 0 53 #define TRUE !(FALSE) 54 #define NIL 0 55 56 static void expconv __P((void)); 57 58 boolean x_escaped; /* true if we are currently x_escaped */ 59 char *x_start; /* start of string */ 60 boolean l_onecase; /* true if upper and lower equivalent */ 61 62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c)) 63 64 /* STRNCMP - like strncmp except that we convert the 65 * first string to lower case before comparing 66 * if l_onecase is set. 67 */ 68 69 int 70 STRNCMP(char *s1, char *s2, int len) 71 { 72 if (l_onecase) { 73 do 74 if (*s2 - makelower(*s1)) 75 return (*s2 - makelower(*s1)); 76 else { 77 s2++; 78 s1++; 79 } 80 while (--len); 81 } else { 82 do 83 if (*s2 - *s1) 84 return (*s2 - *s1); 85 else { 86 s2++; 87 s1++; 88 } 89 while (--len); 90 } 91 return(0); 92 } 93 94 /* The following routine converts an irregular expression to 95 * internal format. 96 * 97 * Either meta symbols (\a \d or \p) or character strings or 98 * operations ( alternation or perenthesizing ) can be 99 * specified. Each starts with a descriptor byte. The descriptor 100 * byte has STR set for strings, META set for meta symbols 101 * and OPER set for operations. 102 * The descriptor byte can also have the OPT bit set if the object 103 * defined is optional. Also ALT can be set to indicate an alternation. 104 * 105 * For metasymbols the byte following the descriptor byte identities 106 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 107 * strings the byte after the descriptor is a character count for 108 * the string: 109 * 110 * meta symbols := descriptor 111 * symbol 112 * 113 * strings := descriptor 114 * character count 115 * the string 116 * 117 * operatins := descriptor 118 * symbol 119 * character count 120 */ 121 122 /* 123 * handy macros for accessing parts of match blocks 124 */ 125 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 126 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 127 128 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 129 #define OCNT(A) (*(A+2)) /* character count */ 130 #define ONEXT(A) (A+3) /* next character after the operation */ 131 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 132 133 #define SCNT(A) (*(A+1)) /* byte count of a string */ 134 #define SSTR(A) (A+2) /* address of the string */ 135 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 136 137 /* 138 * bit flags in the descriptor 139 */ 140 #define OPT 1 141 #define STR 2 142 #define META 4 143 #define ALT 8 144 #define OPER 16 145 146 static char *ccre; /* pointer to current position in converted exp*/ 147 static char *ure; /* pointer current position in unconverted exp */ 148 149 char * 150 convexp(char *re) /* unconverted irregular expression */ 151 { 152 char *cre; /* pointer to converted regular expression */ 153 154 /* allocate room for the converted expression */ 155 if (re == NIL) 156 return (NIL); 157 if (*re == '\0') 158 return (NIL); 159 cre = malloc(4 * strlen(re) + 3); 160 ccre = cre; 161 ure = re; 162 163 /* start the conversion with a \a */ 164 *cre = META | OPT; 165 MSYM(cre) = 'a'; 166 ccre = MNEXT(cre); 167 168 /* start the conversion (its recursive) */ 169 expconv(); 170 *ccre = 0; 171 return (cre); 172 } 173 174 static void 175 expconv(void) 176 { 177 char *cs; /* pointer to current symbol in converted exp */ 178 char c; /* character being processed */ 179 char *acs; /* pinter to last alternate */ 180 int temp; 181 182 /* let the conversion begin */ 183 acs = NIL; 184 cs = NIL; 185 while (*ure != NIL) { 186 switch (c = *ure++) { 187 188 case '\\': 189 switch (c = *ure++) { 190 191 /* escaped characters are just characters */ 192 default: 193 if (cs == NIL || (*cs & STR) == 0) { 194 cs = ccre; 195 *cs = STR; 196 SCNT(cs) = 1; 197 ccre += 2; 198 } else 199 SCNT(cs)++; 200 *ccre++ = c; 201 break; 202 203 /* normal(?) metacharacters */ 204 case 'a': 205 case 'd': 206 case 'e': 207 case 'p': 208 if (acs != NIL && acs != cs) { 209 do { 210 temp = OCNT(acs); 211 OCNT(acs) = ccre - acs; 212 acs -= temp; 213 } while (temp != 0); 214 acs = NIL; 215 } 216 cs = ccre; 217 *cs = META; 218 MSYM(cs) = c; 219 ccre = MNEXT(cs); 220 break; 221 } 222 break; 223 224 /* just put the symbol in */ 225 case '^': 226 case '$': 227 if (acs != NIL && acs != cs) { 228 do { 229 temp = OCNT(acs); 230 OCNT(acs) = ccre - acs; 231 acs -= temp; 232 } while (temp != 0); 233 acs = NIL; 234 } 235 cs = ccre; 236 *cs = META; 237 MSYM(cs) = c; 238 ccre = MNEXT(cs); 239 break; 240 241 /* mark the last match sequence as optional */ 242 case '?': 243 if (cs) 244 *cs = *cs | OPT; 245 break; 246 247 /* recurse and define a subexpression */ 248 case '(': 249 if (acs != NIL && acs != cs) { 250 do { 251 temp = OCNT(acs); 252 OCNT(acs) = ccre - acs; 253 acs -= temp; 254 } while (temp != 0); 255 acs = NIL; 256 } 257 cs = ccre; 258 *cs = OPER; 259 OSYM(cs) = '('; 260 ccre = ONEXT(cs); 261 expconv(); 262 OCNT(cs) = ccre - cs; /* offset to next symbol */ 263 break; 264 265 /* reurn from a recursion */ 266 case ')': 267 if (acs != NIL) { 268 do { 269 temp = OCNT(acs); 270 OCNT(acs) = ccre - acs; 271 acs -= temp; 272 } while (temp != 0); 273 acs = NIL; 274 } 275 cs = ccre; 276 *cs = META; 277 MSYM(cs) = c; 278 ccre = MNEXT(cs); 279 return; 280 281 /* mark the last match sequence as having an alternate */ 282 /* the third byte will contain an offset to jump over the */ 283 /* alternate match in case the first did not fail */ 284 case '|': 285 if (acs != NIL && acs != cs) 286 OCNT(ccre) = ccre - acs; /* make a back pointer */ 287 else 288 OCNT(ccre) = 0; 289 assert(cs != NULL); 290 *cs |= ALT; 291 cs = ccre; 292 *cs = OPER; 293 OSYM(cs) = '|'; 294 ccre = ONEXT(cs); 295 acs = cs; /* remember that the pointer is to be filles */ 296 break; 297 298 /* if its not a metasymbol just build a scharacter string */ 299 default: 300 if (cs == NIL || (*cs & STR) == 0) { 301 cs = ccre; 302 *cs = STR; 303 SCNT(cs) = 1; 304 ccre = SSTR(cs); 305 } else 306 SCNT(cs)++; 307 *ccre++ = c; 308 break; 309 } 310 } 311 if (acs != NIL) { 312 do { 313 temp = OCNT(acs); 314 OCNT(acs) = ccre - acs; 315 acs -= temp; 316 } while (temp != 0); 317 acs = NIL; 318 } 319 return; 320 } 321 /* end of convertre */ 322 323 324 /* 325 * The following routine recognises an irregular expresion 326 * with the following special characters: 327 * 328 * \? - means last match was optional 329 * \a - matches any number of characters 330 * \d - matches any number of spaces and tabs 331 * \p - matches any number of alphanumeric 332 * characters. The 333 * characters matched will be copied into 334 * the area pointed to by 'name'. 335 * \| - alternation 336 * \( \) - grouping used mostly for alternation and 337 * optionality 338 * 339 * The irregular expression must be translated to internal form 340 * prior to calling this routine 341 * 342 * The value returned is the pointer to the first non \a 343 * character matched. 344 */ 345 346 char * 347 expmatch( 348 char *s, /* string to check for a match in */ 349 char *re, /* a converted irregular expression */ 350 char *mstring) /* where to put whatever matches a \p */ 351 { 352 char *cs; /* the current symbol */ 353 char *ptr,*s1; /* temporary pointer */ 354 boolean matched; /* a temporary boolean */ 355 356 /* initial conditions */ 357 if (re == NIL) 358 return (NIL); 359 cs = re; 360 matched = FALSE; 361 362 /* loop till expression string is exhausted (or at least pretty tired) */ 363 while (*cs) { 364 switch (*cs & (OPER | STR | META)) { 365 366 /* try to match a string */ 367 case STR: 368 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 369 if (matched) { 370 371 /* hoorah it matches */ 372 s += SCNT(cs); 373 cs = SNEXT(cs); 374 } else if (*cs & ALT) { 375 376 /* alternation, skip to next expression */ 377 cs = SNEXT(cs); 378 } else if (*cs & OPT) { 379 380 /* the match is optional */ 381 cs = SNEXT(cs); 382 matched = 1; /* indicate a successful match */ 383 } else { 384 385 /* no match, error return */ 386 return (NIL); 387 } 388 break; 389 390 /* an operator, do something fancy */ 391 case OPER: 392 switch (OSYM(cs)) { 393 394 /* this is an alternation */ 395 case '|': 396 if (matched) 397 398 /* last thing in the alternation was a match, skip ahead */ 399 cs = OPTR(cs); 400 else 401 402 /* no match, keep trying */ 403 cs = ONEXT(cs); 404 break; 405 406 /* this is a grouping, recurse */ 407 case '(': 408 ptr = expmatch(s, ONEXT(cs), mstring); 409 if (ptr != NIL) { 410 411 /* the subexpression matched */ 412 matched = 1; 413 s = ptr; 414 } else if (*cs & ALT) { 415 416 /* alternation, skip to next expression */ 417 matched = 0; 418 } else if (*cs & OPT) { 419 420 /* the match is optional */ 421 matched = 1; /* indicate a successful match */ 422 } else { 423 424 /* no match, error return */ 425 return (NIL); 426 } 427 cs = OPTR(cs); 428 break; 429 } 430 break; 431 432 /* try to match a metasymbol */ 433 case META: 434 switch (MSYM(cs)) { 435 436 /* try to match anything and remember what was matched */ 437 case 'p': 438 /* 439 * This is really the same as trying the match the 440 * remaining parts of the expression to any subset 441 * of the string. 442 */ 443 s1 = s; 444 do { 445 ptr = expmatch(s1, MNEXT(cs), mstring); 446 if (ptr != NIL && s1 != s) { 447 448 /* we have a match, remember the match */ 449 strncpy(mstring, s, s1 - s); 450 mstring[s1 - s] = '\0'; 451 return (ptr); 452 } else if (ptr != NIL && (*cs & OPT)) { 453 454 /* it was aoptional so no match is ok */ 455 return (ptr); 456 } else if (ptr != NIL) { 457 458 /* not optional and we still matched */ 459 return (NIL); 460 } 461 if (!isalnum((unsigned char)*s1) && *s1 != '_') 462 return (NIL); 463 if (*s1 == '\\') 464 x_escaped = x_escaped ? FALSE : TRUE; 465 else 466 x_escaped = FALSE; 467 } while (*s1++); 468 return (NIL); 469 470 /* try to match anything */ 471 case 'a': 472 /* 473 * This is really the same as trying the match the 474 * remaining parts of the expression to any subset 475 * of the string. 476 */ 477 s1 = s; 478 do { 479 ptr = expmatch(s1, MNEXT(cs), mstring); 480 if (ptr != NIL && s1 != s) { 481 482 /* we have a match */ 483 return (ptr); 484 } else if (ptr != NIL && (*cs & OPT)) { 485 486 /* it was aoptional so no match is ok */ 487 return (ptr); 488 } else if (ptr != NIL) { 489 490 /* not optional and we still matched */ 491 return (NIL); 492 } 493 if (*s1 == '\\') 494 x_escaped = x_escaped ? FALSE : TRUE; 495 else 496 x_escaped = FALSE; 497 } while (*s1++); 498 return (NIL); 499 500 /* fail if we are currently x_escaped */ 501 case 'e': 502 if (x_escaped) 503 return(NIL); 504 cs = MNEXT(cs); 505 break; 506 507 /* match any number of tabs and spaces */ 508 case 'd': 509 ptr = s; 510 while (*s == ' ' || *s == '\t') 511 s++; 512 if (s != ptr || s == x_start) { 513 514 /* match, be happy */ 515 matched = 1; 516 cs = MNEXT(cs); 517 } else if (*s == '\n' || *s == '\0') { 518 519 /* match, be happy */ 520 matched = 1; 521 cs = MNEXT(cs); 522 } else if (*cs & ALT) { 523 524 /* try the next part */ 525 matched = 0; 526 cs = MNEXT(cs); 527 } else if (*cs & OPT) { 528 529 /* doesn't matter */ 530 matched = 1; 531 cs = MNEXT(cs); 532 } else 533 534 /* no match, error return */ 535 return (NIL); 536 break; 537 538 /* check for end of line */ 539 case '$': 540 if (*s == '\0' || *s == '\n') { 541 542 /* match, be happy */ 543 s++; 544 matched = 1; 545 cs = MNEXT(cs); 546 } else if (*cs & ALT) { 547 548 /* try the next part */ 549 matched = 0; 550 cs = MNEXT(cs); 551 } else if (*cs & OPT) { 552 553 /* doesn't matter */ 554 matched = 1; 555 cs = MNEXT(cs); 556 } else 557 558 /* no match, error return */ 559 return (NIL); 560 break; 561 562 /* check for start of line */ 563 case '^': 564 if (s == x_start) { 565 566 /* match, be happy */ 567 matched = 1; 568 cs = MNEXT(cs); 569 } else if (*cs & ALT) { 570 571 /* try the next part */ 572 matched = 0; 573 cs = MNEXT(cs); 574 } else if (*cs & OPT) { 575 576 /* doesn't matter */ 577 matched = 1; 578 cs = MNEXT(cs); 579 } else 580 581 /* no match, error return */ 582 return (NIL); 583 break; 584 585 /* end of a subexpression, return success */ 586 case ')': 587 return (s); 588 } 589 break; 590 } 591 } 592 return (s); 593 } 594