1 /* $NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #if HAVE_NBTOOL_CONFIG_H 34 #include "nbtool_config.h" 35 #endif 36 37 #include <sys/cdefs.h> 38 #ifndef lint 39 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\ 40 The Regents of the University of California. All rights reserved."); 41 #endif /* not lint */ 42 43 #ifndef lint 44 #if 0 45 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 46 #endif 47 __RCSID("$NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $"); 48 #endif /* not lint */ 49 50 #include <assert.h> 51 #include <ctype.h> 52 #include <stdlib.h> 53 #include <stdbool.h> 54 #include <string.h> 55 #include "extern.h" 56 57 static void expconv (void); 58 59 bool x_escaped; /* true if we are currently x_escaped */ 60 char *x_start; /* start of string */ 61 bool l_onecase; /* true if upper and lower equivalent */ 62 63 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c)) 64 65 static char 66 tocc(ptrdiff_t x) { 67 if (x & ~0xff) 68 abort(); 69 return (char)x; 70 } 71 72 /* STRNCMP - like strncmp except that we convert the 73 * first string to lower case before comparing 74 * if l_onecase is set. 75 */ 76 77 int 78 STRNCMP(char *s1, char *s2, int len) 79 { 80 if (l_onecase) { 81 do 82 if (*s2 - makelower(*s1)) 83 return *s2 - makelower(*s1); 84 else { 85 s2++; 86 s1++; 87 } 88 while (--len); 89 } else { 90 do 91 if (*s2 - *s1) 92 return *s2 - *s1; 93 else { 94 s2++; 95 s1++; 96 } 97 while (--len); 98 } 99 return(0); 100 } 101 102 /* The following routine converts an irregular expression to 103 * internal format. 104 * 105 * Either meta symbols (\a \d or \p) or character strings or 106 * operations ( alternation or perenthesizing ) can be 107 * specified. Each starts with a descriptor byte. The descriptor 108 * byte has STR set for strings, META set for meta symbols 109 * and OPER set for operations. 110 * The descriptor byte can also have the OPT bit set if the object 111 * defined is optional. Also ALT can be set to indicate an alternation. 112 * 113 * For metasymbols the byte following the descriptor byte identities 114 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 115 * strings the byte after the descriptor is a character count for 116 * the string: 117 * 118 * meta symbols := descriptor 119 * symbol 120 * 121 * strings := descriptor 122 * character count 123 * the string 124 * 125 * operations := descriptor 126 * symbol 127 * character count 128 */ 129 130 /* 131 * handy macros for accessing parts of match blocks 132 */ 133 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 134 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 135 136 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 137 #define OCNT(A) (*(A+2)) /* character count */ 138 #define ONEXT(A) (A+3) /* next character after the operation */ 139 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 140 141 #define SCNT(A) (*(A+1)) /* byte count of a string */ 142 #define SSTR(A) (A+2) /* address of the string */ 143 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 144 145 /* 146 * bit flags in the descriptor 147 */ 148 #define OPT 1 149 #define STR 2 150 #define META 4 151 #define ALT 8 152 #define OPER 16 153 154 static char *ccre; /* pointer to current position in converted exp*/ 155 static char *ure; /* pointer current position in unconverted exp */ 156 157 char * 158 convexp(char *re) /* unconverted irregular expression */ 159 { 160 char *cre; /* pointer to converted regular expression */ 161 162 /* allocate room for the converted expression */ 163 if (re == NULL) 164 return NULL; 165 if (*re == '\0') 166 return NULL; 167 cre = malloc(4 * strlen(re) + 3); 168 ccre = cre; 169 ure = re; 170 171 /* start the conversion with a \a */ 172 *cre = META | OPT; 173 MSYM(cre) = 'a'; 174 ccre = MNEXT(cre); 175 176 /* start the conversion (its recursive) */ 177 expconv(); 178 *ccre = 0; 179 return cre; 180 } 181 182 static void 183 expconv(void) 184 { 185 char *cs; /* pointer to current symbol in converted exp */ 186 char c; /* character being processed */ 187 char *acs; /* pinter to last alternate */ 188 int temp; 189 190 /* let the conversion begin */ 191 acs = NULL; 192 cs = NULL; 193 while (*ure) { 194 switch (c = *ure++) { 195 196 case '\\': 197 switch (c = *ure++) { 198 199 /* escaped characters are just characters */ 200 default: 201 if (cs == NULL || (*cs & STR) == 0) { 202 cs = ccre; 203 *cs = STR; 204 SCNT(cs) = 1; 205 ccre += 2; 206 } else 207 SCNT(cs)++; 208 *ccre++ = c; 209 break; 210 211 /* normal(?) metacharacters */ 212 case 'a': 213 case 'd': 214 case 'e': 215 case 'p': 216 if (acs != NULL && acs != cs) { 217 do { 218 temp = OCNT(acs); 219 OCNT(acs) = tocc(ccre - acs); 220 acs -= temp; 221 } while (temp != 0); 222 acs = NULL; 223 } 224 cs = ccre; 225 *cs = META; 226 MSYM(cs) = c; 227 ccre = MNEXT(cs); 228 break; 229 } 230 break; 231 232 /* just put the symbol in */ 233 case '^': 234 case '$': 235 if (acs != NULL && acs != cs) { 236 do { 237 temp = OCNT(acs); 238 OCNT(acs) = tocc(ccre - acs); 239 acs -= temp; 240 } while (temp != 0); 241 acs = NULL; 242 } 243 cs = ccre; 244 *cs = META; 245 MSYM(cs) = c; 246 ccre = MNEXT(cs); 247 break; 248 249 /* mark the last match sequence as optional */ 250 case '?': 251 if (cs) 252 *cs = *cs | OPT; 253 break; 254 255 /* recurse and define a subexpression */ 256 case '(': 257 if (acs != NULL && acs != cs) { 258 do { 259 temp = OCNT(acs); 260 OCNT(acs) = tocc(ccre - acs); 261 acs -= temp; 262 } while (temp != 0); 263 acs = NULL; 264 } 265 cs = ccre; 266 *cs = OPER; 267 OSYM(cs) = '('; 268 ccre = ONEXT(cs); 269 expconv(); 270 OCNT(cs) = tocc(ccre - cs); /* offset to next symbol */ 271 break; 272 273 /* return from a recursion */ 274 case ')': 275 if (acs != NULL) { 276 do { 277 temp = OCNT(acs); 278 OCNT(acs) = tocc(ccre - acs); 279 acs -= temp; 280 } while (temp != 0); 281 acs = NULL; 282 } 283 cs = ccre; 284 *cs = META; 285 MSYM(cs) = c; 286 ccre = MNEXT(cs); 287 return; 288 289 /* mark the last match sequence as having an alternate */ 290 /* the third byte will contain an offset to jump over the */ 291 /* alternate match in case the first did not fail */ 292 case '|': 293 if (acs != NULL && acs != cs) 294 OCNT(ccre) = tocc(ccre - acs); /* make a back pointer */ 295 else 296 OCNT(ccre) = 0; 297 assert(cs != NULL); 298 *cs |= ALT; 299 cs = ccre; 300 *cs = OPER; 301 OSYM(cs) = '|'; 302 ccre = ONEXT(cs); 303 acs = cs; /* remember that the pointer is to be filles */ 304 break; 305 306 /* if its not a metasymbol just build a scharacter string */ 307 default: 308 if (cs == NULL || (*cs & STR) == 0) { 309 cs = ccre; 310 *cs = STR; 311 SCNT(cs) = 1; 312 ccre = SSTR(cs); 313 } else 314 SCNT(cs)++; 315 *ccre++ = c; 316 break; 317 } 318 } 319 if (acs != NULL) { 320 do { 321 temp = OCNT(acs); 322 OCNT(acs) = tocc(ccre - acs); 323 acs -= temp; 324 } while (temp != 0); 325 acs = NULL; 326 } 327 return; 328 } 329 /* end of convertre */ 330 331 332 /* 333 * The following routine recognises an irregular expression 334 * with the following special characters: 335 * 336 * \? - means last match was optional 337 * \a - matches any number of characters 338 * \d - matches any number of spaces and tabs 339 * \p - matches any number of alphanumeric 340 * characters. The 341 * characters matched will be copied into 342 * the area pointed to by 'name'. 343 * \| - alternation 344 * \( \) - grouping used mostly for alternation and 345 * optionality 346 * 347 * The irregular expression must be translated to internal form 348 * prior to calling this routine 349 * 350 * The value returned is the pointer to the first non \a 351 * character matched. 352 */ 353 354 char * 355 expmatch( 356 char *s, /* string to check for a match in */ 357 char *re, /* a converted irregular expression */ 358 char *mstring) /* where to put whatever matches a \p */ 359 { 360 char *cs; /* the current symbol */ 361 char *ptr,*s1; /* temporary pointer */ 362 bool matched; /* a temporary bool */ 363 364 /* initial conditions */ 365 if (re == NULL) 366 return NULL; 367 cs = re; 368 matched = false; 369 370 /* loop till expression string is exhausted (or at least pretty tired) */ 371 while (*cs) { 372 switch (*cs & (OPER | STR | META)) { 373 374 /* try to match a string */ 375 case STR: 376 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 377 if (matched) { 378 379 /* hoorah it matches */ 380 s += SCNT(cs); 381 cs = SNEXT(cs); 382 } else if (*cs & ALT) { 383 384 /* alternation, skip to next expression */ 385 cs = SNEXT(cs); 386 } else if (*cs & OPT) { 387 388 /* the match is optional */ 389 cs = SNEXT(cs); 390 matched = 1; /* indicate a successful match */ 391 } else { 392 393 /* no match, error return */ 394 return NULL; 395 } 396 break; 397 398 /* an operator, do something fancy */ 399 case OPER: 400 switch (OSYM(cs)) { 401 402 /* this is an alternation */ 403 case '|': 404 if (matched) 405 406 /* last thing in the alternation was a match, skip ahead */ 407 cs = OPTR(cs); 408 else 409 410 /* no match, keep trying */ 411 cs = ONEXT(cs); 412 break; 413 414 /* this is a grouping, recurse */ 415 case '(': 416 ptr = expmatch(s, ONEXT(cs), mstring); 417 if (ptr != NULL) { 418 419 /* the subexpression matched */ 420 matched = 1; 421 s = ptr; 422 } else if (*cs & ALT) { 423 424 /* alternation, skip to next expression */ 425 matched = 0; 426 } else if (*cs & OPT) { 427 428 /* the match is optional */ 429 matched = 1; /* indicate a successful match */ 430 } else { 431 432 /* no match, error return */ 433 return NULL; 434 } 435 cs = OPTR(cs); 436 break; 437 } 438 break; 439 440 /* try to match a metasymbol */ 441 case META: 442 switch (MSYM(cs)) { 443 444 /* try to match anything and remember what was matched */ 445 case 'p': 446 /* 447 * This is really the same as trying the match the 448 * remaining parts of the expression to any subset 449 * of the string. 450 */ 451 s1 = s; 452 do { 453 ptr = expmatch(s1, MNEXT(cs), mstring); 454 if (ptr != NULL && s1 != s) { 455 456 /* we have a match, remember the match */ 457 strncpy(mstring, s, (size_t)(s1 - s)); 458 mstring[s1 - s] = '\0'; 459 return ptr; 460 } else if (ptr != NULL && (*cs & OPT)) { 461 462 /* it was aoptional so no match is ok */ 463 return ptr; 464 } else if (ptr != NULL) { 465 466 /* not optional and we still matched */ 467 return NULL; 468 } 469 if (!isalnum((unsigned char)*s1) && *s1 != '_') 470 return NULL; 471 if (*s1 == '\\') 472 x_escaped = x_escaped ? false : true; 473 else 474 x_escaped = false; 475 } while (*s1++); 476 return NULL; 477 478 /* try to match anything */ 479 case 'a': 480 /* 481 * This is really the same as trying the match the 482 * remaining parts of the expression to any subset 483 * of the string. 484 */ 485 s1 = s; 486 do { 487 ptr = expmatch(s1, MNEXT(cs), mstring); 488 if (ptr != NULL && s1 != s) { 489 490 /* we have a match */ 491 return ptr; 492 } else if (ptr != NULL && (*cs & OPT)) { 493 494 /* it was aoptional so no match is ok */ 495 return ptr; 496 } else if (ptr != NULL) { 497 498 /* not optional and we still matched */ 499 return NULL; 500 } 501 if (*s1 == '\\') 502 x_escaped = x_escaped ? false : true; 503 else 504 x_escaped = false; 505 } while (*s1++); 506 return NULL; 507 508 /* fail if we are currently x_escaped */ 509 case 'e': 510 if (x_escaped) 511 return NULL; 512 cs = MNEXT(cs); 513 break; 514 515 /* match any number of tabs and spaces */ 516 case 'd': 517 ptr = s; 518 while (*s == ' ' || *s == '\t') 519 s++; 520 if (s != ptr || s == x_start) { 521 522 /* match, be happy */ 523 matched = 1; 524 cs = MNEXT(cs); 525 } else if (*s == '\n' || *s == '\0') { 526 527 /* match, be happy */ 528 matched = 1; 529 cs = MNEXT(cs); 530 } else if (*cs & ALT) { 531 532 /* try the next part */ 533 matched = 0; 534 cs = MNEXT(cs); 535 } else if (*cs & OPT) { 536 537 /* doesn't matter */ 538 matched = 1; 539 cs = MNEXT(cs); 540 } else 541 542 /* no match, error return */ 543 return NULL; 544 break; 545 546 /* check for end of line */ 547 case '$': 548 if (*s == '\0' || *s == '\n') { 549 550 /* match, be happy */ 551 s++; 552 matched = 1; 553 cs = MNEXT(cs); 554 } else if (*cs & ALT) { 555 556 /* try the next part */ 557 matched = 0; 558 cs = MNEXT(cs); 559 } else if (*cs & OPT) { 560 561 /* doesn't matter */ 562 matched = 1; 563 cs = MNEXT(cs); 564 } else 565 566 /* no match, error return */ 567 return NULL; 568 break; 569 570 /* check for start of line */ 571 case '^': 572 if (s == x_start) { 573 574 /* match, be happy */ 575 matched = 1; 576 cs = MNEXT(cs); 577 } else if (*cs & ALT) { 578 579 /* try the next part */ 580 matched = 0; 581 cs = MNEXT(cs); 582 } else if (*cs & OPT) { 583 584 /* doesn't matter */ 585 matched = 1; 586 cs = MNEXT(cs); 587 } else 588 589 /* no match, error return */ 590 return NULL; 591 break; 592 593 /* end of a subexpression, return success */ 594 case ')': 595 return s; 596 } 597 break; 598 } 599 } 600 return s; 601 } 602