1 /* 2 * Copyright (c) 1980 Regents of the University of California. 3 * All rights reserved. The Berkeley software License Agreement 4 * specifies the terms and conditions for redistribution. 5 */ 6 7 #ifndef lint 8 static char sccsid[] = "@(#)regexp.c 5.1 (Berkeley) 06/05/85"; 9 #endif not lint 10 11 #include <ctype.h> 12 13 typedef int boolean; 14 #define TRUE 1 15 #define FALSE 0 16 #define NIL 0 17 18 boolean l_onecase; /* true if upper and lower equivalent */ 19 20 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 21 22 /* STRNCMP - like strncmp except that we convert the 23 * first string to lower case before comparing 24 * if l_onecase is set. 25 */ 26 27 STRNCMP(s1, s2, len) 28 register char *s1,*s2; 29 register int len; 30 { 31 if (l_onecase) { 32 do 33 if (*s2 - makelower(*s1)) 34 return (*s2 - makelower(*s1)); 35 else { 36 s2++; 37 s1++; 38 } 39 while (--len); 40 } else { 41 do 42 if (*s2 - *s1) 43 return (*s2 - *s1); 44 else { 45 s2++; 46 s1++; 47 } 48 while (--len); 49 } 50 return(0); 51 } 52 53 /* The following routine converts an irregular expression to 54 * internal format. 55 * 56 * Either meta symbols (\a \d or \p) or character strings or 57 * operations ( alternation or perenthesizing ) can be 58 * specified. Each starts with a descriptor byte. The descriptor 59 * byte has STR set for strings, META set for meta symbols 60 * and OPER set for operations. 61 * The descriptor byte can also have the OPT bit set if the object 62 * defined is optional. Also ALT can be set to indicate an alternation. 63 * 64 * For metasymbols the byte following the descriptor byte identities 65 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 66 * strings the byte after the descriptor is a character count for 67 * the string: 68 * 69 * meta symbols := descriptor 70 * symbol 71 * 72 * strings := descriptor 73 * character count 74 * the string 75 * 76 * operatins := descriptor 77 * symbol 78 * character count 79 */ 80 81 /* 82 * handy macros for accessing parts of match blocks 83 */ 84 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 85 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 86 87 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 88 #define OCNT(A) (*(A+2)) /* character count */ 89 #define ONEXT(A) (A+3) /* next character after the operation */ 90 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 91 92 #define SCNT(A) (*(A+1)) /* byte count of a string */ 93 #define SSTR(A) (A+2) /* address of the string */ 94 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 95 96 /* 97 * bit flags in the descriptor 98 */ 99 #define OPT 1 100 #define STR 2 101 #define META 4 102 #define ALT 8 103 #define OPER 16 104 105 char *ure; /* pointer current position in unconverted exp */ 106 char *ccre; /* pointer to current position in converted exp*/ 107 char *malloc(); 108 109 char * 110 convexp(re) 111 char *re; /* unconverted irregular expression */ 112 { 113 register char *cre; /* pointer to converted regular expression */ 114 115 /* allocate room for the converted expression */ 116 if (re == NIL) 117 return (NIL); 118 if (*re == '\0') 119 return (NIL); 120 cre = malloc (4 * strlen(re) + 3); 121 ccre = cre; 122 ure = re; 123 124 /* start the conversion with a \a */ 125 *cre = META | OPT; 126 MSYM(cre) = 'a'; 127 ccre = MNEXT(cre); 128 129 /* start the conversion (its recursive) */ 130 expconv (); 131 *ccre = 0; 132 return (cre); 133 } 134 135 expconv() 136 { 137 register char *cs; /* pointer to current symbol in converted exp */ 138 register char c; /* character being processed */ 139 register char *acs; /* pinter to last alternate */ 140 register int temp; 141 142 /* let the conversion begin */ 143 acs = NIL; 144 cs = NIL; 145 while (*ure != NIL) { 146 switch (c = *ure++) { 147 148 case '\\': 149 switch (c = *ure++) { 150 151 /* escaped characters are just characters */ 152 default: 153 if (cs == NIL || (*cs & STR) == 0) { 154 cs = ccre; 155 *cs = STR; 156 SCNT(cs) = 1; 157 ccre += 2; 158 } else 159 SCNT(cs)++; 160 *ccre++ = c; 161 break; 162 163 /* normal(?) metacharacters */ 164 case 'a': 165 case 'd': 166 case 'e': 167 case 'p': 168 if (acs != NIL && acs != cs) { 169 do { 170 temp = OCNT(acs); 171 OCNT(acs) = ccre - acs; 172 acs -= temp; 173 } while (temp != 0); 174 acs = NIL; 175 } 176 cs = ccre; 177 *cs = META; 178 MSYM(cs) = c; 179 ccre = MNEXT(cs); 180 break; 181 } 182 break; 183 184 /* just put the symbol in */ 185 case '^': 186 case '$': 187 if (acs != NIL && acs != cs) { 188 do { 189 temp = OCNT(acs); 190 OCNT(acs) = ccre - acs; 191 acs -= temp; 192 } while (temp != 0); 193 acs = NIL; 194 } 195 cs = ccre; 196 *cs = META; 197 MSYM(cs) = c; 198 ccre = MNEXT(cs); 199 break; 200 201 /* mark the last match sequence as optional */ 202 case '?': 203 if (cs) 204 *cs = *cs | OPT; 205 break; 206 207 /* recurse and define a subexpression */ 208 case '(': 209 if (acs != NIL && acs != cs) { 210 do { 211 temp = OCNT(acs); 212 OCNT(acs) = ccre - acs; 213 acs -= temp; 214 } while (temp != 0); 215 acs = NIL; 216 } 217 cs = ccre; 218 *cs = OPER; 219 OSYM(cs) = '('; 220 ccre = ONEXT(cs); 221 expconv (); 222 OCNT(cs) = ccre - cs; /* offset to next symbol */ 223 break; 224 225 /* return from a recursion */ 226 case ')': 227 if (acs != NIL) { 228 do { 229 temp = OCNT(acs); 230 OCNT(acs) = ccre - acs; 231 acs -= temp; 232 } while (temp != 0); 233 acs = NIL; 234 } 235 cs = ccre; 236 *cs = META; 237 MSYM(cs) = c; 238 ccre = MNEXT(cs); 239 return; 240 241 /* mark the last match sequence as having an alternate */ 242 /* the third byte will contain an offset to jump over the */ 243 /* alternate match in case the first did not fail */ 244 case '|': 245 if (acs != NIL && acs != cs) 246 OCNT(ccre) = ccre - acs; /* make a back pointer */ 247 else 248 OCNT(ccre) = 0; 249 *cs |= ALT; 250 cs = ccre; 251 *cs = OPER; 252 OSYM(cs) = '|'; 253 ccre = ONEXT(cs); 254 acs = cs; /* remember that the pointer is to be filles */ 255 break; 256 257 /* if its not a metasymbol just build a scharacter string */ 258 default: 259 if (cs == NIL || (*cs & STR) == 0) { 260 cs = ccre; 261 *cs = STR; 262 SCNT(cs) = 1; 263 ccre = SSTR(cs); 264 } else 265 SCNT(cs)++; 266 *ccre++ = c; 267 break; 268 } 269 } 270 if (acs != NIL) { 271 do { 272 temp = OCNT(acs); 273 OCNT(acs) = ccre - acs; 274 acs -= temp; 275 } while (temp != 0); 276 acs = NIL; 277 } 278 return; 279 } 280 /* end of convertre */ 281 282 283 /* 284 * The following routine recognises an irregular expresion 285 * with the following special characters: 286 * 287 * \? - means last match was optional 288 * \a - matches any number of characters 289 * \d - matches any number of spaces and tabs 290 * \p - matches any number of alphanumeric 291 * characters. The 292 * characters matched will be copied into 293 * the area pointed to by 'name'. 294 * \| - alternation 295 * \( \) - grouping used mostly for alternation and 296 * optionality 297 * 298 * The irregular expression must be translated to internal form 299 * prior to calling this routine 300 * 301 * The value returned is the pointer to the first non \a 302 * character matched. 303 */ 304 305 boolean _escaped; /* true if we are currently _escaped */ 306 char *_start; /* start of string */ 307 308 char * 309 expmatch (s, re, mstring) 310 register char *s; /* string to check for a match in */ 311 register char *re; /* a converted irregular expression */ 312 register char *mstring; /* where to put whatever matches a \p */ 313 { 314 register char *cs; /* the current symbol */ 315 register char *ptr,*s1; /* temporary pointer */ 316 boolean matched; /* a temporary boolean */ 317 318 /* initial conditions */ 319 if (re == NIL) 320 return (NIL); 321 cs = re; 322 matched = FALSE; 323 324 /* loop till expression string is exhausted (or at least pretty tired) */ 325 while (*cs) { 326 switch (*cs & (OPER | STR | META)) { 327 328 /* try to match a string */ 329 case STR: 330 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 331 if (matched) { 332 333 /* hoorah it matches */ 334 s += SCNT(cs); 335 cs = SNEXT(cs); 336 } else if (*cs & ALT) { 337 338 /* alternation, skip to next expression */ 339 cs = SNEXT(cs); 340 } else if (*cs & OPT) { 341 342 /* the match is optional */ 343 cs = SNEXT(cs); 344 matched = 1; /* indicate a successful match */ 345 } else { 346 347 /* no match, error return */ 348 return (NIL); 349 } 350 break; 351 352 /* an operator, do something fancy */ 353 case OPER: 354 switch (OSYM(cs)) { 355 356 /* this is an alternation */ 357 case '|': 358 if (matched) 359 360 /* last thing in the alternation was a match, skip ahead */ 361 cs = OPTR(cs); 362 else 363 364 /* no match, keep trying */ 365 cs = ONEXT(cs); 366 break; 367 368 /* this is a grouping, recurse */ 369 case '(': 370 ptr = expmatch (s, ONEXT(cs), mstring); 371 if (ptr != NIL) { 372 373 /* the subexpression matched */ 374 matched = 1; 375 s = ptr; 376 } else if (*cs & ALT) { 377 378 /* alternation, skip to next expression */ 379 matched = 0; 380 } else if (*cs & OPT) { 381 382 /* the match is optional */ 383 matched = 1; /* indicate a successful match */ 384 } else { 385 386 /* no match, error return */ 387 return (NIL); 388 } 389 cs = OPTR(cs); 390 break; 391 } 392 break; 393 394 /* try to match a metasymbol */ 395 case META: 396 switch (MSYM(cs)) { 397 398 /* try to match anything and remember what was matched */ 399 case 'p': 400 /* 401 * This is really the same as trying the match the 402 * remaining parts of the expression to any subset 403 * of the string. 404 */ 405 s1 = s; 406 do { 407 ptr = expmatch (s1, MNEXT(cs), mstring); 408 if (ptr != NIL && s1 != s) { 409 410 /* we have a match, remember the match */ 411 strncpy (mstring, s, s1 - s); 412 mstring[s1 - s] = '\0'; 413 return (ptr); 414 } else if (ptr != NIL && (*cs & OPT)) { 415 416 /* it was aoptional so no match is ok */ 417 return (ptr); 418 } else if (ptr != NIL) { 419 420 /* not optional and we still matched */ 421 return (NIL); 422 } 423 if (!isalnum(*s1) && *s1 != '_') 424 return (NIL); 425 if (*s1 == '\\') 426 _escaped = _escaped ? FALSE : TRUE; 427 else 428 _escaped = FALSE; 429 } while (*s1++); 430 return (NIL); 431 432 /* try to match anything */ 433 case 'a': 434 /* 435 * This is really the same as trying the match the 436 * remaining parts of the expression to any subset 437 * of the string. 438 */ 439 s1 = s; 440 do { 441 ptr = expmatch (s1, MNEXT(cs), mstring); 442 if (ptr != NIL && s1 != s) { 443 444 /* we have a match */ 445 return (ptr); 446 } else if (ptr != NIL && (*cs & OPT)) { 447 448 /* it was aoptional so no match is ok */ 449 return (ptr); 450 } else if (ptr != NIL) { 451 452 /* not optional and we still matched */ 453 return (NIL); 454 } 455 if (*s1 == '\\') 456 _escaped = _escaped ? FALSE : TRUE; 457 else 458 _escaped = FALSE; 459 } while (*s1++); 460 return (NIL); 461 462 /* fail if we are currently _escaped */ 463 case 'e': 464 if (_escaped) 465 return(NIL); 466 cs = MNEXT(cs); 467 break; 468 469 /* match any number of tabs and spaces */ 470 case 'd': 471 ptr = s; 472 while (*s == ' ' || *s == '\t') 473 s++; 474 if (s != ptr || s == _start) { 475 476 /* match, be happy */ 477 matched = 1; 478 cs = MNEXT(cs); 479 } else if (*s == '\n' || *s == '\0') { 480 481 /* match, be happy */ 482 matched = 1; 483 cs = MNEXT(cs); 484 } else if (*cs & ALT) { 485 486 /* try the next part */ 487 matched = 0; 488 cs = MNEXT(cs); 489 } else if (*cs & OPT) { 490 491 /* doesn't matter */ 492 matched = 1; 493 cs = MNEXT(cs); 494 } else 495 496 /* no match, error return */ 497 return (NIL); 498 break; 499 500 /* check for end of line */ 501 case '$': 502 if (*s == '\0' || *s == '\n') { 503 504 /* match, be happy */ 505 s++; 506 matched = 1; 507 cs = MNEXT(cs); 508 } else if (*cs & ALT) { 509 510 /* try the next part */ 511 matched = 0; 512 cs = MNEXT(cs); 513 } else if (*cs & OPT) { 514 515 /* doesn't matter */ 516 matched = 1; 517 cs = MNEXT(cs); 518 } else 519 520 /* no match, error return */ 521 return (NIL); 522 break; 523 524 /* check for start of line */ 525 case '^': 526 if (s == _start) { 527 528 /* match, be happy */ 529 matched = 1; 530 cs = MNEXT(cs); 531 } else if (*cs & ALT) { 532 533 /* try the next part */ 534 matched = 0; 535 cs = MNEXT(cs); 536 } else if (*cs & OPT) { 537 538 /* doesn't matter */ 539 matched = 1; 540 cs = MNEXT(cs); 541 } else 542 543 /* no match, error return */ 544 return (NIL); 545 break; 546 547 /* end of a subexpression, return success */ 548 case ')': 549 return (s); 550 } 551 break; 552 } 553 } 554 return (s); 555 } 556