1 /* $NetBSD: ure.c,v 1.1.1.6 2018/02/06 01:53:07 christos Exp $ */ 2 3 /* $OpenLDAP$ */ 4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2017 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17 /* Copyright 1997, 1998, 1999 Computing Research Labs, 18 * New Mexico State University 19 * 20 * Permission is hereby granted, free of charge, to any person obtaining a 21 * copy of this software and associated documentation files (the "Software"), 22 * to deal in the Software without restriction, including without limitation 23 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 24 * and/or sell copies of the Software, and to permit persons to whom the 25 * Software is furnished to do so, subject to the following conditions: 26 * 27 * The above copyright notice and this permission notice shall be included in 28 * all copies or substantial portions of the Software. 29 * 30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 33 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 34 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 35 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 36 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 37 */ 38 /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */ 39 40 #include <sys/cdefs.h> 41 __RCSID("$NetBSD: ure.c,v 1.1.1.6 2018/02/06 01:53:07 christos Exp $"); 42 43 #include "portable.h" 44 45 #include <ac/stdlib.h> 46 #include <ac/string.h> 47 #include <ac/unistd.h> 48 49 #include "ure.h" 50 51 /* 52 * Flags used internally in the DFA. 53 */ 54 #define _URE_DFA_CASEFOLD 0x01 55 #define _URE_DFA_BLANKLINE 0x02 56 57 static unsigned long cclass_flags[] = { 58 0, 59 _URE_NONSPACING, 60 _URE_COMBINING, 61 _URE_NUMDIGIT, 62 _URE_NUMOTHER, 63 _URE_SPACESEP, 64 _URE_LINESEP, 65 _URE_PARASEP, 66 _URE_CNTRL, 67 _URE_PUA, 68 _URE_UPPER, 69 _URE_LOWER, 70 _URE_TITLE, 71 _URE_MODIFIER, 72 _URE_OTHERLETTER, 73 _URE_DASHPUNCT, 74 _URE_OPENPUNCT, 75 _URE_CLOSEPUNCT, 76 _URE_OTHERPUNCT, 77 _URE_MATHSYM, 78 _URE_CURRENCYSYM, 79 _URE_OTHERSYM, 80 _URE_LTR, 81 _URE_RTL, 82 _URE_EURONUM, 83 _URE_EURONUMSEP, 84 _URE_EURONUMTERM, 85 _URE_ARABNUM, 86 _URE_COMMONSEP, 87 _URE_BLOCKSEP, 88 _URE_SEGMENTSEP, 89 _URE_WHITESPACE, 90 _URE_OTHERNEUT, 91 }; 92 93 /* 94 * Symbol types for the DFA. 95 */ 96 #define _URE_ANY_CHAR 1 97 #define _URE_CHAR 2 98 #define _URE_CCLASS 3 99 #define _URE_NCCLASS 4 100 #define _URE_BOL_ANCHOR 5 101 #define _URE_EOL_ANCHOR 6 102 103 /* 104 * Op codes for converting the NFA to a DFA. 105 */ 106 #define _URE_SYMBOL 10 107 #define _URE_PAREN 11 108 #define _URE_QUEST 12 109 #define _URE_STAR 13 110 #define _URE_PLUS 14 111 #define _URE_ONE 15 112 #define _URE_AND 16 113 #define _URE_OR 17 114 115 #define _URE_NOOP 0xffff 116 117 #define _URE_REGSTART 0x8000 118 #define _URE_REGEND 0x4000 119 120 /* 121 * Structure used to handle a compacted range of characters. 122 */ 123 typedef struct { 124 ucs4_t min_code; 125 ucs4_t max_code; 126 } _ure_range_t; 127 128 typedef struct { 129 _ure_range_t *ranges; 130 ucs2_t ranges_used; 131 ucs2_t ranges_size; 132 } _ure_ccl_t; 133 134 typedef union { 135 ucs4_t chr; 136 _ure_ccl_t ccl; 137 } _ure_sym_t; 138 139 /* 140 * This is a general element structure used for expressions and stack 141 * elements. 142 */ 143 typedef struct { 144 ucs2_t reg; 145 ucs2_t onstack; 146 ucs2_t type; 147 ucs2_t lhs; 148 ucs2_t rhs; 149 } _ure_elt_t; 150 151 /* 152 * This is a structure used to track a list or a stack of states. 153 */ 154 typedef struct { 155 ucs2_t *slist; 156 ucs2_t slist_size; 157 ucs2_t slist_used; 158 } _ure_stlist_t; 159 160 /* 161 * Structure to track the list of unique states for a symbol 162 * during reduction. 163 */ 164 typedef struct { 165 ucs2_t id; 166 ucs2_t type; 167 unsigned long mods; 168 unsigned long props; 169 _ure_sym_t sym; 170 _ure_stlist_t states; 171 } _ure_symtab_t; 172 173 /* 174 * Structure to hold a single state. 175 */ 176 typedef struct { 177 ucs2_t id; 178 ucs2_t accepting; 179 ucs2_t pad; 180 _ure_stlist_t st; 181 _ure_elt_t *trans; 182 ucs2_t trans_size; 183 ucs2_t trans_used; 184 } _ure_state_t; 185 186 /* 187 * Structure used for keeping lists of states. 188 */ 189 typedef struct { 190 _ure_state_t *states; 191 ucs2_t states_size; 192 ucs2_t states_used; 193 } _ure_statetable_t; 194 195 /* 196 * Structure to track pairs of DFA states when equivalent states are 197 * merged. 198 */ 199 typedef struct { 200 ucs2_t l; 201 ucs2_t r; 202 } _ure_equiv_t; 203 204 /* 205 * Structure used for constructing the NFA and reducing to a minimal DFA. 206 */ 207 typedef struct _ure_buffer_t { 208 int reducing; 209 int error; 210 unsigned long flags; 211 212 _ure_stlist_t stack; 213 214 /* 215 * Table of unique symbols encountered. 216 */ 217 _ure_symtab_t *symtab; 218 ucs2_t symtab_size; 219 ucs2_t symtab_used; 220 221 /* 222 * Tracks the unique expressions generated for the NFA and when the NFA is 223 * reduced. 224 */ 225 _ure_elt_t *expr; 226 ucs2_t expr_used; 227 ucs2_t expr_size; 228 229 /* 230 * The reduced table of unique groups of NFA states. 231 */ 232 _ure_statetable_t states; 233 234 /* 235 * Tracks states when equivalent states are merged. 236 */ 237 _ure_equiv_t *equiv; 238 ucs2_t equiv_used; 239 ucs2_t equiv_size; 240 } _ure_buffer_t; 241 242 typedef struct { 243 ucs2_t symbol; 244 ucs2_t next_state; 245 } _ure_trans_t; 246 247 typedef struct { 248 ucs2_t accepting; 249 ucs2_t ntrans; 250 _ure_trans_t *trans; 251 } _ure_dstate_t; 252 253 typedef struct _ure_dfa_t { 254 unsigned long flags; 255 256 _ure_symtab_t *syms; 257 ucs2_t nsyms; 258 259 _ure_dstate_t *states; 260 ucs2_t nstates; 261 262 _ure_trans_t *trans; 263 ucs2_t ntrans; 264 } _ure_dfa_t; 265 266 /************************************************************************* 267 * 268 * Functions. 269 * 270 *************************************************************************/ 271 272 static void 273 _ure_memmove(char *dest, char *src, unsigned long bytes) 274 { 275 long i, j; 276 277 i = (long) bytes; 278 j = i & 7; 279 i = (i + 7) >> 3; 280 281 /* 282 * Do a memmove using Ye Olde Duff's Device for efficiency. 283 */ 284 if (src < dest) { 285 src += bytes; 286 dest += bytes; 287 288 switch (j) { 289 case 0: do { 290 *--dest = *--src; 291 case 7: *--dest = *--src; 292 case 6: *--dest = *--src; 293 case 5: *--dest = *--src; 294 case 4: *--dest = *--src; 295 case 3: *--dest = *--src; 296 case 2: *--dest = *--src; 297 case 1: *--dest = *--src; 298 } while (--i > 0); 299 } 300 } else if (src > dest) { 301 switch (j) { 302 case 0: do { 303 *dest++ = *src++; 304 case 7: *dest++ = *src++; 305 case 6: *dest++ = *src++; 306 case 5: *dest++ = *src++; 307 case 4: *dest++ = *src++; 308 case 3: *dest++ = *src++; 309 case 2: *dest++ = *src++; 310 case 1: *dest++ = *src++; 311 } while (--i > 0); 312 } 313 } 314 } 315 316 static void 317 _ure_push(ucs2_t v, _ure_buffer_t *b) 318 { 319 _ure_stlist_t *s; 320 321 if (b == 0) 322 return; 323 324 /* 325 * If the `reducing' parameter is non-zero, check to see if the value 326 * passed is already on the stack. 327 */ 328 if (b->reducing != 0 && b->expr[v].onstack != 0) 329 return; 330 331 s = &b->stack; 332 if (s->slist_used == s->slist_size) { 333 if (s->slist_size == 0) 334 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 335 else 336 s->slist = (ucs2_t *) realloc((char *) s->slist, 337 sizeof(ucs2_t) * (s->slist_size + 8)); 338 s->slist_size += 8; 339 } 340 s->slist[s->slist_used++] = v; 341 342 /* 343 * If the `reducing' parameter is non-zero, flag the element as being on 344 * the stack. 345 */ 346 if (b->reducing != 0) 347 b->expr[v].onstack = 1; 348 } 349 350 static ucs2_t 351 _ure_peek(_ure_buffer_t *b) 352 { 353 if (b == 0 || b->stack.slist_used == 0) 354 return _URE_NOOP; 355 356 return b->stack.slist[b->stack.slist_used - 1]; 357 } 358 359 static ucs2_t 360 _ure_pop(_ure_buffer_t *b) 361 { 362 ucs2_t v; 363 364 if (b == 0 || b->stack.slist_used == 0) 365 return _URE_NOOP; 366 367 v = b->stack.slist[--b->stack.slist_used]; 368 if (b->reducing) 369 b->expr[v].onstack = 0; 370 371 return v; 372 } 373 374 /************************************************************************* 375 * 376 * Start symbol parse functions. 377 * 378 *************************************************************************/ 379 380 /* 381 * Parse a comma-separated list of integers that represent character 382 * properties. Combine them into a mask that is returned in the `mask' 383 * variable, and return the number of characters consumed. 384 */ 385 static unsigned long 386 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask, 387 _ure_buffer_t *b) 388 { 389 unsigned long n, m; 390 ucs2_t *sp, *ep; 391 392 sp = pp; 393 ep = sp + limit; 394 395 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) { 396 if (*sp == ',') { 397 /* 398 * Encountered a comma, so select the next character property flag 399 * and reset the number. 400 */ 401 m |= cclass_flags[n]; 402 n = 0; 403 } else if (*sp >= '0' && *sp <= '9') 404 /* 405 * Encountered a digit, so start or continue building the cardinal 406 * that represents the character property flag. 407 */ 408 n = (n * 10) + (*sp - '0'); 409 else 410 /* 411 * Encountered something that is not part of the property list. 412 * Indicate that we are done. 413 */ 414 break; 415 416 /* 417 * If a property number greater than 32 occurs, then there is a 418 * problem. Most likely a missing comma separator. 419 */ 420 if (n > 32) 421 b->error = _URE_INVALID_PROPERTY; 422 } 423 424 if (b->error == _URE_OK && n != 0) 425 m |= cclass_flags[n]; 426 427 /* 428 * Set the mask that represents the group of character properties. 429 */ 430 *mask = m; 431 432 /* 433 * Return the number of characters consumed. 434 */ 435 return sp - pp; 436 } 437 438 /* 439 * Collect a hex number with 1 to 4 digits and return the number 440 * of characters used. 441 */ 442 static unsigned long 443 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n) 444 { 445 ucs2_t i; 446 ucs2_t *sp, *ep; 447 ucs4_t nn; 448 449 sp = np; 450 ep = sp + limit; 451 452 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) { 453 if (*sp >= '0' && *sp <= '9') 454 nn = (nn << 4) + (*sp - '0'); 455 else if (*sp >= 'A' && *sp <= 'F') 456 nn = (nn << 4) + ((*sp - 'A') + 10); 457 else if (*sp >= 'a' && *sp <= 'f') 458 nn = (nn << 4) + ((*sp - 'a') + 10); 459 else 460 /* 461 * Encountered something that is not a hex digit. 462 */ 463 break; 464 } 465 466 /* 467 * Assign the character code collected and return the number of 468 * characters used. 469 */ 470 *n = nn; 471 472 return sp - np; 473 } 474 475 /* 476 * Insert a range into a character class, removing duplicates and ordering 477 * them in increasing range-start order. 478 */ 479 static void 480 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b) 481 { 482 ucs2_t i; 483 ucs4_t tmp; 484 _ure_range_t *rp; 485 486 /* 487 * If the `casefold' flag is set, then make sure both endpoints of the 488 * range are converted to lower case. 489 */ 490 if (b->flags & _URE_DFA_CASEFOLD) { 491 r->min_code = _ure_tolower(r->min_code); 492 r->max_code = _ure_tolower(r->max_code); 493 } 494 495 /* 496 * Swap the range endpoints if they are not in increasing order. 497 */ 498 if (r->min_code > r->max_code) { 499 tmp = r->min_code; 500 r->min_code = r->max_code; 501 r->max_code = tmp; 502 } 503 504 for (i = 0, rp = ccl->ranges; 505 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ; 506 507 /* 508 * Check for a duplicate. 509 */ 510 if (i < ccl->ranges_used && 511 r->min_code == rp->min_code && r->max_code == rp->max_code) 512 return; 513 514 if (ccl->ranges_used == ccl->ranges_size) { 515 if (ccl->ranges_size == 0) 516 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3); 517 else 518 ccl->ranges = (_ure_range_t *) 519 realloc((char *) ccl->ranges, 520 sizeof(_ure_range_t) * (ccl->ranges_size + 8)); 521 ccl->ranges_size += 8; 522 } 523 524 rp = ccl->ranges + ccl->ranges_used; 525 526 if (i < ccl->ranges_used) 527 _ure_memmove((char *) (rp + 1), (char *) rp, 528 sizeof(_ure_range_t) * (ccl->ranges_used - i)); 529 530 ccl->ranges_used++; 531 rp->min_code = r->min_code; 532 rp->max_code = r->max_code; 533 } 534 535 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\ 536 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING) 537 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT) 538 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\ 539 _URE_OTHERPUNCT) 540 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\ 541 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM) 542 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP) 543 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP) 544 545 typedef void (*_ure_cclsetup_t)( 546 _ure_symtab_t *sym, 547 unsigned long mask, 548 _ure_buffer_t *b 549 ); 550 551 typedef struct { 552 ucs2_t key; 553 unsigned long len; 554 unsigned long next; 555 _ure_cclsetup_t func; 556 unsigned long mask; 557 } _ure_trie_t; 558 559 static void 560 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 561 { 562 sym->props |= mask; 563 } 564 565 static void 566 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 567 { 568 _ure_range_t range; 569 570 sym->props |= mask; 571 572 /* 573 * Add the additional characters needed for handling isspace(). 574 */ 575 range.min_code = range.max_code = '\t'; 576 _ure_add_range(&sym->sym.ccl, &range, b); 577 range.min_code = range.max_code = '\r'; 578 _ure_add_range(&sym->sym.ccl, &range, b); 579 range.min_code = range.max_code = '\n'; 580 _ure_add_range(&sym->sym.ccl, &range, b); 581 range.min_code = range.max_code = '\f'; 582 _ure_add_range(&sym->sym.ccl, &range, b); 583 range.min_code = range.max_code = 0xfeff; 584 _ure_add_range(&sym->sym.ccl, &range, b); 585 } 586 587 static void 588 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 589 { 590 _ure_range_t range; 591 592 /* 593 * Add the additional characters needed for handling isxdigit(). 594 */ 595 range.min_code = '0'; 596 range.max_code = '9'; 597 _ure_add_range(&sym->sym.ccl, &range, b); 598 range.min_code = 'A'; 599 range.max_code = 'F'; 600 _ure_add_range(&sym->sym.ccl, &range, b); 601 range.min_code = 'a'; 602 range.max_code = 'f'; 603 _ure_add_range(&sym->sym.ccl, &range, b); 604 } 605 606 static _ure_trie_t cclass_trie[] = { 607 {0x003a, 1, 1, 0, 0}, 608 {0x0061, 9, 10, 0, 0}, 609 {0x0063, 8, 19, 0, 0}, 610 {0x0064, 7, 24, 0, 0}, 611 {0x0067, 6, 29, 0, 0}, 612 {0x006c, 5, 34, 0, 0}, 613 {0x0070, 4, 39, 0, 0}, 614 {0x0073, 3, 49, 0, 0}, 615 {0x0075, 2, 54, 0, 0}, 616 {0x0078, 1, 59, 0, 0}, 617 {0x006c, 1, 11, 0, 0}, 618 {0x006e, 2, 13, 0, 0}, 619 {0x0070, 1, 16, 0, 0}, 620 {0x0075, 1, 14, 0, 0}, 621 {0x006d, 1, 15, 0, 0}, 622 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, 623 {0x0068, 1, 17, 0, 0}, 624 {0x0061, 1, 18, 0, 0}, 625 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, 626 {0x006e, 1, 20, 0, 0}, 627 {0x0074, 1, 21, 0, 0}, 628 {0x0072, 1, 22, 0, 0}, 629 {0x006c, 1, 23, 0, 0}, 630 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, 631 {0x0069, 1, 25, 0, 0}, 632 {0x0067, 1, 26, 0, 0}, 633 {0x0069, 1, 27, 0, 0}, 634 {0x0074, 1, 28, 0, 0}, 635 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, 636 {0x0072, 1, 30, 0, 0}, 637 {0x0061, 1, 31, 0, 0}, 638 {0x0070, 1, 32, 0, 0}, 639 {0x0068, 1, 33, 0, 0}, 640 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, 641 {0x006f, 1, 35, 0, 0}, 642 {0x0077, 1, 36, 0, 0}, 643 {0x0065, 1, 37, 0, 0}, 644 {0x0072, 1, 38, 0, 0}, 645 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, 646 {0x0072, 2, 41, 0, 0}, 647 {0x0075, 1, 45, 0, 0}, 648 {0x0069, 1, 42, 0, 0}, 649 {0x006e, 1, 43, 0, 0}, 650 {0x0074, 1, 44, 0, 0}, 651 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, 652 {0x006e, 1, 46, 0, 0}, 653 {0x0063, 1, 47, 0, 0}, 654 {0x0074, 1, 48, 0, 0}, 655 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, 656 {0x0070, 1, 50, 0, 0}, 657 {0x0061, 1, 51, 0, 0}, 658 {0x0063, 1, 52, 0, 0}, 659 {0x0065, 1, 53, 0, 0}, 660 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, 661 {0x0070, 1, 55, 0, 0}, 662 {0x0070, 1, 56, 0, 0}, 663 {0x0065, 1, 57, 0, 0}, 664 {0x0072, 1, 58, 0, 0}, 665 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, 666 {0x0064, 1, 60, 0, 0}, 667 {0x0069, 1, 61, 0, 0}, 668 {0x0067, 1, 62, 0, 0}, 669 {0x0069, 1, 63, 0, 0}, 670 {0x0074, 1, 64, 0, 0}, 671 {0x003a, 1, 65, _ure_xdigit_setup, 0}, 672 }; 673 674 /* 675 * Probe for one of the POSIX colon delimited character classes in the static 676 * trie. 677 */ 678 static unsigned long 679 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, 680 _ure_buffer_t *b) 681 { 682 int i; 683 unsigned long n; 684 _ure_trie_t *tp; 685 ucs2_t *sp, *ep; 686 687 /* 688 * If the number of characters left is less than 7, then this cannot be 689 * interpreted as one of the colon delimited classes. 690 */ 691 if (limit < 7) 692 return 0; 693 694 sp = cp; 695 ep = sp + limit; 696 tp = cclass_trie; 697 for (i = 0; sp < ep && i < 8; i++, sp++) { 698 n = tp->len; 699 700 for (; n > 0 && tp->key != *sp; tp++, n--) ; 701 702 if (n == 0) 703 return 0; 704 705 if (*sp == ':' && (i == 6 || i == 7)) { 706 sp++; 707 break; 708 } 709 if (sp + 1 < ep) 710 tp = cclass_trie + tp->next; 711 } 712 if (tp->func == 0) 713 return 0; 714 715 (*tp->func)(sym, tp->mask, b); 716 717 return sp - cp; 718 } 719 720 /* 721 * Construct a list of ranges and return the number of characters consumed. 722 */ 723 static unsigned long 724 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, 725 _ure_buffer_t *b) 726 { 727 int range_end; 728 unsigned long n; 729 ucs2_t *sp, *ep; 730 ucs4_t c, last; 731 _ure_ccl_t *cclp; 732 _ure_range_t range; 733 734 sp = cp; 735 ep = sp + limit; 736 737 if (*sp == '^') { 738 symp->type = _URE_NCCLASS; 739 sp++; 740 } else 741 symp->type = _URE_CCLASS; 742 743 for (last = 0, range_end = 0; 744 b->error == _URE_OK && sp < ep && *sp != ']'; ) { 745 c = *sp++; 746 if (c == '\\') { 747 if (sp == ep) { 748 /* 749 * The EOS was encountered when expecting the reverse solidus 750 * to be followed by the character it is escaping. Set an 751 * error code and return the number of characters consumed up 752 * to this point. 753 */ 754 b->error = _URE_UNEXPECTED_EOS; 755 return sp - cp; 756 } 757 758 c = *sp++; 759 switch (c) { 760 case 'a': 761 c = 0x07; 762 break; 763 case 'b': 764 c = 0x08; 765 break; 766 case 'f': 767 c = 0x0c; 768 break; 769 case 'n': 770 c = 0x0a; 771 break; 772 case 'r': 773 c = 0x0d; 774 break; 775 case 't': 776 c = 0x09; 777 break; 778 case 'v': 779 c = 0x0b; 780 break; 781 case 'p': 782 case 'P': 783 sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 784 /* 785 * Invert the bit mask of the properties if this is a negated 786 * character class or if 'P' is used to specify a list of 787 * character properties that should *not* match in a 788 * character class. 789 */ 790 if (c == 'P') 791 symp->props = ~symp->props; 792 continue; 793 break; 794 case 'x': 795 case 'X': 796 case 'u': 797 case 'U': 798 if (sp < ep && 799 ((*sp >= '0' && *sp <= '9') || 800 (*sp >= 'A' && *sp <= 'F') || 801 (*sp >= 'a' && *sp <= 'f'))) 802 sp += _ure_hex(sp, ep - sp, &c); 803 } 804 } else if (c == ':') { 805 /* 806 * Probe for a POSIX colon delimited character class. 807 */ 808 sp--; 809 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) 810 sp++; 811 else { 812 sp += n; 813 continue; 814 } 815 } 816 817 cclp = &symp->sym.ccl; 818 819 /* 820 * Check to see if the current character is a low surrogate that needs 821 * to be combined with a preceding high surrogate. 822 */ 823 if (last != 0) { 824 if (c >= 0xdc00 && c <= 0xdfff) 825 /* 826 * Construct the UTF16 character code. 827 */ 828 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); 829 else { 830 /* 831 * Add the isolated high surrogate to the range. 832 */ 833 if (range_end == 1) 834 range.max_code = last & 0xffff; 835 else 836 range.min_code = range.max_code = last & 0xffff; 837 838 _ure_add_range(cclp, &range, b); 839 range_end = 0; 840 } 841 } 842 843 /* 844 * Clear the last character code. 845 */ 846 last = 0; 847 848 /* 849 * This slightly awkward code handles the different cases needed to 850 * construct a range. 851 */ 852 if (c >= 0xd800 && c <= 0xdbff) { 853 /* 854 * If the high surrogate is followed by a range indicator, simply 855 * add it as the range start. Otherwise, save it in case the next 856 * character is a low surrogate. 857 */ 858 if (*sp == '-') { 859 sp++; 860 range.min_code = c; 861 range_end = 1; 862 } else 863 last = c; 864 } else if (range_end == 1) { 865 range.max_code = c; 866 _ure_add_range(cclp, &range, b); 867 range_end = 0; 868 } else { 869 range.min_code = range.max_code = c; 870 if (*sp == '-') { 871 sp++; 872 range_end = 1; 873 } else 874 _ure_add_range(cclp, &range, b); 875 } 876 } 877 878 if (sp < ep && *sp == ']') 879 sp++; 880 else 881 /* 882 * The parse was not terminated by the character class close symbol 883 * (']'), so set an error code. 884 */ 885 b->error = _URE_CCLASS_OPEN; 886 887 return sp - cp; 888 } 889 890 /* 891 * Probe for a low surrogate hex code. 892 */ 893 static unsigned long 894 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c) 895 { 896 ucs4_t i, code; 897 ucs2_t *sp, *ep; 898 899 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { 900 if (*sp >= '0' && *sp <= '9') 901 code = (code << 4) + (*sp - '0'); 902 else if (*sp >= 'A' && *sp <= 'F') 903 code = (code << 4) + ((*sp - 'A') + 10); 904 else if (*sp >= 'a' && *sp <= 'f') 905 code = (code << 4) + ((*sp - 'a') + 10); 906 else 907 break; 908 } 909 910 *c = code; 911 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0; 912 } 913 914 static unsigned long 915 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, 916 _ure_buffer_t *b) 917 { 918 ucs4_t c; 919 ucs2_t *sp, *ep; 920 921 sp = sym; 922 ep = sym + limit; 923 924 if ((c = *sp++) == '\\') { 925 926 if (sp == ep) { 927 /* 928 * The EOS was encountered when expecting the reverse solidus to 929 * be followed by the character it is escaping. Set an error code 930 * and return the number of characters consumed up to this point. 931 */ 932 b->error = _URE_UNEXPECTED_EOS; 933 return sp - sym; 934 } 935 936 c = *sp++; 937 switch (c) { 938 case 'p': 939 case 'P': 940 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS; 941 sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 942 break; 943 case 'a': 944 symp->type = _URE_CHAR; 945 symp->sym.chr = 0x07; 946 break; 947 case 'b': 948 symp->type = _URE_CHAR; 949 symp->sym.chr = 0x08; 950 break; 951 case 'f': 952 symp->type = _URE_CHAR; 953 symp->sym.chr = 0x0c; 954 break; 955 case 'n': 956 symp->type = _URE_CHAR; 957 symp->sym.chr = 0x0a; 958 break; 959 case 'r': 960 symp->type = _URE_CHAR; 961 symp->sym.chr = 0x0d; 962 break; 963 case 't': 964 symp->type = _URE_CHAR; 965 symp->sym.chr = 0x09; 966 break; 967 case 'v': 968 symp->type = _URE_CHAR; 969 symp->sym.chr = 0x0b; 970 break; 971 case 'x': 972 case 'X': 973 case 'u': 974 case 'U': 975 /* 976 * Collect between 1 and 4 digits representing a UCS2 code. Fall 977 * through to the next case. 978 */ 979 if (sp < ep && 980 ((*sp >= '0' && *sp <= '9') || 981 (*sp >= 'A' && *sp <= 'F') || 982 (*sp >= 'a' && *sp <= 'f'))) 983 sp += _ure_hex(sp, ep - sp, &c); 984 /* FALLTHROUGH */ 985 default: 986 /* 987 * Simply add an escaped character here. 988 */ 989 symp->type = _URE_CHAR; 990 symp->sym.chr = c; 991 } 992 } else if (c == '^' || c == '$') 993 /* 994 * Handle the BOL and EOL anchors. This actually consists simply of 995 * setting a flag that indicates that the user supplied anchor match 996 * function should be called. This needs to be done instead of simply 997 * matching line/paragraph separators because beginning-of-text and 998 * end-of-text tests are needed as well. 999 */ 1000 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; 1001 else if (c == '[') 1002 /* 1003 * Construct a character class. 1004 */ 1005 sp += _ure_cclass(sp, ep - sp, symp, b); 1006 else if (c == '.') 1007 symp->type = _URE_ANY_CHAR; 1008 else { 1009 symp->type = _URE_CHAR; 1010 symp->sym.chr = c; 1011 } 1012 1013 /* 1014 * If the symbol type happens to be a character and is a high surrogate, 1015 * then probe forward to see if it is followed by a low surrogate that 1016 * needs to be added. 1017 */ 1018 if (sp < ep && symp->type == _URE_CHAR && 1019 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { 1020 1021 if (0xdc00 <= *sp && *sp <= 0xdfff) { 1022 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1023 (*sp & 0x03ff)); 1024 sp++; 1025 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || 1026 *(sp + 1) == 'u' || *(sp + 1) == 'U')) { 1027 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); 1028 if (0xdc00 <= c && c <= 0xdfff) { 1029 /* 1030 * Take into account the \[xu] in front of the hex code. 1031 */ 1032 sp += 2; 1033 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1034 (c & 0x03ff)); 1035 } 1036 } 1037 } 1038 1039 /* 1040 * Last, make sure any _URE_CHAR type symbols are changed to lower case if 1041 * the `casefold' flag is set. 1042 */ 1043 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) 1044 symp->sym.chr = _ure_tolower(symp->sym.chr); 1045 1046 /* 1047 * If the symbol constructed is anything other than one of the anchors, 1048 * make sure the _URE_DFA_BLANKLINE flag is removed. 1049 */ 1050 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) 1051 b->flags &= ~_URE_DFA_BLANKLINE; 1052 1053 /* 1054 * Return the number of characters consumed. 1055 */ 1056 return sp - sym; 1057 } 1058 1059 static int 1060 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b) 1061 { 1062 if (a->type != b->type || a->mods != b->mods || a->props != b->props) 1063 return 1; 1064 1065 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { 1066 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) 1067 return 1; 1068 if (a->sym.ccl.ranges_used > 0 && 1069 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, 1070 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) 1071 return 1; 1072 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) 1073 return 1; 1074 return 0; 1075 } 1076 1077 /* 1078 * Construct a symbol, but only keep unique symbols. 1079 */ 1080 static ucs2_t 1081 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, 1082 _ure_buffer_t *b) 1083 { 1084 ucs2_t i; 1085 _ure_symtab_t *sp, symbol; 1086 1087 /* 1088 * Build the next symbol so we can test to see if it is already in the 1089 * symbol table. 1090 */ 1091 (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t)); 1092 *consumed = _ure_compile_symbol(sym, limit, &symbol, b); 1093 1094 /* 1095 * Check to see if the symbol exists. 1096 */ 1097 for (i = 0, sp = b->symtab; 1098 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; 1099 1100 if (i < b->symtab_used) { 1101 /* 1102 * Free up any ranges used for the symbol. 1103 */ 1104 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && 1105 symbol.sym.ccl.ranges_size > 0) 1106 free((char *) symbol.sym.ccl.ranges); 1107 1108 return b->symtab[i].id; 1109 } 1110 1111 /* 1112 * Need to add the new symbol. 1113 */ 1114 if (b->symtab_used == b->symtab_size) { 1115 if (b->symtab_size == 0) 1116 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); 1117 else 1118 b->symtab = (_ure_symtab_t *) 1119 realloc((char *) b->symtab, 1120 sizeof(_ure_symtab_t) * (b->symtab_size + 8)); 1121 sp = b->symtab + b->symtab_size; 1122 (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3); 1123 b->symtab_size += 8; 1124 } 1125 1126 symbol.id = b->symtab_used++; 1127 (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol, 1128 sizeof(_ure_symtab_t)); 1129 1130 return symbol.id; 1131 } 1132 1133 /************************************************************************* 1134 * 1135 * End symbol parse functions. 1136 * 1137 *************************************************************************/ 1138 1139 static ucs2_t 1140 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b) 1141 { 1142 ucs2_t i; 1143 1144 if (b == 0) 1145 return _URE_NOOP; 1146 1147 /* 1148 * Determine if the expression already exists or not. 1149 */ 1150 for (i = 0; i < b->expr_used; i++) { 1151 if (b->expr[i].type == type && b->expr[i].lhs == lhs && 1152 b->expr[i].rhs == rhs) 1153 break; 1154 } 1155 if (i < b->expr_used) 1156 return i; 1157 1158 /* 1159 * Need to add a new expression. 1160 */ 1161 if (b->expr_used == b->expr_size) { 1162 if (b->expr_size == 0) 1163 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); 1164 else 1165 b->expr = (_ure_elt_t *) 1166 realloc((char *) b->expr, 1167 sizeof(_ure_elt_t) * (b->expr_size + 8)); 1168 b->expr_size += 8; 1169 } 1170 1171 b->expr[b->expr_used].onstack = 0; 1172 b->expr[b->expr_used].type = type; 1173 b->expr[b->expr_used].lhs = lhs; 1174 b->expr[b->expr_used].rhs = rhs; 1175 1176 return b->expr_used++; 1177 } 1178 1179 static unsigned char spmap[] = { 1180 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 1181 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1182 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1183 }; 1184 1185 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ 1186 (spmap[(cc) >> 3] & (1 << ((cc) & 7)))) 1187 1188 /* 1189 * Convert the regular expression into an NFA in a form that will be easy to 1190 * reduce to a DFA. The starting state for the reduction will be returned. 1191 */ 1192 static ucs2_t 1193 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b) 1194 { 1195 ucs2_t c, state, top, sym, *sp, *ep; 1196 unsigned long used; 1197 1198 state = _URE_NOOP; 1199 1200 sp = re; 1201 ep = sp + relen; 1202 while (b->error == _URE_OK && sp < ep) { 1203 c = *sp++; 1204 switch (c) { 1205 case '(': 1206 _ure_push(_URE_PAREN, b); 1207 break; 1208 case ')': 1209 /* 1210 * Check for the case of too many close parentheses. 1211 */ 1212 if (_ure_peek(b) == _URE_NOOP) { 1213 b->error = _URE_UNBALANCED_GROUP; 1214 break; 1215 } 1216 1217 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1218 /* 1219 * Make an expression with the AND or OR operator and its right 1220 * hand side. 1221 */ 1222 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1223 1224 /* 1225 * Remove the _URE_PAREN off the stack. 1226 */ 1227 (void) _ure_pop(b); 1228 break; 1229 case '*': 1230 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); 1231 break; 1232 case '+': 1233 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); 1234 break; 1235 case '?': 1236 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); 1237 break; 1238 case '|': 1239 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1240 /* 1241 * Make an expression with the AND or OR operator and its right 1242 * hand side. 1243 */ 1244 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1245 1246 _ure_push(state, b); 1247 _ure_push(_URE_OR, b); 1248 break; 1249 default: 1250 sp--; 1251 sym = _ure_make_symbol(sp, ep - sp, &used, b); 1252 sp += used; 1253 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); 1254 break; 1255 } 1256 1257 if (c != '(' && c != '|' && sp < ep && 1258 (!_ure_isspecial(*sp) || *sp == '(')) { 1259 _ure_push(state, b); 1260 _ure_push(_URE_AND, b); 1261 } 1262 } 1263 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1264 /* 1265 * Make an expression with the AND or OR operator and its right 1266 * hand side. 1267 */ 1268 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1269 1270 if (b->stack.slist_used > 0) 1271 b->error = _URE_UNBALANCED_GROUP; 1272 1273 return (b->error == _URE_OK) ? state : _URE_NOOP; 1274 } 1275 1276 static void 1277 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b) 1278 { 1279 ucs2_t i, *stp; 1280 _ure_symtab_t *sp; 1281 1282 /* 1283 * Locate the symbol in the symbol table so the state can be added. 1284 * If the symbol doesn't exist, then a real problem exists. 1285 */ 1286 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id; 1287 i++, sp++) ; 1288 1289 /* 1290 * Now find out if the state exists in the symbol's state list. 1291 */ 1292 for (i = 0, stp = sp->states.slist; 1293 i < sp->states.slist_used && state > *stp; i++, stp++) ; 1294 1295 if (i == sp->states.slist_used || state < *stp) { 1296 /* 1297 * Need to add the state in order. 1298 */ 1299 if (sp->states.slist_used == sp->states.slist_size) { 1300 if (sp->states.slist_size == 0) 1301 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 1302 else 1303 sp->states.slist = (ucs2_t *) 1304 realloc((char *) sp->states.slist, 1305 sizeof(ucs2_t) * (sp->states.slist_size + 8)); 1306 sp->states.slist_size += 8; 1307 } 1308 if (i < sp->states.slist_used) 1309 (void) _ure_memmove((char *) (sp->states.slist + i + 1), 1310 (char *) (sp->states.slist + i), 1311 sizeof(ucs2_t) * (sp->states.slist_used - i)); 1312 sp->states.slist[i] = state; 1313 sp->states.slist_used++; 1314 } 1315 } 1316 1317 static ucs2_t 1318 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b) 1319 { 1320 ucs2_t i; 1321 _ure_state_t *sp; 1322 1323 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) { 1324 if (sp->st.slist_used == nstates && 1325 memcmp((char *) states, (char *) sp->st.slist, 1326 sizeof(ucs2_t) * nstates) == 0) 1327 break; 1328 } 1329 1330 if (i == b->states.states_used) { 1331 /* 1332 * Need to add a new DFA state (set of NFA states). 1333 */ 1334 if (b->states.states_used == b->states.states_size) { 1335 if (b->states.states_size == 0) 1336 b->states.states = (_ure_state_t *) 1337 malloc(sizeof(_ure_state_t) << 3); 1338 else 1339 b->states.states = (_ure_state_t *) 1340 realloc((char *) b->states.states, 1341 sizeof(_ure_state_t) * (b->states.states_size + 8)); 1342 sp = b->states.states + b->states.states_size; 1343 (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3); 1344 b->states.states_size += 8; 1345 } 1346 1347 sp = b->states.states + b->states.states_used++; 1348 sp->id = i; 1349 1350 if (sp->st.slist_used + nstates > sp->st.slist_size) { 1351 if (sp->st.slist_size == 0) 1352 sp->st.slist = (ucs2_t *) 1353 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1354 else 1355 sp->st.slist = (ucs2_t *) 1356 realloc((char *) sp->st.slist, 1357 sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1358 sp->st.slist_size = sp->st.slist_used + nstates; 1359 } 1360 sp->st.slist_used = nstates; 1361 (void) AC_MEMCPY((char *) sp->st.slist, (char *) states, 1362 sizeof(ucs2_t) * nstates); 1363 } 1364 1365 /* 1366 * Return the ID of the DFA state representing a group of NFA states. 1367 */ 1368 return i; 1369 } 1370 1371 static void 1372 _ure_reduce(ucs2_t start, _ure_buffer_t *b) 1373 { 1374 ucs2_t i, j, state, eval, syms, rhs; 1375 ucs2_t s1, s2, ns1, ns2; 1376 _ure_state_t *sp; 1377 _ure_symtab_t *smp; 1378 1379 b->reducing = 1; 1380 1381 /* 1382 * Add the starting state for the reduction. 1383 */ 1384 _ure_add_state(1, &start, b); 1385 1386 /* 1387 * Process each set of NFA states that get created. 1388 */ 1389 for (i = 0; i < b->states.states_used; i++) { 1390 sp = b->states.states + i; 1391 1392 /* 1393 * Push the current states on the stack. 1394 */ 1395 for (j = 0; j < sp->st.slist_used; j++) 1396 _ure_push(sp->st.slist[j], b); 1397 1398 /* 1399 * Reduce the NFA states. 1400 */ 1401 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) { 1402 state = b->stack.slist[j]; 1403 eval = 1; 1404 1405 /* 1406 * This inner loop is the iterative equivalent of recursively 1407 * reducing subexpressions generated as a result of a reduction. 1408 */ 1409 while (eval) { 1410 switch (b->expr[state].type) { 1411 case _URE_SYMBOL: 1412 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1413 _ure_add_symstate(b->expr[state].lhs, ns1, b); 1414 syms++; 1415 eval = 0; 1416 break; 1417 case _URE_ONE: 1418 sp->accepting = 1; 1419 eval = 0; 1420 break; 1421 case _URE_QUEST: 1422 s1 = b->expr[state].lhs; 1423 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1424 state = _ure_make_expr(_URE_OR, ns1, s1, b); 1425 break; 1426 case _URE_PLUS: 1427 s1 = b->expr[state].lhs; 1428 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b); 1429 state = _ure_make_expr(_URE_AND, s1, ns1, b); 1430 break; 1431 case _URE_STAR: 1432 s1 = b->expr[state].lhs; 1433 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1434 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b); 1435 state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1436 break; 1437 case _URE_OR: 1438 s1 = b->expr[state].lhs; 1439 s2 = b->expr[state].rhs; 1440 _ure_push(s1, b); 1441 _ure_push(s2, b); 1442 eval = 0; 1443 break; 1444 case _URE_AND: 1445 s1 = b->expr[state].lhs; 1446 s2 = b->expr[state].rhs; 1447 switch (b->expr[s1].type) { 1448 case _URE_SYMBOL: 1449 _ure_add_symstate(b->expr[s1].lhs, s2, b); 1450 syms++; 1451 eval = 0; 1452 break; 1453 case _URE_ONE: 1454 state = s2; 1455 break; 1456 case _URE_QUEST: 1457 ns1 = b->expr[s1].lhs; 1458 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b); 1459 state = _ure_make_expr(_URE_OR, s2, ns2, b); 1460 break; 1461 case _URE_PLUS: 1462 ns1 = b->expr[s1].lhs; 1463 ns2 = _ure_make_expr(_URE_OR, s2, state, b); 1464 state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1465 break; 1466 case _URE_STAR: 1467 ns1 = b->expr[s1].lhs; 1468 ns2 = _ure_make_expr(_URE_AND, ns1, state, b); 1469 state = _ure_make_expr(_URE_OR, s2, ns2, b); 1470 break; 1471 case _URE_OR: 1472 ns1 = b->expr[s1].lhs; 1473 ns2 = b->expr[s1].rhs; 1474 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b); 1475 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1476 state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1477 break; 1478 case _URE_AND: 1479 ns1 = b->expr[s1].lhs; 1480 ns2 = b->expr[s1].rhs; 1481 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1482 state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1483 break; 1484 } 1485 } 1486 } 1487 } 1488 1489 /* 1490 * Clear the state stack. 1491 */ 1492 while (_ure_pop(b) != _URE_NOOP) ; 1493 1494 /* 1495 * Reset the state pointer because the reduction may have moved it 1496 * during a reallocation. 1497 */ 1498 sp = b->states.states + i; 1499 1500 /* 1501 * Generate the DFA states for the symbols collected during the 1502 * current reduction. 1503 */ 1504 if (sp->trans_used + syms > sp->trans_size) { 1505 if (sp->trans_size == 0) 1506 sp->trans = (_ure_elt_t *) 1507 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1508 else 1509 sp->trans = (_ure_elt_t *) 1510 realloc((char *) sp->trans, 1511 sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1512 sp->trans_size = sp->trans_used + syms; 1513 } 1514 1515 /* 1516 * Go through the symbol table and generate the DFA state transitions 1517 * for each symbol that has collected NFA states. 1518 */ 1519 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) { 1520 sp = b->states.states + i; 1521 1522 if (smp->states.slist_used > 0) { 1523 sp->trans[syms].lhs = smp->id; 1524 rhs = _ure_add_state(smp->states.slist_used, 1525 smp->states.slist, b); 1526 /* 1527 * Reset the state pointer in case the reallocation moves it 1528 * in memory. 1529 */ 1530 sp = b->states.states + i; 1531 sp->trans[syms].rhs = rhs; 1532 1533 smp->states.slist_used = 0; 1534 syms++; 1535 } 1536 } 1537 1538 /* 1539 * Set the number of transitions actually used. 1540 */ 1541 sp->trans_used = syms; 1542 } 1543 b->reducing = 0; 1544 } 1545 1546 static void 1547 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b) 1548 { 1549 ucs2_t tmp; 1550 1551 l = b->states.states[l].id; 1552 r = b->states.states[r].id; 1553 1554 if (l == r) 1555 return; 1556 1557 if (l > r) { 1558 tmp = l; 1559 l = r; 1560 r = tmp; 1561 } 1562 1563 /* 1564 * Check to see if the equivalence pair already exists. 1565 */ 1566 for (tmp = 0; tmp < b->equiv_used && 1567 (b->equiv[tmp].l != l || b->equiv[tmp].r != r); 1568 tmp++) ; 1569 1570 if (tmp < b->equiv_used) 1571 return; 1572 1573 if (b->equiv_used == b->equiv_size) { 1574 if (b->equiv_size == 0) 1575 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3); 1576 else 1577 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv, 1578 sizeof(_ure_equiv_t) * 1579 (b->equiv_size + 8)); 1580 b->equiv_size += 8; 1581 } 1582 b->equiv[b->equiv_used].l = l; 1583 b->equiv[b->equiv_used].r = r; 1584 b->equiv_used++; 1585 } 1586 1587 /* 1588 * Merge the DFA states that are equivalent. 1589 */ 1590 static void 1591 _ure_merge_equiv(_ure_buffer_t *b) 1592 { 1593 ucs2_t i, j, k, eq, done; 1594 _ure_state_t *sp1, *sp2, *ls, *rs; 1595 1596 for (i = 0; i < b->states.states_used; i++) { 1597 sp1 = b->states.states + i; 1598 if (sp1->id != i) 1599 continue; 1600 for (j = 0; j < i; j++) { 1601 sp2 = b->states.states + j; 1602 if (sp2->id != j) 1603 continue; 1604 b->equiv_used = 0; 1605 _ure_add_equiv(i, j, b); 1606 for (eq = 0, done = 0; eq < b->equiv_used; eq++) { 1607 ls = b->states.states + b->equiv[eq].l; 1608 rs = b->states.states + b->equiv[eq].r; 1609 if (ls->accepting != rs->accepting || 1610 ls->trans_used != rs->trans_used) { 1611 done = 1; 1612 break; 1613 } 1614 for (k = 0; k < ls->trans_used && 1615 ls->trans[k].lhs == rs->trans[k].lhs; k++) ; 1616 if (k < ls->trans_used) { 1617 done = 1; 1618 break; 1619 } 1620 1621 for (k = 0; k < ls->trans_used; k++) 1622 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b); 1623 } 1624 if (done == 0) 1625 break; 1626 } 1627 for (eq = 0; j < i && eq < b->equiv_used; eq++) 1628 b->states.states[b->equiv[eq].r].id = 1629 b->states.states[b->equiv[eq].l].id; 1630 } 1631 1632 /* 1633 * Renumber the states appropriately. 1634 */ 1635 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used; 1636 sp1++, i++) 1637 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id; 1638 } 1639 1640 /************************************************************************* 1641 * 1642 * API. 1643 * 1644 *************************************************************************/ 1645 1646 ure_buffer_t 1647 ure_buffer_create(void) 1648 { 1649 ure_buffer_t b; 1650 1651 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t)); 1652 1653 return b; 1654 } 1655 1656 void 1657 ure_buffer_free(ure_buffer_t buf) 1658 { 1659 unsigned long i; 1660 1661 if (buf == 0) 1662 return; 1663 1664 if (buf->stack.slist_size > 0) 1665 free((char *) buf->stack.slist); 1666 1667 if (buf->expr_size > 0) 1668 free((char *) buf->expr); 1669 1670 for (i = 0; i < buf->symtab_size; i++) { 1671 if (buf->symtab[i].states.slist_size > 0) 1672 free((char *) buf->symtab[i].states.slist); 1673 } 1674 1675 if (buf->symtab_size > 0) 1676 free((char *) buf->symtab); 1677 1678 for (i = 0; i < buf->states.states_size; i++) { 1679 if (buf->states.states[i].trans_size > 0) 1680 free((char *) buf->states.states[i].trans); 1681 if (buf->states.states[i].st.slist_size > 0) 1682 free((char *) buf->states.states[i].st.slist); 1683 } 1684 1685 if (buf->states.states_size > 0) 1686 free((char *) buf->states.states); 1687 1688 if (buf->equiv_size > 0) 1689 free((char *) buf->equiv); 1690 1691 free((char *) buf); 1692 } 1693 1694 ure_dfa_t 1695 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf) 1696 { 1697 ucs2_t i, j, state; 1698 _ure_state_t *sp; 1699 _ure_dstate_t *dsp; 1700 _ure_trans_t *tp; 1701 ure_dfa_t dfa; 1702 1703 if (re == 0 || *re == 0 || relen == 0 || buf == 0) 1704 return 0; 1705 1706 /* 1707 * Reset the various fields of the compilation buffer. Default the flags 1708 * to indicate the presense of the "^$" pattern. If any other pattern 1709 * occurs, then this flag will be removed. This is done to catch this 1710 * special pattern and handle it specially when matching. 1711 */ 1712 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0); 1713 buf->reducing = 0; 1714 buf->stack.slist_used = 0; 1715 buf->expr_used = 0; 1716 1717 for (i = 0; i < buf->symtab_used; i++) 1718 buf->symtab[i].states.slist_used = 0; 1719 buf->symtab_used = 0; 1720 1721 for (i = 0; i < buf->states.states_used; i++) { 1722 buf->states.states[i].st.slist_used = 0; 1723 buf->states.states[i].trans_used = 0; 1724 } 1725 buf->states.states_used = 0; 1726 1727 /* 1728 * Construct the NFA. If this stage returns a 0, then an error occured or 1729 * an empty expression was passed. 1730 */ 1731 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) 1732 return 0; 1733 1734 /* 1735 * Do the expression reduction to get the initial DFA. 1736 */ 1737 _ure_reduce(state, buf); 1738 1739 /* 1740 * Merge all the equivalent DFA states. 1741 */ 1742 _ure_merge_equiv(buf); 1743 1744 /* 1745 * Construct the minimal DFA. 1746 */ 1747 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); 1748 (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t)); 1749 1750 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); 1751 1752 /* 1753 * Free up the NFA state groups and transfer the symbols from the buffer 1754 * to the DFA. 1755 */ 1756 for (i = 0; i < buf->symtab_size; i++) { 1757 if (buf->symtab[i].states.slist_size > 0) 1758 free((char *) buf->symtab[i].states.slist); 1759 } 1760 dfa->syms = buf->symtab; 1761 dfa->nsyms = buf->symtab_used; 1762 1763 buf->symtab_used = buf->symtab_size = 0; 1764 1765 /* 1766 * Collect the total number of states and transitions needed for the DFA. 1767 */ 1768 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1769 i++, sp++) { 1770 if (sp->id == state) { 1771 dfa->nstates++; 1772 dfa->ntrans += sp->trans_used; 1773 state++; 1774 } 1775 } 1776 1777 /* 1778 * Allocate enough space for the states and transitions. 1779 */ 1780 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * 1781 dfa->nstates); 1782 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); 1783 1784 /* 1785 * Actually transfer the DFA states from the buffer. 1786 */ 1787 dsp = dfa->states; 1788 tp = dfa->trans; 1789 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1790 i++, sp++) { 1791 if (sp->id == state) { 1792 dsp->trans = tp; 1793 dsp->ntrans = sp->trans_used; 1794 dsp->accepting = sp->accepting; 1795 1796 /* 1797 * Add the transitions for the state. 1798 */ 1799 for (j = 0; j < dsp->ntrans; j++, tp++) { 1800 tp->symbol = sp->trans[j].lhs; 1801 tp->next_state = buf->states.states[sp->trans[j].rhs].id; 1802 } 1803 1804 dsp++; 1805 state++; 1806 } 1807 } 1808 1809 return dfa; 1810 } 1811 1812 void 1813 ure_dfa_free(ure_dfa_t dfa) 1814 { 1815 ucs2_t i; 1816 1817 if (dfa == 0) 1818 return; 1819 1820 for (i = 0; i < dfa->nsyms; i++) { 1821 if ((dfa->syms[i].type == _URE_CCLASS || 1822 dfa->syms[i].type == _URE_NCCLASS) && 1823 dfa->syms[i].sym.ccl.ranges_size > 0) 1824 free((char *) dfa->syms[i].sym.ccl.ranges); 1825 } 1826 if (dfa->nsyms > 0) 1827 free((char *) dfa->syms); 1828 1829 if (dfa->nstates > 0) 1830 free((char *) dfa->states); 1831 if (dfa->ntrans > 0) 1832 free((char *) dfa->trans); 1833 free((char *) dfa); 1834 } 1835 1836 void 1837 ure_write_dfa(ure_dfa_t dfa, FILE *out) 1838 { 1839 ucs2_t i, j, k, h, l; 1840 _ure_dstate_t *sp; 1841 _ure_symtab_t *sym; 1842 _ure_range_t *rp; 1843 1844 if (dfa == 0 || out == 0) 1845 return; 1846 1847 /* 1848 * Write all the different character classes. 1849 */ 1850 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { 1851 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { 1852 fprintf(out, "C%hd = ", sym->id); 1853 if (sym->sym.ccl.ranges_used > 0) { 1854 putc('[', out); 1855 if (sym->type == _URE_NCCLASS) 1856 putc('^', out); 1857 } 1858 if (sym->props != 0) { 1859 if (sym->type == _URE_NCCLASS) 1860 fprintf(out, "\\P"); 1861 else 1862 fprintf(out, "\\p"); 1863 for (k = h = 0; k < 32; k++) { 1864 if (sym->props & (1 << k)) { 1865 if (h != 0) 1866 putc(',', out); 1867 fprintf(out, "%hd", k + 1); 1868 h = 1; 1869 } 1870 } 1871 } 1872 /* 1873 * Dump the ranges. 1874 */ 1875 for (k = 0, rp = sym->sym.ccl.ranges; 1876 k < sym->sym.ccl.ranges_used; k++, rp++) { 1877 /* 1878 * Check for UTF16 characters. 1879 */ 1880 if (0x10000 <= rp->min_code && 1881 rp->min_code <= 0x10ffff) { 1882 h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800); 1883 l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00); 1884 fprintf(out, "\\x%04hX\\x%04hX", h, l); 1885 } else 1886 fprintf(out, "\\x%04lX", rp->min_code & 0xffff); 1887 if (rp->max_code != rp->min_code) { 1888 putc('-', out); 1889 if (rp->max_code >= 0x10000 && 1890 rp->max_code <= 0x10ffff) { 1891 h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800); 1892 l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00); 1893 fprintf(out, "\\x%04hX\\x%04hX", h, l); 1894 } else 1895 fprintf(out, "\\x%04lX", rp->max_code & 0xffff); 1896 } 1897 } 1898 if (sym->sym.ccl.ranges_used > 0) 1899 putc(']', out); 1900 putc('\n', out); 1901 } 1902 } 1903 1904 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { 1905 fprintf(out, "S%hd = ", i); 1906 if (sp->accepting) { 1907 fprintf(out, "1 "); 1908 if (sp->ntrans) 1909 fprintf(out, "| "); 1910 } 1911 for (j = 0; j < sp->ntrans; j++) { 1912 if (j > 0) 1913 fprintf(out, "| "); 1914 1915 sym = dfa->syms + sp->trans[j].symbol; 1916 switch (sym->type) { 1917 case _URE_CHAR: 1918 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { 1919 /* 1920 * Take care of UTF16 characters. 1921 */ 1922 h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800); 1923 l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00); 1924 fprintf(out, "\\x%04hX\\x%04hX ", h, l); 1925 } else 1926 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff); 1927 break; 1928 case _URE_ANY_CHAR: 1929 fprintf(out, "<any> "); 1930 break; 1931 case _URE_BOL_ANCHOR: 1932 fprintf(out, "<bol-anchor> "); 1933 break; 1934 case _URE_EOL_ANCHOR: 1935 fprintf(out, "<eol-anchor> "); 1936 break; 1937 case _URE_CCLASS: 1938 case _URE_NCCLASS: 1939 fprintf(out, "[C%hd] ", sym->id); 1940 break; 1941 } 1942 fprintf(out, "S%hd", sp->trans[j].next_state); 1943 if (j + 1 < sp->ntrans) 1944 putc(' ', out); 1945 } 1946 putc('\n', out); 1947 } 1948 } 1949 1950 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ 1951 (cc) == 0x2029) 1952 1953 int 1954 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, 1955 unsigned long *match_start, unsigned long *match_end) 1956 { 1957 int i, j, matched, found, skip; 1958 unsigned long ms, me; 1959 ucs4_t c; 1960 ucs2_t *sp, *ep, *lp; 1961 _ure_dstate_t *stp; 1962 _ure_symtab_t *sym; 1963 _ure_range_t *rp; 1964 1965 if (dfa == 0 || text == 0) 1966 return 0; 1967 1968 /* 1969 * Handle the special case of an empty string matching the "^$" pattern. 1970 */ 1971 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { 1972 *match_start = *match_end = 0; 1973 return 1; 1974 } 1975 1976 sp = text; 1977 ep = sp + textlen; 1978 1979 ms = me = ~0; 1980 1981 stp = dfa->states; 1982 1983 for (found = skip = 0; found == 0 && sp < ep; ) { 1984 lp = sp; 1985 c = *sp++; 1986 1987 /* 1988 * Check to see if this is a high surrogate that should be 1989 * combined with a following low surrogate. 1990 */ 1991 if (sp < ep && 0xd800 <= c && c <= 0xdbff && 1992 0xdc00 <= *sp && *sp <= 0xdfff) 1993 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); 1994 1995 /* 1996 * Determine if the character is non-spacing and should be skipped. 1997 */ 1998 if (_ure_matches_properties(_URE_NONSPACING, c) && 1999 (flags & URE_IGNORE_NONSPACING)) { 2000 sp++; 2001 continue; 2002 } 2003 2004 if (dfa->flags & _URE_DFA_CASEFOLD) 2005 c = _ure_tolower(c); 2006 2007 /* 2008 * See if one of the transitions matches. 2009 */ 2010 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { 2011 sym = dfa->syms + stp->trans[i].symbol; 2012 switch (sym->type) { 2013 case _URE_ANY_CHAR: 2014 if ((flags & URE_DOT_MATCHES_SEPARATORS) || 2015 !_ure_issep(c)) 2016 matched = 1; 2017 break; 2018 case _URE_CHAR: 2019 if (c == sym->sym.chr) 2020 matched = 1; 2021 break; 2022 case _URE_BOL_ANCHOR: 2023 if (lp == text) { 2024 sp = lp; 2025 matched = 1; 2026 } else if (_ure_issep(c)) { 2027 if (c == '\r' && sp < ep && *sp == '\n') 2028 sp++; 2029 lp = sp; 2030 matched = 1; 2031 } 2032 break; 2033 case _URE_EOL_ANCHOR: 2034 if (_ure_issep(c)) { 2035 /* 2036 * Put the pointer back before the separator so the match 2037 * end position will be correct. This case will also 2038 * cause the `sp' pointer to be advanced over the current 2039 * separator once the match end point has been recorded. 2040 */ 2041 sp = lp; 2042 matched = 1; 2043 } 2044 break; 2045 case _URE_CCLASS: 2046 case _URE_NCCLASS: 2047 if (sym->props != 0) 2048 matched = _ure_matches_properties(sym->props, c); 2049 for (j = 0, rp = sym->sym.ccl.ranges; 2050 j < sym->sym.ccl.ranges_used; j++, rp++) { 2051 if (rp->min_code <= c && c <= rp->max_code) 2052 matched = 1; 2053 } 2054 if (sym->type == _URE_NCCLASS) 2055 matched = !matched; 2056 break; 2057 } 2058 2059 if (matched) { 2060 if (ms == ~0UL) 2061 ms = lp - text; 2062 else 2063 me = sp - text; 2064 stp = dfa->states + stp->trans[i].next_state; 2065 2066 /* 2067 * If the match was an EOL anchor, adjust the pointer past the 2068 * separator that caused the match. The correct match 2069 * position has been recorded already. 2070 */ 2071 if (sym->type == _URE_EOL_ANCHOR) { 2072 /* 2073 * Skip the character that caused the match. 2074 */ 2075 sp++; 2076 2077 /* 2078 * Handle the infamous CRLF situation. 2079 */ 2080 if (sp < ep && c == '\r' && *sp == '\n') 2081 sp++; 2082 } 2083 } 2084 } 2085 2086 if (matched == 0) { 2087 if (stp->accepting == 0) { 2088 /* 2089 * If the last state was not accepting, then reset 2090 * and start over. 2091 */ 2092 stp = dfa->states; 2093 ms = me = ~0; 2094 } else 2095 /* 2096 * The last state was accepting, so terminate the matching 2097 * loop to avoid more work. 2098 */ 2099 found = 1; 2100 } else if (sp == ep) { 2101 if (!stp->accepting) { 2102 /* 2103 * This ugly hack is to make sure the end-of-line anchors 2104 * match when the source text hits the end. This is only done 2105 * if the last subexpression matches. 2106 */ 2107 for (i = 0; found == 0 && i < stp->ntrans; i++) { 2108 sym = dfa->syms + stp->trans[i].symbol; 2109 if (sym->type ==_URE_EOL_ANCHOR) { 2110 stp = dfa->states + stp->trans[i].next_state; 2111 if (stp->accepting) { 2112 me = sp - text; 2113 found = 1; 2114 } else 2115 break; 2116 } 2117 } 2118 } else { 2119 /* 2120 * Make sure any conditions that match all the way to the end 2121 * of the string match. 2122 */ 2123 found = 1; 2124 me = sp - text; 2125 } 2126 } 2127 } 2128 2129 if (found == 0) 2130 ms = me = ~0; 2131 2132 *match_start = ms; 2133 *match_end = me; 2134 2135 return (ms != ~0UL) ? 1 : 0; 2136 } 2137