1 /* Extended regular expression matching and search library, 2 version 0.12. 3 (Implements POSIX draft P10003.2/D11.2, except for 4 internationalization features.) 5 6 Copyright (C) 1993 Free Software Foundation, Inc. 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, write to the Free Software 20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ 21 22 /* Trying to define this in the makefile would get hairy, unless we can 23 more gracefully do it for NT, OS/2, unix, etc. */ 24 #define REGEX_MALLOC 1 25 26 /* AIX requires this to be the first thing in the file. */ 27 #if defined (_AIX) && !defined (REGEX_MALLOC) 28 #pragma alloca 29 #endif 30 31 #define _GNU_SOURCE 32 33 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 34 #include <sys/types.h> 35 36 #ifdef HAVE_CONFIG_H 37 #include "config.h" 38 #endif 39 40 /* The `emacs' switch turns on certain matching commands 41 that make sense only in Emacs. */ 42 #ifdef emacs 43 44 #include "lisp.h" 45 #include "buffer.h" 46 #include "syntax.h" 47 48 /* Emacs uses `NULL' as a predicate. */ 49 #undef NULL 50 51 #else /* not emacs */ 52 53 /* We used to test for `BSTRING' here, but only GCC and Emacs define 54 `BSTRING', as far as I know, and neither of them use this code. */ 55 #if HAVE_STRING_H || STDC_HEADERS 56 #include <string.h> 57 #ifndef bcmp 58 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 59 #endif 60 #ifndef bcopy 61 #define bcopy(s, d, n) memcpy ((d), (s), (n)) 62 #endif 63 #ifndef bzero 64 #define bzero(s, n) memset ((s), 0, (n)) 65 #endif 66 #else 67 #include <strings.h> 68 #endif 69 70 #ifdef STDC_HEADERS 71 #include <stdlib.h> 72 #else 73 char *malloc (); 74 char *realloc (); 75 #endif 76 77 78 /* Define the syntax stuff for \<, \>, etc. */ 79 80 /* This must be nonzero for the wordchar and notwordchar pattern 81 commands in re_match_2. */ 82 #ifndef Sword 83 #define Sword 1 84 #endif 85 86 #ifdef SYNTAX_TABLE 87 88 extern char *re_syntax_table; 89 90 #else /* not SYNTAX_TABLE */ 91 92 /* How many characters in the character set. */ 93 #define CHAR_SET_SIZE 256 94 95 static char re_syntax_table[CHAR_SET_SIZE]; 96 97 static void 98 init_syntax_once () 99 { 100 register int c; 101 static int done = 0; 102 103 if (done) 104 return; 105 106 bzero (re_syntax_table, sizeof re_syntax_table); 107 108 for (c = 'a'; c <= 'z'; c++) 109 re_syntax_table[c] = Sword; 110 111 for (c = 'A'; c <= 'Z'; c++) 112 re_syntax_table[c] = Sword; 113 114 for (c = '0'; c <= '9'; c++) 115 re_syntax_table[c] = Sword; 116 117 re_syntax_table['_'] = Sword; 118 119 done = 1; 120 } 121 122 #endif /* not SYNTAX_TABLE */ 123 124 #define SYNTAX(c) re_syntax_table[c] 125 126 #endif /* not emacs */ 127 128 /* Get the interface, including the syntax bits. */ 129 #include "regex.h" 130 131 /* isalpha etc. are used for the character classes. */ 132 #include <ctype.h> 133 134 #ifndef isascii 135 #define isascii(c) 1 136 #endif 137 138 #ifdef isblank 139 #define ISBLANK(c) (isascii (c) && isblank (c)) 140 #else 141 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 142 #endif 143 #ifdef isgraph 144 #define ISGRAPH(c) (isascii (c) && isgraph (c)) 145 #else 146 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) 147 #endif 148 149 #define ISPRINT(c) (isascii (c) && isprint (c)) 150 #define ISDIGIT(c) (isascii (c) && isdigit (c)) 151 #define ISALNUM(c) (isascii (c) && isalnum (c)) 152 #define ISALPHA(c) (isascii (c) && isalpha (c)) 153 #define ISCNTRL(c) (isascii (c) && iscntrl (c)) 154 #define ISLOWER(c) (isascii (c) && islower (c)) 155 #define ISPUNCT(c) (isascii (c) && ispunct (c)) 156 #define ISSPACE(c) (isascii (c) && isspace (c)) 157 #define ISUPPER(c) (isascii (c) && isupper (c)) 158 #define ISXDIGIT(c) (isascii (c) && isxdigit (c)) 159 160 #ifndef NULL 161 #define NULL 0 162 #endif 163 164 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 165 since ours (we hope) works properly with all combinations of 166 machines, compilers, `char' and `unsigned char' argument types. 167 (Per Bothner suggested the basic approach.) */ 168 #undef SIGN_EXTEND_CHAR 169 #if __STDC__ 170 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 171 #else /* not __STDC__ */ 172 /* As in Harbison and Steele. */ 173 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 174 #endif 175 176 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 177 use `alloca' instead of `malloc'. This is because using malloc in 178 re_search* or re_match* could cause memory leaks when C-g is used in 179 Emacs; also, malloc is slower and causes storage fragmentation. On 180 the other hand, malloc is more portable, and easier to debug. 181 182 Because we sometimes use alloca, some routines have to be macros, 183 not functions -- `alloca'-allocated space disappears at the end of the 184 function it is called in. */ 185 186 #ifdef REGEX_MALLOC 187 188 #define REGEX_ALLOCATE malloc 189 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 190 191 #else /* not REGEX_MALLOC */ 192 193 /* Emacs already defines alloca, sometimes. */ 194 #ifndef alloca 195 196 /* Make alloca work the best possible way. */ 197 #ifdef __GNUC__ 198 #define alloca __builtin_alloca 199 #else /* not __GNUC__ */ 200 #if HAVE_ALLOCA_H 201 #include <alloca.h> 202 #else /* not __GNUC__ or HAVE_ALLOCA_H */ 203 #ifndef _AIX /* Already did AIX, up at the top. */ 204 char *alloca (); 205 #endif /* not _AIX */ 206 #endif /* not HAVE_ALLOCA_H */ 207 #endif /* not __GNUC__ */ 208 209 #endif /* not alloca */ 210 211 #define REGEX_ALLOCATE alloca 212 213 /* Assumes a `char *destination' variable. */ 214 #define REGEX_REALLOCATE(source, osize, nsize) \ 215 (destination = (char *) alloca (nsize), \ 216 bcopy (source, destination, osize), \ 217 destination) 218 219 #endif /* not REGEX_MALLOC */ 220 221 222 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 223 `string1' or just past its end. This works if PTR is NULL, which is 224 a good thing. */ 225 #define FIRST_STRING_P(ptr) \ 226 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 227 228 /* (Re)Allocate N items of type T using malloc, or fail. */ 229 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 230 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 231 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 232 233 #define BYTEWIDTH 8 /* In bits. */ 234 235 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 236 237 /* The Mac CodeWarrier9 compiler defines MAX and MIN. */ 238 #ifndef MAX 239 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 240 #endif 241 #ifndef MIN 242 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 243 #endif 244 245 typedef char boolean; 246 #define false 0 247 #define true 1 248 249 /* These are the command codes that appear in compiled regular 250 expressions. Some opcodes are followed by argument bytes. A 251 command code can specify any interpretation whatsoever for its 252 arguments. Zero bytes may appear in the compiled regular expression. 253 254 The value of `exactn' is needed in search.c (search_buffer) in Emacs. 255 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of 256 `exactn' we use here must also be 1. */ 257 258 typedef enum 259 { 260 no_op = 0, 261 262 /* Followed by one byte giving n, then by n literal bytes. */ 263 exactn = 1, 264 265 /* Matches any (more or less) character. */ 266 anychar, 267 268 /* Matches any one char belonging to specified set. First 269 following byte is number of bitmap bytes. Then come bytes 270 for a bitmap saying which chars are in. Bits in each byte 271 are ordered low-bit-first. A character is in the set if its 272 bit is 1. A character too large to have a bit in the map is 273 automatically not in the set. */ 274 charset, 275 276 /* Same parameters as charset, but match any character that is 277 not one of those specified. */ 278 charset_not, 279 280 /* Start remembering the text that is matched, for storing in a 281 register. Followed by one byte with the register number, in 282 the range 0 to one less than the pattern buffer's re_nsub 283 field. Then followed by one byte with the number of groups 284 inner to this one. (This last has to be part of the 285 start_memory only because we need it in the on_failure_jump 286 of re_match_2.) */ 287 start_memory, 288 289 /* Stop remembering the text that is matched and store it in a 290 memory register. Followed by one byte with the register 291 number, in the range 0 to one less than `re_nsub' in the 292 pattern buffer, and one byte with the number of inner groups, 293 just like `start_memory'. (We need the number of inner 294 groups here because we don't have any easy way of finding the 295 corresponding start_memory when we're at a stop_memory.) */ 296 stop_memory, 297 298 /* Match a duplicate of something remembered. Followed by one 299 byte containing the register number. */ 300 duplicate, 301 302 /* Fail unless at beginning of line. */ 303 begline, 304 305 /* Fail unless at end of line. */ 306 endline, 307 308 /* Succeeds if at beginning of buffer (if emacs) or at beginning 309 of string to be matched (if not). */ 310 begbuf, 311 312 /* Analogously, for end of buffer/string. */ 313 endbuf, 314 315 /* Followed by two byte relative address to which to jump. */ 316 jump, 317 318 /* Same as jump, but marks the end of an alternative. */ 319 jump_past_alt, 320 321 /* Followed by two-byte relative address of place to resume at 322 in case of failure. */ 323 on_failure_jump, 324 325 /* Like on_failure_jump, but pushes a placeholder instead of the 326 current string position when executed. */ 327 on_failure_keep_string_jump, 328 329 /* Throw away latest failure point and then jump to following 330 two-byte relative address. */ 331 pop_failure_jump, 332 333 /* Change to pop_failure_jump if know won't have to backtrack to 334 match; otherwise change to jump. This is used to jump 335 back to the beginning of a repeat. If what follows this jump 336 clearly won't match what the repeat does, such that we can be 337 sure that there is no use backtracking out of repetitions 338 already matched, then we change it to a pop_failure_jump. 339 Followed by two-byte address. */ 340 maybe_pop_jump, 341 342 /* Jump to following two-byte address, and push a dummy failure 343 point. This failure point will be thrown away if an attempt 344 is made to use it for a failure. A `+' construct makes this 345 before the first repeat. Also used as an intermediary kind 346 of jump when compiling an alternative. */ 347 dummy_failure_jump, 348 349 /* Push a dummy failure point and continue. Used at the end of 350 alternatives. */ 351 push_dummy_failure, 352 353 /* Followed by two-byte relative address and two-byte number n. 354 After matching N times, jump to the address upon failure. */ 355 succeed_n, 356 357 /* Followed by two-byte relative address, and two-byte number n. 358 Jump to the address N times, then fail. */ 359 jump_n, 360 361 /* Set the following two-byte relative address to the 362 subsequent two-byte number. The address *includes* the two 363 bytes of number. */ 364 set_number_at, 365 366 wordchar, /* Matches any word-constituent character. */ 367 notwordchar, /* Matches any char that is not a word-constituent. */ 368 369 wordbeg, /* Succeeds if at word beginning. */ 370 wordend, /* Succeeds if at word end. */ 371 372 wordbound, /* Succeeds if at a word boundary. */ 373 notwordbound /* Succeeds if not at a word boundary. */ 374 375 #ifdef emacs 376 ,before_dot, /* Succeeds if before point. */ 377 at_dot, /* Succeeds if at point. */ 378 after_dot, /* Succeeds if after point. */ 379 380 /* Matches any character whose syntax is specified. Followed by 381 a byte which contains a syntax code, e.g., Sword. */ 382 syntaxspec, 383 384 /* Matches any character whose syntax is not that specified. */ 385 notsyntaxspec 386 #endif /* emacs */ 387 } re_opcode_t; 388 389 /* Common operations on the compiled pattern. */ 390 391 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 392 393 #define STORE_NUMBER(destination, number) \ 394 do { \ 395 (destination)[0] = (number) & 0377; \ 396 (destination)[1] = (number) >> 8; \ 397 } while (0) 398 399 /* Same as STORE_NUMBER, except increment DESTINATION to 400 the byte after where the number is stored. Therefore, DESTINATION 401 must be an lvalue. */ 402 403 #define STORE_NUMBER_AND_INCR(destination, number) \ 404 do { \ 405 STORE_NUMBER (destination, number); \ 406 (destination) += 2; \ 407 } while (0) 408 409 /* Put into DESTINATION a number stored in two contiguous bytes starting 410 at SOURCE. */ 411 412 #define EXTRACT_NUMBER(destination, source) \ 413 do { \ 414 (destination) = *(source) & 0377; \ 415 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 416 } while (0) 417 418 #ifdef DEBUG 419 static void 420 extract_number (dest, source) 421 int *dest; 422 unsigned char *source; 423 { 424 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 425 *dest = *source & 0377; 426 *dest += temp << 8; 427 } 428 429 #ifndef EXTRACT_MACROS /* To debug the macros. */ 430 #undef EXTRACT_NUMBER 431 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 432 #endif /* not EXTRACT_MACROS */ 433 434 #endif /* DEBUG */ 435 436 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 437 SOURCE must be an lvalue. */ 438 439 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 440 do { \ 441 EXTRACT_NUMBER (destination, source); \ 442 (source) += 2; \ 443 } while (0) 444 445 #ifdef DEBUG 446 static void 447 extract_number_and_incr (destination, source) 448 int *destination; 449 unsigned char **source; 450 { 451 extract_number (destination, *source); 452 *source += 2; 453 } 454 455 #ifndef EXTRACT_MACROS 456 #undef EXTRACT_NUMBER_AND_INCR 457 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ 458 extract_number_and_incr (&dest, &src) 459 #endif /* not EXTRACT_MACROS */ 460 461 #endif /* DEBUG */ 462 463 /* If DEBUG is defined, Regex prints many voluminous messages about what 464 it is doing (if the variable `debug' is nonzero). If linked with the 465 main program in `iregex.c', you can enter patterns and strings 466 interactively. And if linked with the main program in `main.c' and 467 the other test files, you can run the already-written tests. */ 468 469 #ifdef DEBUG 470 471 /* We use standard I/O for debugging. */ 472 #include <stdio.h> 473 474 /* It is useful to test things that ``must'' be true when debugging. */ 475 #include <assert.h> 476 477 static int debug = 0; 478 479 #define DEBUG_STATEMENT(e) e 480 #define DEBUG_PRINT1(x) if (debug) printf (x) 481 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 482 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 483 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 484 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 485 if (debug) print_partial_compiled_pattern (s, e) 486 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 487 if (debug) print_double_string (w, s1, sz1, s2, sz2) 488 489 490 extern void printchar (); 491 492 /* Print the fastmap in human-readable form. */ 493 494 void 495 print_fastmap (fastmap) 496 char *fastmap; 497 { 498 unsigned was_a_range = 0; 499 unsigned i = 0; 500 501 while (i < (1 << BYTEWIDTH)) 502 { 503 if (fastmap[i++]) 504 { 505 was_a_range = 0; 506 printchar (i - 1); 507 while (i < (1 << BYTEWIDTH) && fastmap[i]) 508 { 509 was_a_range = 1; 510 i++; 511 } 512 if (was_a_range) 513 { 514 printf ("-"); 515 printchar (i - 1); 516 } 517 } 518 } 519 putchar ('\n'); 520 } 521 522 523 /* Print a compiled pattern string in human-readable form, starting at 524 the START pointer into it and ending just before the pointer END. */ 525 526 void 527 print_partial_compiled_pattern (start, end) 528 unsigned char *start; 529 unsigned char *end; 530 { 531 int mcnt, mcnt2; 532 unsigned char *p = start; 533 unsigned char *pend = end; 534 535 if (start == NULL) 536 { 537 printf ("(null)\n"); 538 return; 539 } 540 541 /* Loop over pattern commands. */ 542 while (p < pend) 543 { 544 switch ((re_opcode_t) *p++) 545 { 546 case no_op: 547 printf ("/no_op"); 548 break; 549 550 case exactn: 551 mcnt = *p++; 552 printf ("/exactn/%d", mcnt); 553 do 554 { 555 putchar ('/'); 556 printchar (*p++); 557 } 558 while (--mcnt); 559 break; 560 561 case start_memory: 562 mcnt = *p++; 563 printf ("/start_memory/%d/%d", mcnt, *p++); 564 break; 565 566 case stop_memory: 567 mcnt = *p++; 568 printf ("/stop_memory/%d/%d", mcnt, *p++); 569 break; 570 571 case duplicate: 572 printf ("/duplicate/%d", *p++); 573 break; 574 575 case anychar: 576 printf ("/anychar"); 577 break; 578 579 case charset: 580 case charset_not: 581 { 582 register int c; 583 584 printf ("/charset%s", 585 (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); 586 587 assert (p + *p < pend); 588 589 for (c = 0; c < *p; c++) 590 { 591 unsigned bit; 592 unsigned char map_byte = p[1 + c]; 593 594 putchar ('/'); 595 596 for (bit = 0; bit < BYTEWIDTH; bit++) 597 if (map_byte & (1 << bit)) 598 printchar (c * BYTEWIDTH + bit); 599 } 600 p += 1 + *p; 601 break; 602 } 603 604 case begline: 605 printf ("/begline"); 606 break; 607 608 case endline: 609 printf ("/endline"); 610 break; 611 612 case on_failure_jump: 613 extract_number_and_incr (&mcnt, &p); 614 printf ("/on_failure_jump/0/%d", mcnt); 615 break; 616 617 case on_failure_keep_string_jump: 618 extract_number_and_incr (&mcnt, &p); 619 printf ("/on_failure_keep_string_jump/0/%d", mcnt); 620 break; 621 622 case dummy_failure_jump: 623 extract_number_and_incr (&mcnt, &p); 624 printf ("/dummy_failure_jump/0/%d", mcnt); 625 break; 626 627 case push_dummy_failure: 628 printf ("/push_dummy_failure"); 629 break; 630 631 case maybe_pop_jump: 632 extract_number_and_incr (&mcnt, &p); 633 printf ("/maybe_pop_jump/0/%d", mcnt); 634 break; 635 636 case pop_failure_jump: 637 extract_number_and_incr (&mcnt, &p); 638 printf ("/pop_failure_jump/0/%d", mcnt); 639 break; 640 641 case jump_past_alt: 642 extract_number_and_incr (&mcnt, &p); 643 printf ("/jump_past_alt/0/%d", mcnt); 644 break; 645 646 case jump: 647 extract_number_and_incr (&mcnt, &p); 648 printf ("/jump/0/%d", mcnt); 649 break; 650 651 case succeed_n: 652 extract_number_and_incr (&mcnt, &p); 653 extract_number_and_incr (&mcnt2, &p); 654 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); 655 break; 656 657 case jump_n: 658 extract_number_and_incr (&mcnt, &p); 659 extract_number_and_incr (&mcnt2, &p); 660 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); 661 break; 662 663 case set_number_at: 664 extract_number_and_incr (&mcnt, &p); 665 extract_number_and_incr (&mcnt2, &p); 666 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); 667 break; 668 669 case wordbound: 670 printf ("/wordbound"); 671 break; 672 673 case notwordbound: 674 printf ("/notwordbound"); 675 break; 676 677 case wordbeg: 678 printf ("/wordbeg"); 679 break; 680 681 case wordend: 682 printf ("/wordend"); 683 684 #ifdef emacs 685 case before_dot: 686 printf ("/before_dot"); 687 break; 688 689 case at_dot: 690 printf ("/at_dot"); 691 break; 692 693 case after_dot: 694 printf ("/after_dot"); 695 break; 696 697 case syntaxspec: 698 printf ("/syntaxspec"); 699 mcnt = *p++; 700 printf ("/%d", mcnt); 701 break; 702 703 case notsyntaxspec: 704 printf ("/notsyntaxspec"); 705 mcnt = *p++; 706 printf ("/%d", mcnt); 707 break; 708 #endif /* emacs */ 709 710 case wordchar: 711 printf ("/wordchar"); 712 break; 713 714 case notwordchar: 715 printf ("/notwordchar"); 716 break; 717 718 case begbuf: 719 printf ("/begbuf"); 720 break; 721 722 case endbuf: 723 printf ("/endbuf"); 724 break; 725 726 default: 727 printf ("?%d", *(p-1)); 728 } 729 } 730 printf ("/\n"); 731 } 732 733 734 void 735 print_compiled_pattern (bufp) 736 struct re_pattern_buffer *bufp; 737 { 738 unsigned char *buffer = bufp->buffer; 739 740 print_partial_compiled_pattern (buffer, buffer + bufp->used); 741 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); 742 743 if (bufp->fastmap_accurate && bufp->fastmap) 744 { 745 printf ("fastmap: "); 746 print_fastmap (bufp->fastmap); 747 } 748 749 printf ("re_nsub: %d\t", bufp->re_nsub); 750 printf ("regs_alloc: %d\t", bufp->regs_allocated); 751 printf ("can_be_null: %d\t", bufp->can_be_null); 752 printf ("newline_anchor: %d\n", bufp->newline_anchor); 753 printf ("no_sub: %d\t", bufp->no_sub); 754 printf ("not_bol: %d\t", bufp->not_bol); 755 printf ("not_eol: %d\t", bufp->not_eol); 756 printf ("syntax: %d\n", bufp->syntax); 757 /* Perhaps we should print the translate table? */ 758 } 759 760 761 void 762 print_double_string (where, string1, size1, string2, size2) 763 const char *where; 764 const char *string1; 765 const char *string2; 766 int size1; 767 int size2; 768 { 769 unsigned this_char; 770 771 if (where == NULL) 772 printf ("(null)"); 773 else 774 { 775 if (FIRST_STRING_P (where)) 776 { 777 for (this_char = where - string1; this_char < size1; this_char++) 778 printchar (string1[this_char]); 779 780 where = string2; 781 } 782 783 for (this_char = where - string2; this_char < size2; this_char++) 784 printchar (string2[this_char]); 785 } 786 } 787 788 #else /* not DEBUG */ 789 790 #undef assert 791 #define assert(e) 792 793 #define DEBUG_STATEMENT(e) 794 #define DEBUG_PRINT1(x) 795 #define DEBUG_PRINT2(x1, x2) 796 #define DEBUG_PRINT3(x1, x2, x3) 797 #define DEBUG_PRINT4(x1, x2, x3, x4) 798 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 799 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 800 801 #endif /* not DEBUG */ 802 803 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 804 also be assigned to arbitrarily: each pattern buffer stores its own 805 syntax, so it can be changed between regex compilations. */ 806 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; 807 808 809 /* Specify the precise syntax of regexps for compilation. This provides 810 for compatibility for various utilities which historically have 811 different, incompatible syntaxes. 812 813 The argument SYNTAX is a bit mask comprised of the various bits 814 defined in regex.h. We return the old syntax. */ 815 816 reg_syntax_t 817 re_set_syntax (syntax) 818 reg_syntax_t syntax; 819 { 820 reg_syntax_t ret = re_syntax_options; 821 822 re_syntax_options = syntax; 823 return ret; 824 } 825 826 /* This table gives an error message for each of the error codes listed 827 in regex.h. Obviously the order here has to be same as there. */ 828 829 static const char *re_error_msg[] = 830 { NULL, /* REG_NOERROR */ 831 "No match", /* REG_NOMATCH */ 832 "Invalid regular expression", /* REG_BADPAT */ 833 "Invalid collation character", /* REG_ECOLLATE */ 834 "Invalid character class name", /* REG_ECTYPE */ 835 "Trailing backslash", /* REG_EESCAPE */ 836 "Invalid back reference", /* REG_ESUBREG */ 837 "Unmatched [ or [^", /* REG_EBRACK */ 838 "Unmatched ( or \\(", /* REG_EPAREN */ 839 "Unmatched \\{", /* REG_EBRACE */ 840 "Invalid content of \\{\\}", /* REG_BADBR */ 841 "Invalid range end", /* REG_ERANGE */ 842 "Memory exhausted", /* REG_ESPACE */ 843 "Invalid preceding regular expression", /* REG_BADRPT */ 844 "Premature end of regular expression", /* REG_EEND */ 845 "Regular expression too big", /* REG_ESIZE */ 846 "Unmatched ) or \\)", /* REG_ERPAREN */ 847 }; 848 849 /* Subroutine declarations and macros for regex_compile. */ 850 851 static void store_op1 (), store_op2 (); 852 static void insert_op1 (), insert_op2 (); 853 static boolean at_begline_loc_p (), at_endline_loc_p (); 854 static boolean group_in_compile_stack (); 855 static reg_errcode_t compile_range (); 856 857 /* Fetch the next character in the uncompiled pattern---translating it 858 if necessary. Also cast from a signed character in the constant 859 string passed to us by the user to an unsigned char that we can use 860 as an array index (in, e.g., `translate'). */ 861 #define PATFETCH(c) \ 862 do {if (p == pend) return REG_EEND; \ 863 c = (unsigned char) *p++; \ 864 if (translate) c = translate[c]; \ 865 } while (0) 866 867 /* Fetch the next character in the uncompiled pattern, with no 868 translation. */ 869 #define PATFETCH_RAW(c) \ 870 do {if (p == pend) return REG_EEND; \ 871 c = (unsigned char) *p++; \ 872 } while (0) 873 874 /* Go backwards one character in the pattern. */ 875 #define PATUNFETCH p-- 876 877 878 /* If `translate' is non-null, return translate[D], else just D. We 879 cast the subscript to translate because some data is declared as 880 `char *', to avoid warnings when a string constant is passed. But 881 when we use a character as a subscript we must make it unsigned. */ 882 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) 883 884 885 /* Macros for outputting the compiled pattern into `buffer'. */ 886 887 /* If the buffer isn't allocated when it comes in, use this. */ 888 #define INIT_BUF_SIZE 32 889 890 /* Make sure we have at least N more bytes of space in buffer. */ 891 #define GET_BUFFER_SPACE(n) \ 892 while (b - bufp->buffer + (n) > bufp->allocated) \ 893 EXTEND_BUFFER () 894 895 /* Make sure we have one more byte of buffer space and then add C to it. */ 896 #define BUF_PUSH(c) \ 897 do { \ 898 GET_BUFFER_SPACE (1); \ 899 *b++ = (unsigned char) (c); \ 900 } while (0) 901 902 903 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 904 #define BUF_PUSH_2(c1, c2) \ 905 do { \ 906 GET_BUFFER_SPACE (2); \ 907 *b++ = (unsigned char) (c1); \ 908 *b++ = (unsigned char) (c2); \ 909 } while (0) 910 911 912 /* As with BUF_PUSH_2, except for three bytes. */ 913 #define BUF_PUSH_3(c1, c2, c3) \ 914 do { \ 915 GET_BUFFER_SPACE (3); \ 916 *b++ = (unsigned char) (c1); \ 917 *b++ = (unsigned char) (c2); \ 918 *b++ = (unsigned char) (c3); \ 919 } while (0) 920 921 922 /* Store a jump with opcode OP at LOC to location TO. We store a 923 relative address offset by the three bytes the jump itself occupies. */ 924 #define STORE_JUMP(op, loc, to) \ 925 store_op1 (op, loc, (to) - (loc) - 3) 926 927 /* Likewise, for a two-argument jump. */ 928 #define STORE_JUMP2(op, loc, to, arg) \ 929 store_op2 (op, loc, (to) - (loc) - 3, arg) 930 931 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 932 #define INSERT_JUMP(op, loc, to) \ 933 insert_op1 (op, loc, (to) - (loc) - 3, b) 934 935 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 936 #define INSERT_JUMP2(op, loc, to, arg) \ 937 insert_op2 (op, loc, (to) - (loc) - 3, arg, b) 938 939 940 /* This is not an arbitrary limit: the arguments which represent offsets 941 into the pattern are two bytes long. So if 2^16 bytes turns out to 942 be too small, many things would have to change. */ 943 #define MAX_BUF_SIZE (1L << 16) 944 945 946 /* Extend the buffer by twice its current size via realloc and 947 reset the pointers that pointed into the old block to point to the 948 correct places in the new one. If extending the buffer results in it 949 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 950 #define EXTEND_BUFFER() \ 951 do { \ 952 unsigned char *old_buffer = bufp->buffer; \ 953 if (bufp->allocated == MAX_BUF_SIZE) \ 954 return REG_ESIZE; \ 955 bufp->allocated <<= 1; \ 956 if (bufp->allocated > MAX_BUF_SIZE) \ 957 bufp->allocated = MAX_BUF_SIZE; \ 958 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ 959 if (bufp->buffer == NULL) \ 960 return REG_ESPACE; \ 961 /* If the buffer moved, move all the pointers into it. */ \ 962 if (old_buffer != bufp->buffer) \ 963 { \ 964 b = (b - old_buffer) + bufp->buffer; \ 965 begalt = (begalt - old_buffer) + bufp->buffer; \ 966 if (fixup_alt_jump) \ 967 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ 968 if (laststart) \ 969 laststart = (laststart - old_buffer) + bufp->buffer; \ 970 if (pending_exact) \ 971 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 972 } \ 973 } while (0) 974 975 976 /* Since we have one byte reserved for the register number argument to 977 {start,stop}_memory, the maximum number of groups we can report 978 things about is what fits in that byte. */ 979 #define MAX_REGNUM 255 980 981 /* But patterns can have more than `MAX_REGNUM' registers. We just 982 ignore the excess. */ 983 typedef unsigned regnum_t; 984 985 986 /* Macros for the compile stack. */ 987 988 /* Since offsets can go either forwards or backwards, this type needs to 989 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 990 typedef int pattern_offset_t; 991 992 typedef struct 993 { 994 pattern_offset_t begalt_offset; 995 pattern_offset_t fixup_alt_jump; 996 pattern_offset_t inner_group_offset; 997 pattern_offset_t laststart_offset; 998 regnum_t regnum; 999 } compile_stack_elt_t; 1000 1001 1002 typedef struct 1003 { 1004 compile_stack_elt_t *stack; 1005 unsigned size; 1006 unsigned avail; /* Offset of next open position. */ 1007 } compile_stack_type; 1008 1009 1010 #define INIT_COMPILE_STACK_SIZE 32 1011 1012 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 1013 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 1014 1015 /* The next available element. */ 1016 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 1017 1018 1019 /* Set the bit for character C in a list. */ 1020 #define SET_LIST_BIT(c) \ 1021 (b[((unsigned char) (c)) / BYTEWIDTH] \ 1022 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 1023 1024 1025 /* Get the next unsigned number in the uncompiled pattern. */ 1026 #define GET_UNSIGNED_NUMBER(num) \ 1027 { if (p != pend) \ 1028 { \ 1029 PATFETCH (c); \ 1030 while (ISDIGIT (c)) \ 1031 { \ 1032 if (num < 0) \ 1033 num = 0; \ 1034 num = num * 10 + c - '0'; \ 1035 if (p == pend) \ 1036 break; \ 1037 PATFETCH (c); \ 1038 } \ 1039 } \ 1040 } 1041 1042 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 1043 1044 #define IS_CHAR_CLASS(string) \ 1045 (STREQ (string, "alpha") || STREQ (string, "upper") \ 1046 || STREQ (string, "lower") || STREQ (string, "digit") \ 1047 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 1048 || STREQ (string, "space") || STREQ (string, "print") \ 1049 || STREQ (string, "punct") || STREQ (string, "graph") \ 1050 || STREQ (string, "cntrl") || STREQ (string, "blank")) 1051 1052 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 1053 Returns one of error codes defined in `regex.h', or zero for success. 1054 1055 Assumes the `allocated' (and perhaps `buffer') and `translate' 1056 fields are set in BUFP on entry. 1057 1058 If it succeeds, results are put in BUFP (if it returns an error, the 1059 contents of BUFP are undefined): 1060 `buffer' is the compiled pattern; 1061 `syntax' is set to SYNTAX; 1062 `used' is set to the length of the compiled pattern; 1063 `fastmap_accurate' is zero; 1064 `re_nsub' is the number of subexpressions in PATTERN; 1065 `not_bol' and `not_eol' are zero; 1066 1067 The `fastmap' and `newline_anchor' fields are neither 1068 examined nor set. */ 1069 1070 static reg_errcode_t 1071 regex_compile (pattern, size, syntax, bufp) 1072 const char *pattern; 1073 int size; 1074 reg_syntax_t syntax; 1075 struct re_pattern_buffer *bufp; 1076 { 1077 /* We fetch characters from PATTERN here. Even though PATTERN is 1078 `char *' (i.e., signed), we declare these variables as unsigned, so 1079 they can be reliably used as array indices. */ 1080 register unsigned char c, c1; 1081 1082 /* A random tempory spot in PATTERN. */ 1083 const char *p1; 1084 1085 /* Points to the end of the buffer, where we should append. */ 1086 register unsigned char *b; 1087 1088 /* Keeps track of unclosed groups. */ 1089 compile_stack_type compile_stack; 1090 1091 /* Points to the current (ending) position in the pattern. */ 1092 const char *p = pattern; 1093 const char *pend = pattern + size; 1094 1095 /* How to translate the characters in the pattern. */ 1096 char *translate = bufp->translate; 1097 1098 /* Address of the count-byte of the most recently inserted `exactn' 1099 command. This makes it possible to tell if a new exact-match 1100 character can be added to that command or if the character requires 1101 a new `exactn' command. */ 1102 unsigned char *pending_exact = 0; 1103 1104 /* Address of start of the most recently finished expression. 1105 This tells, e.g., postfix * where to find the start of its 1106 operand. Reset at the beginning of groups and alternatives. */ 1107 unsigned char *laststart = 0; 1108 1109 /* Address of beginning of regexp, or inside of last group. */ 1110 unsigned char *begalt; 1111 1112 /* Place in the uncompiled pattern (i.e., the {) to 1113 which to go back if the interval is invalid. */ 1114 const char *beg_interval; 1115 1116 /* Address of the place where a forward jump should go to the end of 1117 the containing expression. Each alternative of an `or' -- except the 1118 last -- ends with a forward jump of this sort. */ 1119 unsigned char *fixup_alt_jump = 0; 1120 1121 /* Counts open-groups as they are encountered. Remembered for the 1122 matching close-group on the compile stack, so the same register 1123 number is put in the stop_memory as the start_memory. */ 1124 regnum_t regnum = 0; 1125 1126 #ifdef DEBUG 1127 DEBUG_PRINT1 ("\nCompiling pattern: "); 1128 if (debug) 1129 { 1130 unsigned debug_count; 1131 1132 for (debug_count = 0; debug_count < size; debug_count++) 1133 printchar (pattern[debug_count]); 1134 putchar ('\n'); 1135 } 1136 #endif /* DEBUG */ 1137 1138 /* Initialize the compile stack. */ 1139 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 1140 if (compile_stack.stack == NULL) 1141 return REG_ESPACE; 1142 1143 compile_stack.size = INIT_COMPILE_STACK_SIZE; 1144 compile_stack.avail = 0; 1145 1146 /* Initialize the pattern buffer. */ 1147 bufp->syntax = syntax; 1148 bufp->fastmap_accurate = 0; 1149 bufp->not_bol = bufp->not_eol = 0; 1150 1151 /* Set `used' to zero, so that if we return an error, the pattern 1152 printer (for debugging) will think there's no pattern. We reset it 1153 at the end. */ 1154 bufp->used = 0; 1155 1156 /* Always count groups, whether or not bufp->no_sub is set. */ 1157 bufp->re_nsub = 0; 1158 1159 #if !defined (emacs) && !defined (SYNTAX_TABLE) 1160 /* Initialize the syntax table. */ 1161 init_syntax_once (); 1162 #endif 1163 1164 if (bufp->allocated == 0) 1165 { 1166 if (bufp->buffer) 1167 { /* If zero allocated, but buffer is non-null, try to realloc 1168 enough space. This loses if buffer's address is bogus, but 1169 that is the user's responsibility. */ 1170 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); 1171 } 1172 else 1173 { /* Caller did not allocate a buffer. Do it for them. */ 1174 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); 1175 } 1176 if (!bufp->buffer) return REG_ESPACE; 1177 1178 bufp->allocated = INIT_BUF_SIZE; 1179 } 1180 1181 begalt = b = bufp->buffer; 1182 1183 /* Loop through the uncompiled pattern until we're at the end. */ 1184 while (p != pend) 1185 { 1186 PATFETCH (c); 1187 1188 switch (c) 1189 { 1190 case '^': 1191 { 1192 if ( /* If at start of pattern, it's an operator. */ 1193 p == pattern + 1 1194 /* If context independent, it's an operator. */ 1195 || syntax & RE_CONTEXT_INDEP_ANCHORS 1196 /* Otherwise, depends on what's come before. */ 1197 || at_begline_loc_p (pattern, p, syntax)) 1198 BUF_PUSH (begline); 1199 else 1200 goto normal_char; 1201 } 1202 break; 1203 1204 1205 case '$': 1206 { 1207 if ( /* If at end of pattern, it's an operator. */ 1208 p == pend 1209 /* If context independent, it's an operator. */ 1210 || syntax & RE_CONTEXT_INDEP_ANCHORS 1211 /* Otherwise, depends on what's next. */ 1212 || at_endline_loc_p (p, pend, syntax)) 1213 BUF_PUSH (endline); 1214 else 1215 goto normal_char; 1216 } 1217 break; 1218 1219 1220 case '+': 1221 case '?': 1222 if ((syntax & RE_BK_PLUS_QM) 1223 || (syntax & RE_LIMITED_OPS)) 1224 goto normal_char; 1225 handle_plus: 1226 case '*': 1227 /* If there is no previous pattern... */ 1228 if (!laststart) 1229 { 1230 if (syntax & RE_CONTEXT_INVALID_OPS) 1231 return REG_BADRPT; 1232 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 1233 goto normal_char; 1234 } 1235 1236 { 1237 /* Are we optimizing this jump? */ 1238 boolean keep_string_p = false; 1239 1240 /* 1 means zero (many) matches is allowed. */ 1241 char zero_times_ok = 0, many_times_ok = 0; 1242 1243 /* If there is a sequence of repetition chars, collapse it 1244 down to just one (the right one). We can't combine 1245 interval operators with these because of, e.g., `a{2}*', 1246 which should only match an even number of `a's. */ 1247 1248 for (;;) 1249 { 1250 zero_times_ok |= c != '+'; 1251 many_times_ok |= c != '?'; 1252 1253 if (p == pend) 1254 break; 1255 1256 PATFETCH (c); 1257 1258 if (c == '*' 1259 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 1260 ; 1261 1262 else if (syntax & RE_BK_PLUS_QM && c == '\\') 1263 { 1264 if (p == pend) return REG_EESCAPE; 1265 1266 PATFETCH (c1); 1267 if (!(c1 == '+' || c1 == '?')) 1268 { 1269 PATUNFETCH; 1270 PATUNFETCH; 1271 break; 1272 } 1273 1274 c = c1; 1275 } 1276 else 1277 { 1278 PATUNFETCH; 1279 break; 1280 } 1281 1282 /* If we get here, we found another repeat character. */ 1283 } 1284 1285 /* Star, etc. applied to an empty pattern is equivalent 1286 to an empty pattern. */ 1287 if (!laststart) 1288 break; 1289 1290 /* Now we know whether or not zero matches is allowed 1291 and also whether or not two or more matches is allowed. */ 1292 if (many_times_ok) 1293 { /* More than one repetition is allowed, so put in at the 1294 end a backward relative jump from `b' to before the next 1295 jump we're going to put in below (which jumps from 1296 laststart to after this jump). 1297 1298 But if we are at the `*' in the exact sequence `.*\n', 1299 insert an unconditional jump backwards to the ., 1300 instead of the beginning of the loop. This way we only 1301 push a failure point once, instead of every time 1302 through the loop. */ 1303 assert (p - 1 > pattern); 1304 1305 /* Allocate the space for the jump. */ 1306 GET_BUFFER_SPACE (3); 1307 1308 /* We know we are not at the first character of the pattern, 1309 because laststart was nonzero. And we've already 1310 incremented `p', by the way, to be the character after 1311 the `*'. Do we have to do something analogous here 1312 for null bytes, because of RE_DOT_NOT_NULL? */ 1313 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 1314 && zero_times_ok 1315 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 1316 && !(syntax & RE_DOT_NEWLINE)) 1317 { /* We have .*\n. */ 1318 STORE_JUMP (jump, b, laststart); 1319 keep_string_p = true; 1320 } 1321 else 1322 /* Anything else. */ 1323 STORE_JUMP (maybe_pop_jump, b, laststart - 3); 1324 1325 /* We've added more stuff to the buffer. */ 1326 b += 3; 1327 } 1328 1329 /* On failure, jump from laststart to b + 3, which will be the 1330 end of the buffer after this jump is inserted. */ 1331 GET_BUFFER_SPACE (3); 1332 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 1333 : on_failure_jump, 1334 laststart, b + 3); 1335 pending_exact = 0; 1336 b += 3; 1337 1338 if (!zero_times_ok) 1339 { 1340 /* At least one repetition is required, so insert a 1341 `dummy_failure_jump' before the initial 1342 `on_failure_jump' instruction of the loop. This 1343 effects a skip over that instruction the first time 1344 we hit that loop. */ 1345 GET_BUFFER_SPACE (3); 1346 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); 1347 b += 3; 1348 } 1349 } 1350 break; 1351 1352 1353 case '.': 1354 laststart = b; 1355 BUF_PUSH (anychar); 1356 break; 1357 1358 1359 case '[': 1360 { 1361 boolean had_char_class = false; 1362 1363 if (p == pend) return REG_EBRACK; 1364 1365 /* Ensure that we have enough space to push a charset: the 1366 opcode, the length count, and the bitset; 34 bytes in all. */ 1367 GET_BUFFER_SPACE (34); 1368 1369 laststart = b; 1370 1371 /* We test `*p == '^' twice, instead of using an if 1372 statement, so we only need one BUF_PUSH. */ 1373 BUF_PUSH (*p == '^' ? charset_not : charset); 1374 if (*p == '^') 1375 p++; 1376 1377 /* Remember the first position in the bracket expression. */ 1378 p1 = p; 1379 1380 /* Push the number of bytes in the bitmap. */ 1381 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 1382 1383 /* Clear the whole map. */ 1384 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 1385 1386 /* charset_not matches newline according to a syntax bit. */ 1387 if ((re_opcode_t) b[-2] == charset_not 1388 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 1389 SET_LIST_BIT ('\n'); 1390 1391 /* Read in characters and ranges, setting map bits. */ 1392 for (;;) 1393 { 1394 if (p == pend) return REG_EBRACK; 1395 1396 PATFETCH (c); 1397 1398 /* \ might escape characters inside [...] and [^...]. */ 1399 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 1400 { 1401 if (p == pend) return REG_EESCAPE; 1402 1403 PATFETCH (c1); 1404 SET_LIST_BIT (c1); 1405 continue; 1406 } 1407 1408 /* Could be the end of the bracket expression. If it's 1409 not (i.e., when the bracket expression is `[]' so 1410 far), the ']' character bit gets set way below. */ 1411 if (c == ']' && p != p1 + 1) 1412 break; 1413 1414 /* Look ahead to see if it's a range when the last thing 1415 was a character class. */ 1416 if (had_char_class && c == '-' && *p != ']') 1417 return REG_ERANGE; 1418 1419 /* Look ahead to see if it's a range when the last thing 1420 was a character: if this is a hyphen not at the 1421 beginning or the end of a list, then it's the range 1422 operator. */ 1423 if (c == '-' 1424 && !(p - 2 >= pattern && p[-2] == '[') 1425 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 1426 && *p != ']') 1427 { 1428 reg_errcode_t ret 1429 = compile_range (&p, pend, translate, syntax, b); 1430 if (ret != REG_NOERROR) return ret; 1431 } 1432 1433 else if (p[0] == '-' && p[1] != ']') 1434 { /* This handles ranges made up of characters only. */ 1435 reg_errcode_t ret; 1436 1437 /* Move past the `-'. */ 1438 PATFETCH (c1); 1439 1440 ret = compile_range (&p, pend, translate, syntax, b); 1441 if (ret != REG_NOERROR) return ret; 1442 } 1443 1444 /* See if we're at the beginning of a possible character 1445 class. */ 1446 1447 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 1448 { /* Leave room for the null. */ 1449 char str[CHAR_CLASS_MAX_LENGTH + 1]; 1450 1451 PATFETCH (c); 1452 c1 = 0; 1453 1454 /* If pattern is `[[:'. */ 1455 if (p == pend) return REG_EBRACK; 1456 1457 for (;;) 1458 { 1459 PATFETCH (c); 1460 if (c == ':' || c == ']' || p == pend 1461 || c1 == CHAR_CLASS_MAX_LENGTH) 1462 break; 1463 str[c1++] = c; 1464 } 1465 str[c1] = '\0'; 1466 1467 /* If isn't a word bracketed by `[:' and:`]': 1468 undo the ending character, the letters, and leave 1469 the leading `:' and `[' (but set bits for them). */ 1470 if (c == ':' && *p == ']') 1471 { 1472 int ch; 1473 boolean is_alnum = STREQ (str, "alnum"); 1474 boolean is_alpha = STREQ (str, "alpha"); 1475 boolean is_blank = STREQ (str, "blank"); 1476 boolean is_cntrl = STREQ (str, "cntrl"); 1477 boolean is_digit = STREQ (str, "digit"); 1478 boolean is_graph = STREQ (str, "graph"); 1479 boolean is_lower = STREQ (str, "lower"); 1480 boolean is_print = STREQ (str, "print"); 1481 boolean is_punct = STREQ (str, "punct"); 1482 boolean is_space = STREQ (str, "space"); 1483 boolean is_upper = STREQ (str, "upper"); 1484 boolean is_xdigit = STREQ (str, "xdigit"); 1485 1486 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; 1487 1488 /* Throw away the ] at the end of the character 1489 class. */ 1490 PATFETCH (c); 1491 1492 if (p == pend) return REG_EBRACK; 1493 1494 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 1495 { 1496 if ( (is_alnum && ISALNUM (ch)) 1497 || (is_alpha && ISALPHA (ch)) 1498 || (is_blank && ISBLANK (ch)) 1499 || (is_cntrl && ISCNTRL (ch)) 1500 || (is_digit && ISDIGIT (ch)) 1501 || (is_graph && ISGRAPH (ch)) 1502 || (is_lower && ISLOWER (ch)) 1503 || (is_print && ISPRINT (ch)) 1504 || (is_punct && ISPUNCT (ch)) 1505 || (is_space && ISSPACE (ch)) 1506 || (is_upper && ISUPPER (ch)) 1507 || (is_xdigit && ISXDIGIT (ch))) 1508 SET_LIST_BIT (ch); 1509 } 1510 had_char_class = true; 1511 } 1512 else 1513 { 1514 c1++; 1515 while (c1--) 1516 PATUNFETCH; 1517 SET_LIST_BIT ('['); 1518 SET_LIST_BIT (':'); 1519 had_char_class = false; 1520 } 1521 } 1522 else 1523 { 1524 had_char_class = false; 1525 SET_LIST_BIT (c); 1526 } 1527 } 1528 1529 /* Discard any (non)matching list bytes that are all 0 at the 1530 end of the map. Decrease the map-length byte too. */ 1531 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 1532 b[-1]--; 1533 b += b[-1]; 1534 } 1535 break; 1536 1537 1538 case '(': 1539 if (syntax & RE_NO_BK_PARENS) 1540 goto handle_open; 1541 else 1542 goto normal_char; 1543 1544 1545 case ')': 1546 if (syntax & RE_NO_BK_PARENS) 1547 goto handle_close; 1548 else 1549 goto normal_char; 1550 1551 1552 case '\n': 1553 if (syntax & RE_NEWLINE_ALT) 1554 goto handle_alt; 1555 else 1556 goto normal_char; 1557 1558 1559 case '|': 1560 if (syntax & RE_NO_BK_VBAR) 1561 goto handle_alt; 1562 else 1563 goto normal_char; 1564 1565 1566 case '{': 1567 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 1568 goto handle_interval; 1569 else 1570 goto normal_char; 1571 1572 1573 case '\\': 1574 if (p == pend) return REG_EESCAPE; 1575 1576 /* Do not translate the character after the \, so that we can 1577 distinguish, e.g., \B from \b, even if we normally would 1578 translate, e.g., B to b. */ 1579 PATFETCH_RAW (c); 1580 1581 switch (c) 1582 { 1583 case '(': 1584 if (syntax & RE_NO_BK_PARENS) 1585 goto normal_backslash; 1586 1587 handle_open: 1588 bufp->re_nsub++; 1589 regnum++; 1590 1591 if (COMPILE_STACK_FULL) 1592 { 1593 RETALLOC (compile_stack.stack, compile_stack.size << 1, 1594 compile_stack_elt_t); 1595 if (compile_stack.stack == NULL) return REG_ESPACE; 1596 1597 compile_stack.size <<= 1; 1598 } 1599 1600 /* These are the values to restore when we hit end of this 1601 group. They are all relative offsets, so that if the 1602 whole pattern moves because of realloc, they will still 1603 be valid. */ 1604 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; 1605 COMPILE_STACK_TOP.fixup_alt_jump 1606 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 1607 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 1608 COMPILE_STACK_TOP.regnum = regnum; 1609 1610 /* We will eventually replace the 0 with the number of 1611 groups inner to this one. But do not push a 1612 start_memory for groups beyond the last one we can 1613 represent in the compiled pattern. */ 1614 if (regnum <= MAX_REGNUM) 1615 { 1616 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; 1617 BUF_PUSH_3 (start_memory, regnum, 0); 1618 } 1619 1620 compile_stack.avail++; 1621 1622 fixup_alt_jump = 0; 1623 laststart = 0; 1624 begalt = b; 1625 /* If we've reached MAX_REGNUM groups, then this open 1626 won't actually generate any code, so we'll have to 1627 clear pending_exact explicitly. */ 1628 pending_exact = 0; 1629 break; 1630 1631 1632 case ')': 1633 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 1634 1635 if (COMPILE_STACK_EMPTY) 1636 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1637 goto normal_backslash; 1638 else 1639 return REG_ERPAREN; 1640 1641 handle_close: 1642 if (fixup_alt_jump) 1643 { /* Push a dummy failure point at the end of the 1644 alternative for a possible future 1645 `pop_failure_jump' to pop. See comments at 1646 `push_dummy_failure' in `re_match_2'. */ 1647 BUF_PUSH (push_dummy_failure); 1648 1649 /* We allocated space for this jump when we assigned 1650 to `fixup_alt_jump', in the `handle_alt' case below. */ 1651 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 1652 } 1653 1654 /* See similar code for backslashed left paren above. */ 1655 if (COMPILE_STACK_EMPTY) 1656 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1657 goto normal_char; 1658 else 1659 return REG_ERPAREN; 1660 1661 /* Since we just checked for an empty stack above, this 1662 ``can't happen''. */ 1663 assert (compile_stack.avail != 0); 1664 { 1665 /* We don't just want to restore into `regnum', because 1666 later groups should continue to be numbered higher, 1667 as in `(ab)c(de)' -- the second group is #2. */ 1668 regnum_t this_group_regnum; 1669 1670 compile_stack.avail--; 1671 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 1672 fixup_alt_jump 1673 = COMPILE_STACK_TOP.fixup_alt_jump 1674 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 1675 : 0; 1676 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; 1677 this_group_regnum = COMPILE_STACK_TOP.regnum; 1678 /* If we've reached MAX_REGNUM groups, then this open 1679 won't actually generate any code, so we'll have to 1680 clear pending_exact explicitly. */ 1681 pending_exact = 0; 1682 1683 /* We're at the end of the group, so now we know how many 1684 groups were inside this one. */ 1685 if (this_group_regnum <= MAX_REGNUM) 1686 { 1687 unsigned char *inner_group_loc 1688 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; 1689 1690 *inner_group_loc = regnum - this_group_regnum; 1691 BUF_PUSH_3 (stop_memory, this_group_regnum, 1692 regnum - this_group_regnum); 1693 } 1694 } 1695 break; 1696 1697 1698 case '|': /* `\|'. */ 1699 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 1700 goto normal_backslash; 1701 handle_alt: 1702 if (syntax & RE_LIMITED_OPS) 1703 goto normal_char; 1704 1705 /* Insert before the previous alternative a jump which 1706 jumps to this alternative if the former fails. */ 1707 GET_BUFFER_SPACE (3); 1708 INSERT_JUMP (on_failure_jump, begalt, b + 6); 1709 pending_exact = 0; 1710 b += 3; 1711 1712 /* The alternative before this one has a jump after it 1713 which gets executed if it gets matched. Adjust that 1714 jump so it will jump to this alternative's analogous 1715 jump (put in below, which in turn will jump to the next 1716 (if any) alternative's such jump, etc.). The last such 1717 jump jumps to the correct final destination. A picture: 1718 _____ _____ 1719 | | | | 1720 | v | v 1721 a | b | c 1722 1723 If we are at `b', then fixup_alt_jump right now points to a 1724 three-byte space after `a'. We'll put in the jump, set 1725 fixup_alt_jump to right after `b', and leave behind three 1726 bytes which we'll fill in when we get to after `c'. */ 1727 1728 if (fixup_alt_jump) 1729 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 1730 1731 /* Mark and leave space for a jump after this alternative, 1732 to be filled in later either by next alternative or 1733 when know we're at the end of a series of alternatives. */ 1734 fixup_alt_jump = b; 1735 GET_BUFFER_SPACE (3); 1736 b += 3; 1737 1738 laststart = 0; 1739 begalt = b; 1740 break; 1741 1742 1743 case '{': 1744 /* If \{ is a literal. */ 1745 if (!(syntax & RE_INTERVALS) 1746 /* If we're at `\{' and it's not the open-interval 1747 operator. */ 1748 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1749 || (p - 2 == pattern && p == pend)) 1750 goto normal_backslash; 1751 1752 handle_interval: 1753 { 1754 /* If got here, then the syntax allows intervals. */ 1755 1756 /* At least (most) this many matches must be made. */ 1757 int lower_bound = -1, upper_bound = -1; 1758 1759 beg_interval = p - 1; 1760 1761 if (p == pend) 1762 { 1763 if (syntax & RE_NO_BK_BRACES) 1764 goto unfetch_interval; 1765 else 1766 return REG_EBRACE; 1767 } 1768 1769 GET_UNSIGNED_NUMBER (lower_bound); 1770 1771 if (c == ',') 1772 { 1773 GET_UNSIGNED_NUMBER (upper_bound); 1774 if (upper_bound < 0) upper_bound = RE_DUP_MAX; 1775 } 1776 else 1777 /* Interval such as `{1}' => match exactly once. */ 1778 upper_bound = lower_bound; 1779 1780 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 1781 || lower_bound > upper_bound) 1782 { 1783 if (syntax & RE_NO_BK_BRACES) 1784 goto unfetch_interval; 1785 else 1786 return REG_BADBR; 1787 } 1788 1789 if (!(syntax & RE_NO_BK_BRACES)) 1790 { 1791 if (c != '\\') return REG_EBRACE; 1792 1793 PATFETCH (c); 1794 } 1795 1796 if (c != '}') 1797 { 1798 if (syntax & RE_NO_BK_BRACES) 1799 goto unfetch_interval; 1800 else 1801 return REG_BADBR; 1802 } 1803 1804 /* We just parsed a valid interval. */ 1805 1806 /* If it's invalid to have no preceding re. */ 1807 if (!laststart) 1808 { 1809 if (syntax & RE_CONTEXT_INVALID_OPS) 1810 return REG_BADRPT; 1811 else if (syntax & RE_CONTEXT_INDEP_OPS) 1812 laststart = b; 1813 else 1814 goto unfetch_interval; 1815 } 1816 1817 /* If the upper bound is zero, don't want to succeed at 1818 all; jump from `laststart' to `b + 3', which will be 1819 the end of the buffer after we insert the jump. */ 1820 if (upper_bound == 0) 1821 { 1822 GET_BUFFER_SPACE (3); 1823 INSERT_JUMP (jump, laststart, b + 3); 1824 b += 3; 1825 } 1826 1827 /* Otherwise, we have a nontrivial interval. When 1828 we're all done, the pattern will look like: 1829 set_number_at <jump count> <upper bound> 1830 set_number_at <succeed_n count> <lower bound> 1831 succeed_n <after jump addr> <succed_n count> 1832 <body of loop> 1833 jump_n <succeed_n addr> <jump count> 1834 (The upper bound and `jump_n' are omitted if 1835 `upper_bound' is 1, though.) */ 1836 else 1837 { /* If the upper bound is > 1, we need to insert 1838 more at the end of the loop. */ 1839 unsigned nbytes = 10 + (upper_bound > 1) * 10; 1840 1841 GET_BUFFER_SPACE (nbytes); 1842 1843 /* Initialize lower bound of the `succeed_n', even 1844 though it will be set during matching by its 1845 attendant `set_number_at' (inserted next), 1846 because `re_compile_fastmap' needs to know. 1847 Jump to the `jump_n' we might insert below. */ 1848 INSERT_JUMP2 (succeed_n, laststart, 1849 b + 5 + (upper_bound > 1) * 5, 1850 lower_bound); 1851 b += 5; 1852 1853 /* Code to initialize the lower bound. Insert 1854 before the `succeed_n'. The `5' is the last two 1855 bytes of this `set_number_at', plus 3 bytes of 1856 the following `succeed_n'. */ 1857 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 1858 b += 5; 1859 1860 if (upper_bound > 1) 1861 { /* More than one repetition is allowed, so 1862 append a backward jump to the `succeed_n' 1863 that starts this interval. 1864 1865 When we've reached this during matching, 1866 we'll have matched the interval once, so 1867 jump back only `upper_bound - 1' times. */ 1868 STORE_JUMP2 (jump_n, b, laststart + 5, 1869 upper_bound - 1); 1870 b += 5; 1871 1872 /* The location we want to set is the second 1873 parameter of the `jump_n'; that is `b-2' as 1874 an absolute address. `laststart' will be 1875 the `set_number_at' we're about to insert; 1876 `laststart+3' the number to set, the source 1877 for the relative address. But we are 1878 inserting into the middle of the pattern -- 1879 so everything is getting moved up by 5. 1880 Conclusion: (b - 2) - (laststart + 3) + 5, 1881 i.e., b - laststart. 1882 1883 We insert this at the beginning of the loop 1884 so that if we fail during matching, we'll 1885 reinitialize the bounds. */ 1886 insert_op2 (set_number_at, laststart, b - laststart, 1887 upper_bound - 1, b); 1888 b += 5; 1889 } 1890 } 1891 pending_exact = 0; 1892 beg_interval = NULL; 1893 } 1894 break; 1895 1896 unfetch_interval: 1897 /* If an invalid interval, match the characters as literals. */ 1898 assert (beg_interval); 1899 p = beg_interval; 1900 beg_interval = NULL; 1901 1902 /* normal_char and normal_backslash need `c'. */ 1903 PATFETCH (c); 1904 1905 if (!(syntax & RE_NO_BK_BRACES)) 1906 { 1907 if (p > pattern && p[-1] == '\\') 1908 goto normal_backslash; 1909 } 1910 goto normal_char; 1911 1912 #ifdef emacs 1913 /* There is no way to specify the before_dot and after_dot 1914 operators. rms says this is ok. --karl */ 1915 case '=': 1916 BUF_PUSH (at_dot); 1917 break; 1918 1919 case 's': 1920 laststart = b; 1921 PATFETCH (c); 1922 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 1923 break; 1924 1925 case 'S': 1926 laststart = b; 1927 PATFETCH (c); 1928 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 1929 break; 1930 #endif /* emacs */ 1931 1932 1933 case 'w': 1934 laststart = b; 1935 BUF_PUSH (wordchar); 1936 break; 1937 1938 1939 case 'W': 1940 laststart = b; 1941 BUF_PUSH (notwordchar); 1942 break; 1943 1944 1945 case '<': 1946 BUF_PUSH (wordbeg); 1947 break; 1948 1949 case '>': 1950 BUF_PUSH (wordend); 1951 break; 1952 1953 case 'b': 1954 BUF_PUSH (wordbound); 1955 break; 1956 1957 case 'B': 1958 BUF_PUSH (notwordbound); 1959 break; 1960 1961 case '`': 1962 BUF_PUSH (begbuf); 1963 break; 1964 1965 case '\'': 1966 BUF_PUSH (endbuf); 1967 break; 1968 1969 case '1': case '2': case '3': case '4': case '5': 1970 case '6': case '7': case '8': case '9': 1971 if (syntax & RE_NO_BK_REFS) 1972 goto normal_char; 1973 1974 c1 = c - '0'; 1975 1976 if (c1 > regnum) 1977 return REG_ESUBREG; 1978 1979 /* Can't back reference to a subexpression if inside of it. */ 1980 if (group_in_compile_stack (compile_stack, c1)) 1981 goto normal_char; 1982 1983 laststart = b; 1984 BUF_PUSH_2 (duplicate, c1); 1985 break; 1986 1987 1988 case '+': 1989 case '?': 1990 if (syntax & RE_BK_PLUS_QM) 1991 goto handle_plus; 1992 else 1993 goto normal_backslash; 1994 1995 default: 1996 normal_backslash: 1997 /* You might think it would be useful for \ to mean 1998 not to translate; but if we don't translate it 1999 it will never match anything. */ 2000 c = TRANSLATE (c); 2001 goto normal_char; 2002 } 2003 break; 2004 2005 2006 default: 2007 /* Expects the character in `c'. */ 2008 normal_char: 2009 /* If no exactn currently being built. */ 2010 if (!pending_exact 2011 2012 /* If last exactn not at current position. */ 2013 || pending_exact + *pending_exact + 1 != b 2014 2015 /* We have only one byte following the exactn for the count. */ 2016 || *pending_exact == (1 << BYTEWIDTH) - 1 2017 2018 /* If followed by a repetition operator. */ 2019 || *p == '*' || *p == '^' 2020 || ((syntax & RE_BK_PLUS_QM) 2021 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 2022 : (*p == '+' || *p == '?')) 2023 || ((syntax & RE_INTERVALS) 2024 && ((syntax & RE_NO_BK_BRACES) 2025 ? *p == '{' 2026 : (p[0] == '\\' && p[1] == '{')))) 2027 { 2028 /* Start building a new exactn. */ 2029 2030 laststart = b; 2031 2032 BUF_PUSH_2 (exactn, 0); 2033 pending_exact = b - 1; 2034 } 2035 2036 BUF_PUSH (c); 2037 (*pending_exact)++; 2038 break; 2039 } /* switch (c) */ 2040 } /* while p != pend */ 2041 2042 2043 /* Through the pattern now. */ 2044 2045 if (fixup_alt_jump) 2046 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2047 2048 if (!COMPILE_STACK_EMPTY) 2049 return REG_EPAREN; 2050 2051 free (compile_stack.stack); 2052 2053 /* We have succeeded; set the length of the buffer. */ 2054 bufp->used = b - bufp->buffer; 2055 2056 #ifdef DEBUG 2057 if (debug) 2058 { 2059 DEBUG_PRINT1 ("\nCompiled pattern: "); 2060 print_compiled_pattern (bufp); 2061 } 2062 #endif /* DEBUG */ 2063 2064 return REG_NOERROR; 2065 } /* regex_compile */ 2066 2067 /* Subroutines for `regex_compile'. */ 2068 2069 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 2070 2071 static void 2072 store_op1 (op, loc, arg) 2073 re_opcode_t op; 2074 unsigned char *loc; 2075 int arg; 2076 { 2077 *loc = (unsigned char) op; 2078 STORE_NUMBER (loc + 1, arg); 2079 } 2080 2081 2082 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 2083 2084 static void 2085 store_op2 (op, loc, arg1, arg2) 2086 re_opcode_t op; 2087 unsigned char *loc; 2088 int arg1, arg2; 2089 { 2090 *loc = (unsigned char) op; 2091 STORE_NUMBER (loc + 1, arg1); 2092 STORE_NUMBER (loc + 3, arg2); 2093 } 2094 2095 2096 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 2097 for OP followed by two-byte integer parameter ARG. */ 2098 2099 static void 2100 insert_op1 (op, loc, arg, end) 2101 re_opcode_t op; 2102 unsigned char *loc; 2103 int arg; 2104 unsigned char *end; 2105 { 2106 register unsigned char *pfrom = end; 2107 register unsigned char *pto = end + 3; 2108 2109 while (pfrom != loc) 2110 *--pto = *--pfrom; 2111 2112 store_op1 (op, loc, arg); 2113 } 2114 2115 2116 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 2117 2118 static void 2119 insert_op2 (op, loc, arg1, arg2, end) 2120 re_opcode_t op; 2121 unsigned char *loc; 2122 int arg1, arg2; 2123 unsigned char *end; 2124 { 2125 register unsigned char *pfrom = end; 2126 register unsigned char *pto = end + 5; 2127 2128 while (pfrom != loc) 2129 *--pto = *--pfrom; 2130 2131 store_op2 (op, loc, arg1, arg2); 2132 } 2133 2134 2135 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 2136 after an alternative or a begin-subexpression. We assume there is at 2137 least one character before the ^. */ 2138 2139 static boolean 2140 at_begline_loc_p (pattern, p, syntax) 2141 const char *pattern, *p; 2142 reg_syntax_t syntax; 2143 { 2144 const char *prev = p - 2; 2145 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 2146 2147 return 2148 /* After a subexpression? */ 2149 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 2150 /* After an alternative? */ 2151 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 2152 } 2153 2154 2155 /* The dual of at_begline_loc_p. This one is for $. We assume there is 2156 at least one character after the $, i.e., `P < PEND'. */ 2157 2158 static boolean 2159 at_endline_loc_p (p, pend, syntax) 2160 const char *p, *pend; 2161 int syntax; 2162 { 2163 const char *next = p; 2164 boolean next_backslash = *next == '\\'; 2165 const char *next_next = p + 1 < pend ? p + 1 : NULL; 2166 2167 return 2168 /* Before a subexpression? */ 2169 (syntax & RE_NO_BK_PARENS ? *next == ')' 2170 : next_backslash && next_next && *next_next == ')') 2171 /* Before an alternative? */ 2172 || (syntax & RE_NO_BK_VBAR ? *next == '|' 2173 : next_backslash && next_next && *next_next == '|'); 2174 } 2175 2176 2177 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 2178 false if it's not. */ 2179 2180 static boolean 2181 group_in_compile_stack (compile_stack, regnum) 2182 compile_stack_type compile_stack; 2183 regnum_t regnum; 2184 { 2185 int this_element; 2186 2187 for (this_element = compile_stack.avail - 1; 2188 this_element >= 0; 2189 this_element--) 2190 if (compile_stack.stack[this_element].regnum == regnum) 2191 return true; 2192 2193 return false; 2194 } 2195 2196 2197 /* Read the ending character of a range (in a bracket expression) from the 2198 uncompiled pattern *P_PTR (which ends at PEND). We assume the 2199 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 2200 Then we set the translation of all bits between the starting and 2201 ending characters (inclusive) in the compiled pattern B. 2202 2203 Return an error code. 2204 2205 We use these short variable names so we can use the same macros as 2206 `regex_compile' itself. */ 2207 2208 static reg_errcode_t 2209 compile_range (p_ptr, pend, translate, syntax, b) 2210 const char **p_ptr, *pend; 2211 char *translate; 2212 reg_syntax_t syntax; 2213 unsigned char *b; 2214 { 2215 unsigned this_char; 2216 2217 const char *p = *p_ptr; 2218 int range_start, range_end; 2219 2220 if (p == pend) 2221 return REG_ERANGE; 2222 2223 /* Even though the pattern is a signed `char *', we need to fetch 2224 with unsigned char *'s; if the high bit of the pattern character 2225 is set, the range endpoints will be negative if we fetch using a 2226 signed char *. 2227 2228 We also want to fetch the endpoints without translating them; the 2229 appropriate translation is done in the bit-setting loop below. */ 2230 range_start = ((unsigned char *) p)[-2]; 2231 range_end = ((unsigned char *) p)[0]; 2232 2233 /* Have to increment the pointer into the pattern string, so the 2234 caller isn't still at the ending character. */ 2235 (*p_ptr)++; 2236 2237 /* If the start is after the end, the range is empty. */ 2238 if (range_start > range_end) 2239 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 2240 2241 /* Here we see why `this_char' has to be larger than an `unsigned 2242 char' -- the range is inclusive, so if `range_end' == 0xff 2243 (assuming 8-bit characters), we would otherwise go into an infinite 2244 loop, since all characters <= 0xff. */ 2245 for (this_char = range_start; this_char <= range_end; this_char++) 2246 { 2247 SET_LIST_BIT (TRANSLATE (this_char)); 2248 } 2249 2250 return REG_NOERROR; 2251 } 2252 2253 /* Failure stack declarations and macros; both re_compile_fastmap and 2254 re_match_2 use a failure stack. These have to be macros because of 2255 REGEX_ALLOCATE. */ 2256 2257 2258 /* Number of failure points for which to initially allocate space 2259 when matching. If this number is exceeded, we allocate more 2260 space, so it is not a hard limit. */ 2261 #ifndef INIT_FAILURE_ALLOC 2262 #define INIT_FAILURE_ALLOC 5 2263 #endif 2264 2265 /* Roughly the maximum number of failure points on the stack. Would be 2266 exactly that if always used MAX_FAILURE_SPACE each time we failed. 2267 This is a variable only so users of regex can assign to it; we never 2268 change it ourselves. */ 2269 int re_max_failures = 2000; 2270 2271 typedef const unsigned char *fail_stack_elt_t; 2272 2273 typedef struct 2274 { 2275 fail_stack_elt_t *stack; 2276 unsigned size; 2277 unsigned avail; /* Offset of next open position. */ 2278 } fail_stack_type; 2279 2280 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 2281 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 2282 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 2283 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) 2284 2285 2286 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */ 2287 2288 #define INIT_FAIL_STACK() \ 2289 do { \ 2290 fail_stack.stack = (fail_stack_elt_t *) \ 2291 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 2292 \ 2293 if (fail_stack.stack == NULL) \ 2294 return -2; \ 2295 \ 2296 fail_stack.size = INIT_FAILURE_ALLOC; \ 2297 fail_stack.avail = 0; \ 2298 } while (0) 2299 2300 2301 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 2302 2303 Return 1 if succeeds, and 0 if either ran out of memory 2304 allocating space for it or it was already too large. 2305 2306 REGEX_REALLOCATE requires `destination' be declared. */ 2307 2308 #define DOUBLE_FAIL_STACK(fail_stack) \ 2309 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ 2310 ? 0 \ 2311 : ((fail_stack).stack = (fail_stack_elt_t *) \ 2312 REGEX_REALLOCATE ((fail_stack).stack, \ 2313 (fail_stack).size * sizeof (fail_stack_elt_t), \ 2314 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 2315 \ 2316 (fail_stack).stack == NULL \ 2317 ? 0 \ 2318 : ((fail_stack).size <<= 1, \ 2319 1))) 2320 2321 2322 /* Push PATTERN_OP on FAIL_STACK. 2323 2324 Return 1 if was able to do so and 0 if ran out of memory allocating 2325 space to do so. */ 2326 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \ 2327 ((FAIL_STACK_FULL () \ 2328 && !DOUBLE_FAIL_STACK (fail_stack)) \ 2329 ? 0 \ 2330 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \ 2331 1)) 2332 2333 /* This pushes an item onto the failure stack. Must be a four-byte 2334 value. Assumes the variable `fail_stack'. Probably should only 2335 be called from within `PUSH_FAILURE_POINT'. */ 2336 #define PUSH_FAILURE_ITEM(item) \ 2337 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item 2338 2339 /* The complement operation. Assumes `fail_stack' is nonempty. */ 2340 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail] 2341 2342 /* Used to omit pushing failure point id's when we're not debugging. */ 2343 #ifdef DEBUG 2344 #define DEBUG_PUSH PUSH_FAILURE_ITEM 2345 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM () 2346 #else 2347 #define DEBUG_PUSH(item) 2348 #define DEBUG_POP(item_addr) 2349 #endif 2350 2351 2352 /* Push the information about the state we will need 2353 if we ever fail back to it. 2354 2355 Requires variables fail_stack, regstart, regend, reg_info, and 2356 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be 2357 declared. 2358 2359 Does `return FAILURE_CODE' if runs out of memory. */ 2360 2361 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 2362 do { \ 2363 char *destination; \ 2364 /* Must be int, so when we don't save any registers, the arithmetic \ 2365 of 0 + -1 isn't done as unsigned. */ \ 2366 int this_reg; \ 2367 \ 2368 DEBUG_STATEMENT (failure_id++); \ 2369 DEBUG_STATEMENT (nfailure_points_pushed++); \ 2370 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 2371 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 2372 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 2373 \ 2374 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ 2375 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 2376 \ 2377 /* Ensure we have enough space allocated for what we will push. */ \ 2378 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 2379 { \ 2380 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 2381 return failure_code; \ 2382 \ 2383 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 2384 (fail_stack).size); \ 2385 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 2386 } \ 2387 \ 2388 /* Push the info, starting with the registers. */ \ 2389 DEBUG_PRINT1 ("\n"); \ 2390 \ 2391 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 2392 this_reg++) \ 2393 { \ 2394 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ 2395 DEBUG_STATEMENT (num_regs_pushed++); \ 2396 \ 2397 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2398 PUSH_FAILURE_ITEM (regstart[this_reg]); \ 2399 \ 2400 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2401 PUSH_FAILURE_ITEM (regend[this_reg]); \ 2402 \ 2403 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ 2404 DEBUG_PRINT2 (" match_null=%d", \ 2405 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 2406 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 2407 DEBUG_PRINT2 (" matched_something=%d", \ 2408 MATCHED_SOMETHING (reg_info[this_reg])); \ 2409 DEBUG_PRINT2 (" ever_matched=%d", \ 2410 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 2411 DEBUG_PRINT1 ("\n"); \ 2412 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \ 2413 } \ 2414 \ 2415 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ 2416 PUSH_FAILURE_ITEM (lowest_active_reg); \ 2417 \ 2418 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ 2419 PUSH_FAILURE_ITEM (highest_active_reg); \ 2420 \ 2421 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ 2422 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 2423 PUSH_FAILURE_ITEM (pattern_place); \ 2424 \ 2425 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ 2426 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 2427 size2); \ 2428 DEBUG_PRINT1 ("'\n"); \ 2429 PUSH_FAILURE_ITEM (string_place); \ 2430 \ 2431 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 2432 DEBUG_PUSH (failure_id); \ 2433 } while (0) 2434 2435 /* This is the number of items that are pushed and popped on the stack 2436 for each register. */ 2437 #define NUM_REG_ITEMS 3 2438 2439 /* Individual items aside from the registers. */ 2440 #ifdef DEBUG 2441 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 2442 #else 2443 #define NUM_NONREG_ITEMS 4 2444 #endif 2445 2446 /* We push at most this many items on the stack. */ 2447 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 2448 2449 /* We actually push this many items. */ 2450 #define NUM_FAILURE_ITEMS \ 2451 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ 2452 + NUM_NONREG_ITEMS) 2453 2454 /* How many items can still be added to the stack without overflowing it. */ 2455 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 2456 2457 2458 /* Pops what PUSH_FAIL_STACK pushes. 2459 2460 We restore into the parameters, all of which should be lvalues: 2461 STR -- the saved data position. 2462 PAT -- the saved pattern position. 2463 LOW_REG, HIGH_REG -- the highest and lowest active registers. 2464 REGSTART, REGEND -- arrays of string positions. 2465 REG_INFO -- array of information about each subexpression. 2466 2467 Also assumes the variables `fail_stack' and (if debugging), `bufp', 2468 `pend', `string1', `size1', `string2', and `size2'. */ 2469 2470 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 2471 { \ 2472 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ 2473 int this_reg; \ 2474 const unsigned char *string_temp; \ 2475 \ 2476 assert (!FAIL_STACK_EMPTY ()); \ 2477 \ 2478 /* Remove failure points and point to how many regs pushed. */ \ 2479 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 2480 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 2481 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 2482 \ 2483 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 2484 \ 2485 DEBUG_POP (&failure_id); \ 2486 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 2487 \ 2488 /* If the saved string location is NULL, it came from an \ 2489 on_failure_keep_string_jump opcode, and we want to throw away the \ 2490 saved NULL, thus retaining our current position in the string. */ \ 2491 string_temp = POP_FAILURE_ITEM (); \ 2492 if (string_temp != NULL) \ 2493 str = (const char *) string_temp; \ 2494 \ 2495 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ 2496 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 2497 DEBUG_PRINT1 ("'\n"); \ 2498 \ 2499 pat = (unsigned char *) POP_FAILURE_ITEM (); \ 2500 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ 2501 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 2502 \ 2503 /* Restore register info. */ \ 2504 high_reg = (unsigned) POP_FAILURE_ITEM (); \ 2505 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ 2506 \ 2507 low_reg = (unsigned) POP_FAILURE_ITEM (); \ 2508 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ 2509 \ 2510 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 2511 { \ 2512 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ 2513 \ 2514 reg_info[this_reg].word = POP_FAILURE_ITEM (); \ 2515 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ 2516 \ 2517 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \ 2518 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2519 \ 2520 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \ 2521 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2522 } \ 2523 \ 2524 DEBUG_STATEMENT (nfailure_points_popped++); \ 2525 } /* POP_FAILURE_POINT */ 2526 2527 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 2528 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 2529 characters can start a string that matches the pattern. This fastmap 2530 is used by re_search to skip quickly over impossible starting points. 2531 2532 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 2533 area as BUFP->fastmap. 2534 2535 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 2536 the pattern buffer. 2537 2538 Returns 0 if we succeed, -2 if an internal error. */ 2539 2540 int 2541 re_compile_fastmap (bufp) 2542 struct re_pattern_buffer *bufp; 2543 { 2544 int j, k; 2545 fail_stack_type fail_stack; 2546 #ifndef REGEX_MALLOC 2547 char *destination; 2548 #endif 2549 /* We don't push any register information onto the failure stack. */ 2550 unsigned num_regs = 0; 2551 2552 register char *fastmap = bufp->fastmap; 2553 unsigned char *pattern = bufp->buffer; 2554 unsigned long size = bufp->used; 2555 const unsigned char *p = pattern; 2556 register unsigned char *pend = pattern + size; 2557 2558 /* Assume that each path through the pattern can be null until 2559 proven otherwise. We set this false at the bottom of switch 2560 statement, to which we get only if a particular path doesn't 2561 match the empty string. */ 2562 boolean path_can_be_null = true; 2563 2564 /* We aren't doing a `succeed_n' to begin with. */ 2565 boolean succeed_n_p = false; 2566 2567 assert (fastmap != NULL && p != NULL); 2568 2569 INIT_FAIL_STACK (); 2570 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 2571 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 2572 bufp->can_be_null = 0; 2573 2574 while (p != pend || !FAIL_STACK_EMPTY ()) 2575 { 2576 if (p == pend) 2577 { 2578 bufp->can_be_null |= path_can_be_null; 2579 2580 /* Reset for next path. */ 2581 path_can_be_null = true; 2582 2583 p = fail_stack.stack[--fail_stack.avail]; 2584 } 2585 2586 /* We should never be about to go beyond the end of the pattern. */ 2587 assert (p < pend); 2588 2589 #ifdef SWITCH_ENUM_BUG 2590 switch ((int) ((re_opcode_t) *p++)) 2591 #else 2592 switch ((re_opcode_t) *p++) 2593 #endif 2594 { 2595 2596 /* I guess the idea here is to simply not bother with a fastmap 2597 if a backreference is used, since it's too hard to figure out 2598 the fastmap for the corresponding group. Setting 2599 `can_be_null' stops `re_search_2' from using the fastmap, so 2600 that is all we do. */ 2601 case duplicate: 2602 bufp->can_be_null = 1; 2603 return 0; 2604 2605 2606 /* Following are the cases which match a character. These end 2607 with `break'. */ 2608 2609 case exactn: 2610 fastmap[p[1]] = 1; 2611 break; 2612 2613 2614 case charset: 2615 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2616 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 2617 fastmap[j] = 1; 2618 break; 2619 2620 2621 case charset_not: 2622 /* Chars beyond end of map must be allowed. */ 2623 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 2624 fastmap[j] = 1; 2625 2626 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2627 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 2628 fastmap[j] = 1; 2629 break; 2630 2631 2632 case wordchar: 2633 for (j = 0; j < (1 << BYTEWIDTH); j++) 2634 if (SYNTAX (j) == Sword) 2635 fastmap[j] = 1; 2636 break; 2637 2638 2639 case notwordchar: 2640 for (j = 0; j < (1 << BYTEWIDTH); j++) 2641 if (SYNTAX (j) != Sword) 2642 fastmap[j] = 1; 2643 break; 2644 2645 2646 case anychar: 2647 /* `.' matches anything ... */ 2648 for (j = 0; j < (1 << BYTEWIDTH); j++) 2649 fastmap[j] = 1; 2650 2651 /* ... except perhaps newline. */ 2652 if (!(bufp->syntax & RE_DOT_NEWLINE)) 2653 fastmap['\n'] = 0; 2654 2655 /* Return if we have already set `can_be_null'; if we have, 2656 then the fastmap is irrelevant. Something's wrong here. */ 2657 else if (bufp->can_be_null) 2658 return 0; 2659 2660 /* Otherwise, have to check alternative paths. */ 2661 break; 2662 2663 2664 #ifdef emacs 2665 case syntaxspec: 2666 k = *p++; 2667 for (j = 0; j < (1 << BYTEWIDTH); j++) 2668 if (SYNTAX (j) == (enum syntaxcode) k) 2669 fastmap[j] = 1; 2670 break; 2671 2672 2673 case notsyntaxspec: 2674 k = *p++; 2675 for (j = 0; j < (1 << BYTEWIDTH); j++) 2676 if (SYNTAX (j) != (enum syntaxcode) k) 2677 fastmap[j] = 1; 2678 break; 2679 2680 2681 /* All cases after this match the empty string. These end with 2682 `continue'. */ 2683 2684 2685 case before_dot: 2686 case at_dot: 2687 case after_dot: 2688 continue; 2689 #endif /* not emacs */ 2690 2691 2692 case no_op: 2693 case begline: 2694 case endline: 2695 case begbuf: 2696 case endbuf: 2697 case wordbound: 2698 case notwordbound: 2699 case wordbeg: 2700 case wordend: 2701 case push_dummy_failure: 2702 continue; 2703 2704 2705 case jump_n: 2706 case pop_failure_jump: 2707 case maybe_pop_jump: 2708 case jump: 2709 case jump_past_alt: 2710 case dummy_failure_jump: 2711 EXTRACT_NUMBER_AND_INCR (j, p); 2712 p += j; 2713 if (j > 0) 2714 continue; 2715 2716 /* Jump backward implies we just went through the body of a 2717 loop and matched nothing. Opcode jumped to should be 2718 `on_failure_jump' or `succeed_n'. Just treat it like an 2719 ordinary jump. For a * loop, it has pushed its failure 2720 point already; if so, discard that as redundant. */ 2721 if ((re_opcode_t) *p != on_failure_jump 2722 && (re_opcode_t) *p != succeed_n) 2723 continue; 2724 2725 p++; 2726 EXTRACT_NUMBER_AND_INCR (j, p); 2727 p += j; 2728 2729 /* If what's on the stack is where we are now, pop it. */ 2730 if (!FAIL_STACK_EMPTY () 2731 && fail_stack.stack[fail_stack.avail - 1] == p) 2732 fail_stack.avail--; 2733 2734 continue; 2735 2736 2737 case on_failure_jump: 2738 case on_failure_keep_string_jump: 2739 handle_on_failure_jump: 2740 EXTRACT_NUMBER_AND_INCR (j, p); 2741 2742 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 2743 end of the pattern. We don't want to push such a point, 2744 since when we restore it above, entering the switch will 2745 increment `p' past the end of the pattern. We don't need 2746 to push such a point since we obviously won't find any more 2747 fastmap entries beyond `pend'. Such a pattern can match 2748 the null string, though. */ 2749 if (p + j < pend) 2750 { 2751 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 2752 return -2; 2753 } 2754 else 2755 bufp->can_be_null = 1; 2756 2757 if (succeed_n_p) 2758 { 2759 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 2760 succeed_n_p = false; 2761 } 2762 2763 continue; 2764 2765 2766 case succeed_n: 2767 /* Get to the number of times to succeed. */ 2768 p += 2; 2769 2770 /* Increment p past the n for when k != 0. */ 2771 EXTRACT_NUMBER_AND_INCR (k, p); 2772 if (k == 0) 2773 { 2774 p -= 4; 2775 succeed_n_p = true; /* Spaghetti code alert. */ 2776 goto handle_on_failure_jump; 2777 } 2778 continue; 2779 2780 2781 case set_number_at: 2782 p += 4; 2783 continue; 2784 2785 2786 case start_memory: 2787 case stop_memory: 2788 p += 2; 2789 continue; 2790 2791 2792 default: 2793 abort (); /* We have listed all the cases. */ 2794 } /* switch *p++ */ 2795 2796 /* Getting here means we have found the possible starting 2797 characters for one path of the pattern -- and that the empty 2798 string does not match. We need not follow this path further. 2799 Instead, look at the next alternative (remembered on the 2800 stack), or quit if no more. The test at the top of the loop 2801 does these things. */ 2802 path_can_be_null = false; 2803 p = pend; 2804 } /* while p */ 2805 2806 /* Set `can_be_null' for the last path (also the first path, if the 2807 pattern is empty). */ 2808 bufp->can_be_null |= path_can_be_null; 2809 return 0; 2810 } /* re_compile_fastmap */ 2811 2812 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 2813 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 2814 this memory for recording register information. STARTS and ENDS 2815 must be allocated using the malloc library routine, and must each 2816 be at least NUM_REGS * sizeof (regoff_t) bytes long. 2817 2818 If NUM_REGS == 0, then subsequent matches should allocate their own 2819 register data. 2820 2821 Unless this function is called, the first search or match using 2822 PATTERN_BUFFER will allocate its own register data, without 2823 freeing the old data. */ 2824 2825 void 2826 re_set_registers (bufp, regs, num_regs, starts, ends) 2827 struct re_pattern_buffer *bufp; 2828 struct re_registers *regs; 2829 unsigned num_regs; 2830 regoff_t *starts, *ends; 2831 { 2832 if (num_regs) 2833 { 2834 bufp->regs_allocated = REGS_REALLOCATE; 2835 regs->num_regs = num_regs; 2836 regs->start = starts; 2837 regs->end = ends; 2838 } 2839 else 2840 { 2841 bufp->regs_allocated = REGS_UNALLOCATED; 2842 regs->num_regs = 0; 2843 regs->start = regs->end = (regoff_t *) 0; 2844 } 2845 } 2846 2847 /* Searching routines. */ 2848 2849 /* Like re_search_2, below, but only one string is specified, and 2850 doesn't let you say where to stop matching. */ 2851 2852 int 2853 re_search (bufp, string, size, startpos, range, regs) 2854 struct re_pattern_buffer *bufp; 2855 const char *string; 2856 int size, startpos, range; 2857 struct re_registers *regs; 2858 { 2859 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 2860 regs, size); 2861 } 2862 2863 2864 /* Using the compiled pattern in BUFP->buffer, first tries to match the 2865 virtual concatenation of STRING1 and STRING2, starting first at index 2866 STARTPOS, then at STARTPOS + 1, and so on. 2867 2868 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 2869 2870 RANGE is how far to scan while trying to match. RANGE = 0 means try 2871 only at STARTPOS; in general, the last start tried is STARTPOS + 2872 RANGE. 2873 2874 In REGS, return the indices of the virtual concatenation of STRING1 2875 and STRING2 that matched the entire BUFP->buffer and its contained 2876 subexpressions. 2877 2878 Do not consider matching one past the index STOP in the virtual 2879 concatenation of STRING1 and STRING2. 2880 2881 We return either the position in the strings at which the match was 2882 found, -1 if no match, or -2 if error (such as failure 2883 stack overflow). */ 2884 2885 int 2886 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) 2887 struct re_pattern_buffer *bufp; 2888 const char *string1, *string2; 2889 int size1, size2; 2890 int startpos; 2891 int range; 2892 struct re_registers *regs; 2893 int stop; 2894 { 2895 int val; 2896 register char *fastmap = bufp->fastmap; 2897 register char *translate = bufp->translate; 2898 int total_size = size1 + size2; 2899 int endpos = startpos + range; 2900 2901 /* Check for out-of-range STARTPOS. */ 2902 if (startpos < 0 || startpos > total_size) 2903 return -1; 2904 2905 /* Fix up RANGE if it might eventually take us outside 2906 the virtual concatenation of STRING1 and STRING2. */ 2907 if (endpos < -1) 2908 range = -1 - startpos; 2909 else if (endpos > total_size) 2910 range = total_size - startpos; 2911 2912 /* If the search isn't to be a backwards one, don't waste time in a 2913 search for a pattern that must be anchored. */ 2914 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) 2915 { 2916 if (startpos > 0) 2917 return -1; 2918 else 2919 range = 1; 2920 } 2921 2922 /* Update the fastmap now if not correct already. */ 2923 if (fastmap && !bufp->fastmap_accurate) 2924 if (re_compile_fastmap (bufp) == -2) 2925 return -2; 2926 2927 /* Loop through the string, looking for a place to start matching. */ 2928 for (;;) 2929 { 2930 /* If a fastmap is supplied, skip quickly over characters that 2931 cannot be the start of a match. If the pattern can match the 2932 null string, however, we don't need to skip characters; we want 2933 the first null string. */ 2934 if (fastmap && startpos < total_size && !bufp->can_be_null) 2935 { 2936 if (range > 0) /* Searching forwards. */ 2937 { 2938 register const char *d; 2939 register int lim = 0; 2940 int irange = range; 2941 2942 if (startpos < size1 && startpos + range >= size1) 2943 lim = range - (size1 - startpos); 2944 2945 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 2946 2947 /* Written out as an if-else to avoid testing `translate' 2948 inside the loop. */ 2949 if (translate) 2950 while (range > lim 2951 && !fastmap[(unsigned char) 2952 translate[(unsigned char) *d++]]) 2953 range--; 2954 else 2955 while (range > lim && !fastmap[(unsigned char) *d++]) 2956 range--; 2957 2958 startpos += irange - range; 2959 } 2960 else /* Searching backwards. */ 2961 { 2962 register char c = (size1 == 0 || startpos >= size1 2963 ? string2[startpos - size1] 2964 : string1[startpos]); 2965 2966 if (!fastmap[(unsigned char) TRANSLATE (c)]) 2967 goto advance; 2968 } 2969 } 2970 2971 /* If can't match the null string, and that's all we have left, fail. */ 2972 if (range >= 0 && startpos == total_size && fastmap 2973 && !bufp->can_be_null) 2974 return -1; 2975 2976 val = re_match_2 (bufp, string1, size1, string2, size2, 2977 startpos, regs, stop); 2978 if (val >= 0) 2979 return startpos; 2980 2981 if (val == -2) 2982 return -2; 2983 2984 advance: 2985 if (!range) 2986 break; 2987 else if (range > 0) 2988 { 2989 range--; 2990 startpos++; 2991 } 2992 else 2993 { 2994 range++; 2995 startpos--; 2996 } 2997 } 2998 return -1; 2999 } /* re_search_2 */ 3000 3001 /* Declarations and macros for re_match_2. */ 3002 3003 static int bcmp_translate (); 3004 static boolean alt_match_null_string_p (), 3005 common_op_match_null_string_p (), 3006 group_match_null_string_p (); 3007 3008 /* Structure for per-register (a.k.a. per-group) information. 3009 This must not be longer than one word, because we push this value 3010 onto the failure stack. Other register information, such as the 3011 starting and ending positions (which are addresses), and the list of 3012 inner groups (which is a bits list) are maintained in separate 3013 variables. 3014 3015 We are making a (strictly speaking) nonportable assumption here: that 3016 the compiler will pack our bit fields into something that fits into 3017 the type of `word', i.e., is something that fits into one item on the 3018 failure stack. */ 3019 typedef union 3020 { 3021 fail_stack_elt_t word; 3022 struct 3023 { 3024 /* This field is one if this group can match the empty string, 3025 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 3026 #define MATCH_NULL_UNSET_VALUE 3 3027 unsigned match_null_string_p : 2; 3028 unsigned is_active : 1; 3029 unsigned matched_something : 1; 3030 unsigned ever_matched_something : 1; 3031 } bits; 3032 } register_info_type; 3033 3034 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 3035 #define IS_ACTIVE(R) ((R).bits.is_active) 3036 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) 3037 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 3038 3039 3040 /* Call this when have matched a real character; it sets `matched' flags 3041 for the subexpressions which we are currently inside. Also records 3042 that those subexprs have matched. */ 3043 #define SET_REGS_MATCHED() \ 3044 do \ 3045 { \ 3046 unsigned r; \ 3047 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 3048 { \ 3049 MATCHED_SOMETHING (reg_info[r]) \ 3050 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 3051 = 1; \ 3052 } \ 3053 } \ 3054 while (0) 3055 3056 3057 /* This converts PTR, a pointer into one of the search strings `string1' 3058 and `string2' into an offset from the beginning of that string. */ 3059 #define POINTER_TO_OFFSET(ptr) \ 3060 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) 3061 3062 /* Registers are set to a sentinel when they haven't yet matched. */ 3063 #define REG_UNSET_VALUE ((char *) -1) 3064 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 3065 3066 3067 /* Macros for dealing with the split strings in re_match_2. */ 3068 3069 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 3070 3071 /* Call before fetching a character with *d. This switches over to 3072 string2 if necessary. */ 3073 #define PREFETCH() \ 3074 while (d == dend) \ 3075 { \ 3076 /* End of string2 => fail. */ \ 3077 if (dend == end_match_2) \ 3078 goto fail; \ 3079 /* End of string1 => advance to string2. */ \ 3080 d = string2; \ 3081 dend = end_match_2; \ 3082 } 3083 3084 3085 /* Test if at very beginning or at very end of the virtual concatenation 3086 of `string1' and `string2'. If only one string, it's `string2'. */ 3087 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 3088 #define AT_STRINGS_END(d) ((d) == end2) 3089 3090 3091 /* Test if D points to a character which is word-constituent. We have 3092 two special cases to check for: if past the end of string1, look at 3093 the first character in string2; and if before the beginning of 3094 string2, look at the last character in string1. */ 3095 #define WORDCHAR_P(d) \ 3096 (SYNTAX ((d) == end1 ? *string2 \ 3097 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 3098 == Sword) 3099 3100 /* Test if the character before D and the one at D differ with respect 3101 to being word-constituent. */ 3102 #define AT_WORD_BOUNDARY(d) \ 3103 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 3104 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 3105 3106 3107 /* Free everything we malloc. */ 3108 #ifdef REGEX_MALLOC 3109 #define FREE_VAR(var) if (var) free (var); var = NULL 3110 #define FREE_VARIABLES() \ 3111 do { \ 3112 FREE_VAR (fail_stack.stack); \ 3113 FREE_VAR (regstart); \ 3114 FREE_VAR (regend); \ 3115 FREE_VAR (old_regstart); \ 3116 FREE_VAR (old_regend); \ 3117 FREE_VAR (best_regstart); \ 3118 FREE_VAR (best_regend); \ 3119 FREE_VAR (reg_info); \ 3120 FREE_VAR (reg_dummy); \ 3121 FREE_VAR (reg_info_dummy); \ 3122 } while (0) 3123 #else /* not REGEX_MALLOC */ 3124 /* Some MIPS systems (at least) want this to free alloca'd storage. */ 3125 #define FREE_VARIABLES() alloca (0) 3126 #endif /* not REGEX_MALLOC */ 3127 3128 3129 /* These values must meet several constraints. They must not be valid 3130 register values; since we have a limit of 255 registers (because 3131 we use only one byte in the pattern for the register number), we can 3132 use numbers larger than 255. They must differ by 1, because of 3133 NUM_FAILURE_ITEMS above. And the value for the lowest register must 3134 be larger than the value for the highest register, so we do not try 3135 to actually save any registers when none are active. */ 3136 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 3137 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 3138 3139 /* Matching routines. */ 3140 3141 #ifndef emacs /* Emacs never uses this. */ 3142 /* re_match is like re_match_2 except it takes only a single string. */ 3143 3144 int 3145 re_match (bufp, string, size, pos, regs) 3146 struct re_pattern_buffer *bufp; 3147 const char *string; 3148 int size, pos; 3149 struct re_registers *regs; 3150 { 3151 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); 3152 } 3153 #endif /* not emacs */ 3154 3155 3156 /* re_match_2 matches the compiled pattern in BUFP against the 3157 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 3158 and SIZE2, respectively). We start matching at POS, and stop 3159 matching at STOP. 3160 3161 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 3162 store offsets for the substring each group matched in REGS. See the 3163 documentation for exactly how many groups we fill. 3164 3165 We return -1 if no match, -2 if an internal error (such as the 3166 failure stack overflowing). Otherwise, we return the length of the 3167 matched substring. */ 3168 3169 int 3170 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 3171 struct re_pattern_buffer *bufp; 3172 const char *string1, *string2; 3173 int size1, size2; 3174 int pos; 3175 struct re_registers *regs; 3176 int stop; 3177 { 3178 /* General temporaries. */ 3179 int mcnt; 3180 unsigned char *p1; 3181 3182 /* Just past the end of the corresponding string. */ 3183 const char *end1, *end2; 3184 3185 /* Pointers into string1 and string2, just past the last characters in 3186 each to consider matching. */ 3187 const char *end_match_1, *end_match_2; 3188 3189 /* Where we are in the data, and the end of the current string. */ 3190 const char *d, *dend; 3191 3192 /* Where we are in the pattern, and the end of the pattern. */ 3193 unsigned char *p = bufp->buffer; 3194 register unsigned char *pend = p + bufp->used; 3195 3196 /* We use this to map every character in the string. */ 3197 char *translate = bufp->translate; 3198 3199 /* Failure point stack. Each place that can handle a failure further 3200 down the line pushes a failure point on this stack. It consists of 3201 restart, regend, and reg_info for all registers corresponding to 3202 the subexpressions we're currently inside, plus the number of such 3203 registers, and, finally, two char *'s. The first char * is where 3204 to resume scanning the pattern; the second one is where to resume 3205 scanning the strings. If the latter is zero, the failure point is 3206 a ``dummy''; if a failure happens and the failure point is a dummy, 3207 it gets discarded and the next next one is tried. */ 3208 fail_stack_type fail_stack; 3209 #ifdef DEBUG 3210 static unsigned failure_id = 0; 3211 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 3212 #endif 3213 3214 /* We fill all the registers internally, independent of what we 3215 return, for use in backreferences. The number here includes 3216 an element for register zero. */ 3217 unsigned num_regs = bufp->re_nsub + 1; 3218 3219 /* The currently active registers. */ 3220 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3221 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3222 3223 /* Information on the contents of registers. These are pointers into 3224 the input strings; they record just what was matched (on this 3225 attempt) by a subexpression part of the pattern, that is, the 3226 regnum-th regstart pointer points to where in the pattern we began 3227 matching and the regnum-th regend points to right after where we 3228 stopped matching the regnum-th subexpression. (The zeroth register 3229 keeps track of what the whole pattern matches.) */ 3230 const char **regstart, **regend; 3231 3232 /* If a group that's operated upon by a repetition operator fails to 3233 match anything, then the register for its start will need to be 3234 restored because it will have been set to wherever in the string we 3235 are when we last see its open-group operator. Similarly for a 3236 register's end. */ 3237 const char **old_regstart, **old_regend; 3238 3239 /* The is_active field of reg_info helps us keep track of which (possibly 3240 nested) subexpressions we are currently in. The matched_something 3241 field of reg_info[reg_num] helps us tell whether or not we have 3242 matched any of the pattern so far this time through the reg_num-th 3243 subexpression. These two fields get reset each time through any 3244 loop their register is in. */ 3245 register_info_type *reg_info; 3246 3247 /* The following record the register info as found in the above 3248 variables when we find a match better than any we've seen before. 3249 This happens as we backtrack through the failure points, which in 3250 turn happens only if we have not yet matched the entire string. */ 3251 unsigned best_regs_set = false; 3252 const char **best_regstart, **best_regend; 3253 3254 /* Logically, this is `best_regend[0]'. But we don't want to have to 3255 allocate space for that if we're not allocating space for anything 3256 else (see below). Also, we never need info about register 0 for 3257 any of the other register vectors, and it seems rather a kludge to 3258 treat `best_regend' differently than the rest. So we keep track of 3259 the end of the best match so far in a separate variable. We 3260 initialize this to NULL so that when we backtrack the first time 3261 and need to test it, it's not garbage. */ 3262 const char *match_end = NULL; 3263 3264 /* Used when we pop values we don't care about. */ 3265 const char **reg_dummy; 3266 register_info_type *reg_info_dummy; 3267 3268 #ifdef DEBUG 3269 /* Counts the total number of registers pushed. */ 3270 unsigned num_regs_pushed = 0; 3271 #endif 3272 3273 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 3274 3275 INIT_FAIL_STACK (); 3276 3277 /* Do not bother to initialize all the register variables if there are 3278 no groups in the pattern, as it takes a fair amount of time. If 3279 there are groups, we include space for register 0 (the whole 3280 pattern), even though we never use it, since it simplifies the 3281 array indexing. We should fix this. */ 3282 if (bufp->re_nsub) 3283 { 3284 regstart = REGEX_TALLOC (num_regs, const char *); 3285 regend = REGEX_TALLOC (num_regs, const char *); 3286 old_regstart = REGEX_TALLOC (num_regs, const char *); 3287 old_regend = REGEX_TALLOC (num_regs, const char *); 3288 best_regstart = REGEX_TALLOC (num_regs, const char *); 3289 best_regend = REGEX_TALLOC (num_regs, const char *); 3290 reg_info = REGEX_TALLOC (num_regs, register_info_type); 3291 reg_dummy = REGEX_TALLOC (num_regs, const char *); 3292 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 3293 3294 if (!(regstart && regend && old_regstart && old_regend && reg_info 3295 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 3296 { 3297 FREE_VARIABLES (); 3298 return -2; 3299 } 3300 } 3301 #ifdef REGEX_MALLOC 3302 else 3303 { 3304 /* We must initialize all our variables to NULL, so that 3305 `FREE_VARIABLES' doesn't try to free them. */ 3306 regstart = regend = old_regstart = old_regend = best_regstart 3307 = best_regend = reg_dummy = NULL; 3308 reg_info = reg_info_dummy = (register_info_type *) NULL; 3309 } 3310 #endif /* REGEX_MALLOC */ 3311 3312 /* The starting position is bogus. */ 3313 if (pos < 0 || pos > size1 + size2) 3314 { 3315 FREE_VARIABLES (); 3316 return -1; 3317 } 3318 3319 /* Initialize subexpression text positions to -1 to mark ones that no 3320 start_memory/stop_memory has been seen for. Also initialize the 3321 register information struct. */ 3322 for (mcnt = 1; mcnt < num_regs; mcnt++) 3323 { 3324 regstart[mcnt] = regend[mcnt] 3325 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 3326 3327 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 3328 IS_ACTIVE (reg_info[mcnt]) = 0; 3329 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3330 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3331 } 3332 3333 /* We move `string1' into `string2' if the latter's empty -- but not if 3334 `string1' is null. */ 3335 if (size2 == 0 && string1 != NULL) 3336 { 3337 string2 = string1; 3338 size2 = size1; 3339 string1 = 0; 3340 size1 = 0; 3341 } 3342 end1 = string1 + size1; 3343 end2 = string2 + size2; 3344 3345 /* Compute where to stop matching, within the two strings. */ 3346 if (stop <= size1) 3347 { 3348 end_match_1 = string1 + stop; 3349 end_match_2 = string2; 3350 } 3351 else 3352 { 3353 end_match_1 = end1; 3354 end_match_2 = string2 + stop - size1; 3355 } 3356 3357 /* `p' scans through the pattern as `d' scans through the data. 3358 `dend' is the end of the input string that `d' points within. `d' 3359 is advanced into the following input string whenever necessary, but 3360 this happens before fetching; therefore, at the beginning of the 3361 loop, `d' can be pointing at the end of a string, but it cannot 3362 equal `string2'. */ 3363 if (size1 > 0 && pos <= size1) 3364 { 3365 d = string1 + pos; 3366 dend = end_match_1; 3367 } 3368 else 3369 { 3370 d = string2 + pos - size1; 3371 dend = end_match_2; 3372 } 3373 3374 DEBUG_PRINT1 ("The compiled pattern is: "); 3375 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 3376 DEBUG_PRINT1 ("The string to match is: `"); 3377 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 3378 DEBUG_PRINT1 ("'\n"); 3379 3380 /* This loops over pattern commands. It exits by returning from the 3381 function if the match is complete, or it drops through if the match 3382 fails at this starting point in the input data. */ 3383 for (;;) 3384 { 3385 DEBUG_PRINT2 ("\n0x%x: ", p); 3386 3387 if (p == pend) 3388 { /* End of pattern means we might have succeeded. */ 3389 DEBUG_PRINT1 ("end of pattern ... "); 3390 3391 /* If we haven't matched the entire string, and we want the 3392 longest match, try backtracking. */ 3393 if (d != end_match_2) 3394 { 3395 DEBUG_PRINT1 ("backtracking.\n"); 3396 3397 if (!FAIL_STACK_EMPTY ()) 3398 { /* More failure points to try. */ 3399 boolean same_str_p = (FIRST_STRING_P (match_end) 3400 == MATCHING_IN_FIRST_STRING); 3401 3402 /* If exceeds best match so far, save it. */ 3403 if (!best_regs_set 3404 || (same_str_p && d > match_end) 3405 || (!same_str_p && !MATCHING_IN_FIRST_STRING)) 3406 { 3407 best_regs_set = true; 3408 match_end = d; 3409 3410 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 3411 3412 for (mcnt = 1; mcnt < num_regs; mcnt++) 3413 { 3414 best_regstart[mcnt] = regstart[mcnt]; 3415 best_regend[mcnt] = regend[mcnt]; 3416 } 3417 } 3418 goto fail; 3419 } 3420 3421 /* If no failure points, don't restore garbage. */ 3422 else if (best_regs_set) 3423 { 3424 restore_best_regs: 3425 /* Restore best match. It may happen that `dend == 3426 end_match_1' while the restored d is in string2. 3427 For example, the pattern `x.*y.*z' against the 3428 strings `x-' and `y-z-', if the two strings are 3429 not consecutive in memory. */ 3430 DEBUG_PRINT1 ("Restoring best registers.\n"); 3431 3432 d = match_end; 3433 dend = ((d >= string1 && d <= end1) 3434 ? end_match_1 : end_match_2); 3435 3436 for (mcnt = 1; mcnt < num_regs; mcnt++) 3437 { 3438 regstart[mcnt] = best_regstart[mcnt]; 3439 regend[mcnt] = best_regend[mcnt]; 3440 } 3441 } 3442 } /* d != end_match_2 */ 3443 3444 DEBUG_PRINT1 ("Accepting match.\n"); 3445 3446 /* If caller wants register contents data back, do it. */ 3447 if (regs && !bufp->no_sub) 3448 { 3449 /* Have the register data arrays been allocated? */ 3450 if (bufp->regs_allocated == REGS_UNALLOCATED) 3451 { /* No. So allocate them with malloc. We need one 3452 extra element beyond `num_regs' for the `-1' marker 3453 GNU code uses. */ 3454 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 3455 regs->start = TALLOC (regs->num_regs, regoff_t); 3456 regs->end = TALLOC (regs->num_regs, regoff_t); 3457 if (regs->start == NULL || regs->end == NULL) 3458 return -2; 3459 bufp->regs_allocated = REGS_REALLOCATE; 3460 } 3461 else if (bufp->regs_allocated == REGS_REALLOCATE) 3462 { /* Yes. If we need more elements than were already 3463 allocated, reallocate them. If we need fewer, just 3464 leave it alone. */ 3465 if (regs->num_regs < num_regs + 1) 3466 { 3467 regs->num_regs = num_regs + 1; 3468 RETALLOC (regs->start, regs->num_regs, regoff_t); 3469 RETALLOC (regs->end, regs->num_regs, regoff_t); 3470 if (regs->start == NULL || regs->end == NULL) 3471 return -2; 3472 } 3473 } 3474 else 3475 assert (bufp->regs_allocated == REGS_FIXED); 3476 3477 /* Convert the pointer data in `regstart' and `regend' to 3478 indices. Register zero has to be set differently, 3479 since we haven't kept track of any info for it. */ 3480 if (regs->num_regs > 0) 3481 { 3482 regs->start[0] = pos; 3483 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 3484 : d - string2 + size1); 3485 } 3486 3487 /* Go through the first `min (num_regs, regs->num_regs)' 3488 registers, since that is all we initialized. */ 3489 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) 3490 { 3491 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 3492 regs->start[mcnt] = regs->end[mcnt] = -1; 3493 else 3494 { 3495 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); 3496 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); 3497 } 3498 } 3499 3500 /* If the regs structure we return has more elements than 3501 were in the pattern, set the extra elements to -1. If 3502 we (re)allocated the registers, this is the case, 3503 because we always allocate enough to have at least one 3504 -1 at the end. */ 3505 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) 3506 regs->start[mcnt] = regs->end[mcnt] = -1; 3507 } /* regs && !bufp->no_sub */ 3508 3509 FREE_VARIABLES (); 3510 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 3511 nfailure_points_pushed, nfailure_points_popped, 3512 nfailure_points_pushed - nfailure_points_popped); 3513 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 3514 3515 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 3516 ? string1 3517 : string2 - size1); 3518 3519 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 3520 3521 return mcnt; 3522 } 3523 3524 /* Otherwise match next pattern command. */ 3525 #ifdef SWITCH_ENUM_BUG 3526 switch ((int) ((re_opcode_t) *p++)) 3527 #else 3528 switch ((re_opcode_t) *p++) 3529 #endif 3530 { 3531 /* Ignore these. Used to ignore the n of succeed_n's which 3532 currently have n == 0. */ 3533 case no_op: 3534 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 3535 break; 3536 3537 3538 /* Match the next n pattern characters exactly. The following 3539 byte in the pattern defines n, and the n bytes after that 3540 are the characters to match. */ 3541 case exactn: 3542 mcnt = *p++; 3543 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 3544 3545 /* This is written out as an if-else so we don't waste time 3546 testing `translate' inside the loop. */ 3547 if (translate) 3548 { 3549 do 3550 { 3551 PREFETCH (); 3552 if (translate[(unsigned char) *d++] != (char) *p++) 3553 goto fail; 3554 } 3555 while (--mcnt); 3556 } 3557 else 3558 { 3559 do 3560 { 3561 PREFETCH (); 3562 if (*d++ != (char) *p++) goto fail; 3563 } 3564 while (--mcnt); 3565 } 3566 SET_REGS_MATCHED (); 3567 break; 3568 3569 3570 /* Match any character except possibly a newline or a null. */ 3571 case anychar: 3572 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 3573 3574 PREFETCH (); 3575 3576 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 3577 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 3578 goto fail; 3579 3580 SET_REGS_MATCHED (); 3581 DEBUG_PRINT2 (" Matched `%d'.\n", *d); 3582 d++; 3583 break; 3584 3585 3586 case charset: 3587 case charset_not: 3588 { 3589 register unsigned char c; 3590 boolean not = (re_opcode_t) *(p - 1) == charset_not; 3591 3592 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 3593 3594 PREFETCH (); 3595 c = TRANSLATE (*d); /* The character to match. */ 3596 3597 /* Cast to `unsigned' instead of `unsigned char' in case the 3598 bit list is a full 32 bytes long. */ 3599 if (c < (unsigned) (*p * BYTEWIDTH) 3600 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 3601 not = !not; 3602 3603 p += 1 + *p; 3604 3605 if (!not) goto fail; 3606 3607 SET_REGS_MATCHED (); 3608 d++; 3609 break; 3610 } 3611 3612 3613 /* The beginning of a group is represented by start_memory. 3614 The arguments are the register number in the next byte, and the 3615 number of groups inner to this one in the next. The text 3616 matched within the group is recorded (in the internal 3617 registers data structure) under the register number. */ 3618 case start_memory: 3619 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); 3620 3621 /* Find out if this group can match the empty string. */ 3622 p1 = p; /* To send to group_match_null_string_p. */ 3623 3624 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 3625 REG_MATCH_NULL_STRING_P (reg_info[*p]) 3626 = group_match_null_string_p (&p1, pend, reg_info); 3627 3628 /* Save the position in the string where we were the last time 3629 we were at this open-group operator in case the group is 3630 operated upon by a repetition operator, e.g., with `(a*)*b' 3631 against `ab'; then we want to ignore where we are now in 3632 the string in case this attempt to match fails. */ 3633 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3634 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 3635 : regstart[*p]; 3636 DEBUG_PRINT2 (" old_regstart: %d\n", 3637 POINTER_TO_OFFSET (old_regstart[*p])); 3638 3639 regstart[*p] = d; 3640 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 3641 3642 IS_ACTIVE (reg_info[*p]) = 1; 3643 MATCHED_SOMETHING (reg_info[*p]) = 0; 3644 3645 /* This is the new highest active register. */ 3646 highest_active_reg = *p; 3647 3648 /* If nothing was active before, this is the new lowest active 3649 register. */ 3650 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 3651 lowest_active_reg = *p; 3652 3653 /* Move past the register number and inner group count. */ 3654 p += 2; 3655 break; 3656 3657 3658 /* The stop_memory opcode represents the end of a group. Its 3659 arguments are the same as start_memory's: the register 3660 number, and the number of inner groups. */ 3661 case stop_memory: 3662 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); 3663 3664 /* We need to save the string position the last time we were at 3665 this close-group operator in case the group is operated 3666 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 3667 against `aba'; then we want to ignore where we are now in 3668 the string in case this attempt to match fails. */ 3669 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3670 ? REG_UNSET (regend[*p]) ? d : regend[*p] 3671 : regend[*p]; 3672 DEBUG_PRINT2 (" old_regend: %d\n", 3673 POINTER_TO_OFFSET (old_regend[*p])); 3674 3675 regend[*p] = d; 3676 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 3677 3678 /* This register isn't active anymore. */ 3679 IS_ACTIVE (reg_info[*p]) = 0; 3680 3681 /* If this was the only register active, nothing is active 3682 anymore. */ 3683 if (lowest_active_reg == highest_active_reg) 3684 { 3685 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3686 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3687 } 3688 else 3689 { /* We must scan for the new highest active register, since 3690 it isn't necessarily one less than now: consider 3691 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 3692 new highest active register is 1. */ 3693 unsigned char r = *p - 1; 3694 while (r > 0 && !IS_ACTIVE (reg_info[r])) 3695 r--; 3696 3697 /* If we end up at register zero, that means that we saved 3698 the registers as the result of an `on_failure_jump', not 3699 a `start_memory', and we jumped to past the innermost 3700 `stop_memory'. For example, in ((.)*) we save 3701 registers 1 and 2 as a result of the *, but when we pop 3702 back to the second ), we are at the stop_memory 1. 3703 Thus, nothing is active. */ 3704 if (r == 0) 3705 { 3706 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3707 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3708 } 3709 else 3710 highest_active_reg = r; 3711 } 3712 3713 /* If just failed to match something this time around with a 3714 group that's operated on by a repetition operator, try to 3715 force exit from the ``loop'', and restore the register 3716 information for this group that we had before trying this 3717 last match. */ 3718 if ((!MATCHED_SOMETHING (reg_info[*p]) 3719 || (re_opcode_t) p[-3] == start_memory) 3720 && (p + 2) < pend) 3721 { 3722 boolean is_a_jump_n = false; 3723 3724 p1 = p + 2; 3725 mcnt = 0; 3726 switch ((re_opcode_t) *p1++) 3727 { 3728 case jump_n: 3729 is_a_jump_n = true; 3730 case pop_failure_jump: 3731 case maybe_pop_jump: 3732 case jump: 3733 case dummy_failure_jump: 3734 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3735 if (is_a_jump_n) 3736 p1 += 2; 3737 break; 3738 3739 default: 3740 /* do nothing */ ; 3741 } 3742 p1 += mcnt; 3743 3744 /* If the next operation is a jump backwards in the pattern 3745 to an on_failure_jump right before the start_memory 3746 corresponding to this stop_memory, exit from the loop 3747 by forcing a failure after pushing on the stack the 3748 on_failure_jump's jump in the pattern, and d. */ 3749 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 3750 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) 3751 { 3752 /* If this group ever matched anything, then restore 3753 what its registers were before trying this last 3754 failed match, e.g., with `(a*)*b' against `ab' for 3755 regstart[1], and, e.g., with `((a*)*(b*)*)*' 3756 against `aba' for regend[3]. 3757 3758 Also restore the registers for inner groups for, 3759 e.g., `((a*)(b*))*' against `aba' (register 3 would 3760 otherwise get trashed). */ 3761 3762 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 3763 { 3764 unsigned r; 3765 3766 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 3767 3768 /* Restore this and inner groups' (if any) registers. */ 3769 for (r = *p; r < *p + *(p + 1); r++) 3770 { 3771 regstart[r] = old_regstart[r]; 3772 3773 /* xx why this test? */ 3774 if ((int) old_regend[r] >= (int) regstart[r]) 3775 regend[r] = old_regend[r]; 3776 } 3777 } 3778 p1++; 3779 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3780 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 3781 3782 goto fail; 3783 } 3784 } 3785 3786 /* Move past the register number and the inner group count. */ 3787 p += 2; 3788 break; 3789 3790 3791 /* \<digit> has been turned into a `duplicate' command which is 3792 followed by the numeric value of <digit> as the register number. */ 3793 case duplicate: 3794 { 3795 register const char *d2, *dend2; 3796 int regno = *p++; /* Get which register to match against. */ 3797 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 3798 3799 /* Can't back reference a group which we've never matched. */ 3800 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 3801 goto fail; 3802 3803 /* Where in input to try to start matching. */ 3804 d2 = regstart[regno]; 3805 3806 /* Where to stop matching; if both the place to start and 3807 the place to stop matching are in the same string, then 3808 set to the place to stop, otherwise, for now have to use 3809 the end of the first string. */ 3810 3811 dend2 = ((FIRST_STRING_P (regstart[regno]) 3812 == FIRST_STRING_P (regend[regno])) 3813 ? regend[regno] : end_match_1); 3814 for (;;) 3815 { 3816 /* If necessary, advance to next segment in register 3817 contents. */ 3818 while (d2 == dend2) 3819 { 3820 if (dend2 == end_match_2) break; 3821 if (dend2 == regend[regno]) break; 3822 3823 /* End of string1 => advance to string2. */ 3824 d2 = string2; 3825 dend2 = regend[regno]; 3826 } 3827 /* At end of register contents => success */ 3828 if (d2 == dend2) break; 3829 3830 /* If necessary, advance to next segment in data. */ 3831 PREFETCH (); 3832 3833 /* How many characters left in this segment to match. */ 3834 mcnt = dend - d; 3835 3836 /* Want how many consecutive characters we can match in 3837 one shot, so, if necessary, adjust the count. */ 3838 if (mcnt > dend2 - d2) 3839 mcnt = dend2 - d2; 3840 3841 /* Compare that many; failure if mismatch, else move 3842 past them. */ 3843 if (translate 3844 ? bcmp_translate (d, d2, mcnt, translate) 3845 : bcmp (d, d2, mcnt)) 3846 goto fail; 3847 d += mcnt, d2 += mcnt; 3848 } 3849 } 3850 break; 3851 3852 3853 /* begline matches the empty string at the beginning of the string 3854 (unless `not_bol' is set in `bufp'), and, if 3855 `newline_anchor' is set, after newlines. */ 3856 case begline: 3857 DEBUG_PRINT1 ("EXECUTING begline.\n"); 3858 3859 if (AT_STRINGS_BEG (d)) 3860 { 3861 if (!bufp->not_bol) break; 3862 } 3863 else if (d[-1] == '\n' && bufp->newline_anchor) 3864 { 3865 break; 3866 } 3867 /* In all other cases, we fail. */ 3868 goto fail; 3869 3870 3871 /* endline is the dual of begline. */ 3872 case endline: 3873 DEBUG_PRINT1 ("EXECUTING endline.\n"); 3874 3875 if (AT_STRINGS_END (d)) 3876 { 3877 if (!bufp->not_eol) break; 3878 } 3879 3880 /* We have to ``prefetch'' the next character. */ 3881 else if ((d == end1 ? *string2 : *d) == '\n' 3882 && bufp->newline_anchor) 3883 { 3884 break; 3885 } 3886 goto fail; 3887 3888 3889 /* Match at the very beginning of the data. */ 3890 case begbuf: 3891 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 3892 if (AT_STRINGS_BEG (d)) 3893 break; 3894 goto fail; 3895 3896 3897 /* Match at the very end of the data. */ 3898 case endbuf: 3899 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 3900 if (AT_STRINGS_END (d)) 3901 break; 3902 goto fail; 3903 3904 3905 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 3906 pushes NULL as the value for the string on the stack. Then 3907 `pop_failure_point' will keep the current value for the 3908 string, instead of restoring it. To see why, consider 3909 matching `foo\nbar' against `.*\n'. The .* matches the foo; 3910 then the . fails against the \n. But the next thing we want 3911 to do is match the \n against the \n; if we restored the 3912 string value, we would be back at the foo. 3913 3914 Because this is used only in specific cases, we don't need to 3915 check all the things that `on_failure_jump' does, to make 3916 sure the right things get saved on the stack. Hence we don't 3917 share its code. The only reason to push anything on the 3918 stack at all is that otherwise we would have to change 3919 `anychar's code to do something besides goto fail in this 3920 case; that seems worse than this. */ 3921 case on_failure_keep_string_jump: 3922 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 3923 3924 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3925 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 3926 3927 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 3928 break; 3929 3930 3931 /* Uses of on_failure_jump: 3932 3933 Each alternative starts with an on_failure_jump that points 3934 to the beginning of the next alternative. Each alternative 3935 except the last ends with a jump that in effect jumps past 3936 the rest of the alternatives. (They really jump to the 3937 ending jump of the following alternative, because tensioning 3938 these jumps is a hassle.) 3939 3940 Repeats start with an on_failure_jump that points past both 3941 the repetition text and either the following jump or 3942 pop_failure_jump back to this on_failure_jump. */ 3943 case on_failure_jump: 3944 on_failure: 3945 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 3946 3947 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3948 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 3949 3950 /* If this on_failure_jump comes right before a group (i.e., 3951 the original * applied to a group), save the information 3952 for that group and all inner ones, so that if we fail back 3953 to this point, the group's information will be correct. 3954 For example, in \(a*\)*\1, we need the preceding group, 3955 and in \(\(a*\)b*\)\2, we need the inner group. */ 3956 3957 /* We can't use `p' to check ahead because we push 3958 a failure point to `p + mcnt' after we do this. */ 3959 p1 = p; 3960 3961 /* We need to skip no_op's before we look for the 3962 start_memory in case this on_failure_jump is happening as 3963 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 3964 against aba. */ 3965 while (p1 < pend && (re_opcode_t) *p1 == no_op) 3966 p1++; 3967 3968 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 3969 { 3970 /* We have a new highest active register now. This will 3971 get reset at the start_memory we are about to get to, 3972 but we will have saved all the registers relevant to 3973 this repetition op, as described above. */ 3974 highest_active_reg = *(p1 + 1) + *(p1 + 2); 3975 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 3976 lowest_active_reg = *(p1 + 1); 3977 } 3978 3979 DEBUG_PRINT1 (":\n"); 3980 PUSH_FAILURE_POINT (p + mcnt, d, -2); 3981 break; 3982 3983 3984 /* A smart repeat ends with `maybe_pop_jump'. 3985 We change it to either `pop_failure_jump' or `jump'. */ 3986 case maybe_pop_jump: 3987 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3988 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 3989 { 3990 register unsigned char *p2 = p; 3991 3992 /* Compare the beginning of the repeat with what in the 3993 pattern follows its end. If we can establish that there 3994 is nothing that they would both match, i.e., that we 3995 would have to backtrack because of (as in, e.g., `a*a') 3996 then we can change to pop_failure_jump, because we'll 3997 never have to backtrack. 3998 3999 This is not true in the case of alternatives: in 4000 `(a|ab)*' we do need to backtrack to the `ab' alternative 4001 (e.g., if the string was `ab'). But instead of trying to 4002 detect that here, the alternative has put on a dummy 4003 failure point which is what we will end up popping. */ 4004 4005 /* Skip over open/close-group commands. */ 4006 while (p2 + 2 < pend 4007 && ((re_opcode_t) *p2 == stop_memory 4008 || (re_opcode_t) *p2 == start_memory)) 4009 p2 += 3; /* Skip over args, too. */ 4010 4011 /* If we're at the end of the pattern, we can change. */ 4012 if (p2 == pend) 4013 { 4014 /* Consider what happens when matching ":\(.*\)" 4015 against ":/". I don't really understand this code 4016 yet. */ 4017 p[-3] = (unsigned char) pop_failure_jump; 4018 DEBUG_PRINT1 4019 (" End of pattern: change to `pop_failure_jump'.\n"); 4020 } 4021 4022 else if ((re_opcode_t) *p2 == exactn 4023 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 4024 { 4025 register unsigned char c 4026 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 4027 p1 = p + mcnt; 4028 4029 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 4030 to the `maybe_finalize_jump' of this case. Examine what 4031 follows. */ 4032 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) 4033 { 4034 p[-3] = (unsigned char) pop_failure_jump; 4035 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 4036 c, p1[5]); 4037 } 4038 4039 else if ((re_opcode_t) p1[3] == charset 4040 || (re_opcode_t) p1[3] == charset_not) 4041 { 4042 int not = (re_opcode_t) p1[3] == charset_not; 4043 4044 if (c < (unsigned char) (p1[4] * BYTEWIDTH) 4045 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4046 not = !not; 4047 4048 /* `not' is equal to 1 if c would match, which means 4049 that we can't change to pop_failure_jump. */ 4050 if (!not) 4051 { 4052 p[-3] = (unsigned char) pop_failure_jump; 4053 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4054 } 4055 } 4056 } 4057 } 4058 p -= 2; /* Point at relative address again. */ 4059 if ((re_opcode_t) p[-1] != pop_failure_jump) 4060 { 4061 p[-1] = (unsigned char) jump; 4062 DEBUG_PRINT1 (" Match => jump.\n"); 4063 goto unconditional_jump; 4064 } 4065 /* Note fall through. */ 4066 4067 4068 /* The end of a simple repeat has a pop_failure_jump back to 4069 its matching on_failure_jump, where the latter will push a 4070 failure point. The pop_failure_jump takes off failure 4071 points put on by this pop_failure_jump's matching 4072 on_failure_jump; we got through the pattern to here from the 4073 matching on_failure_jump, so didn't fail. */ 4074 case pop_failure_jump: 4075 { 4076 /* We need to pass separate storage for the lowest and 4077 highest registers, even though we don't care about the 4078 actual values. Otherwise, we will restore only one 4079 register from the stack, since lowest will == highest in 4080 `pop_failure_point'. */ 4081 unsigned dummy_low_reg, dummy_high_reg; 4082 unsigned char *pdummy; 4083 const char *sdummy; 4084 4085 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 4086 POP_FAILURE_POINT (sdummy, pdummy, 4087 dummy_low_reg, dummy_high_reg, 4088 reg_dummy, reg_dummy, reg_info_dummy); 4089 } 4090 /* Note fall through. */ 4091 4092 4093 /* Unconditionally jump (without popping any failure points). */ 4094 case jump: 4095 unconditional_jump: 4096 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 4097 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 4098 p += mcnt; /* Do the jump. */ 4099 DEBUG_PRINT2 ("(to 0x%x).\n", p); 4100 break; 4101 4102 4103 /* We need this opcode so we can detect where alternatives end 4104 in `group_match_null_string_p' et al. */ 4105 case jump_past_alt: 4106 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 4107 goto unconditional_jump; 4108 4109 4110 /* Normally, the on_failure_jump pushes a failure point, which 4111 then gets popped at pop_failure_jump. We will end up at 4112 pop_failure_jump, also, and with a pattern of, say, `a+', we 4113 are skipping over the on_failure_jump, so we have to push 4114 something meaningless for pop_failure_jump to pop. */ 4115 case dummy_failure_jump: 4116 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 4117 /* It doesn't matter what we push for the string here. What 4118 the code at `fail' tests is the value for the pattern. */ 4119 PUSH_FAILURE_POINT (0, 0, -2); 4120 goto unconditional_jump; 4121 4122 4123 /* At the end of an alternative, we need to push a dummy failure 4124 point in case we are followed by a `pop_failure_jump', because 4125 we don't want the failure point for the alternative to be 4126 popped. For example, matching `(a|ab)*' against `aab' 4127 requires that we match the `ab' alternative. */ 4128 case push_dummy_failure: 4129 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 4130 /* See comments just above at `dummy_failure_jump' about the 4131 two zeroes. */ 4132 PUSH_FAILURE_POINT (0, 0, -2); 4133 break; 4134 4135 /* Have to succeed matching what follows at least n times. 4136 After that, handle like `on_failure_jump'. */ 4137 case succeed_n: 4138 EXTRACT_NUMBER (mcnt, p + 2); 4139 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 4140 4141 assert (mcnt >= 0); 4142 /* Originally, this is how many times we HAVE to succeed. */ 4143 if (mcnt > 0) 4144 { 4145 mcnt--; 4146 p += 2; 4147 STORE_NUMBER_AND_INCR (p, mcnt); 4148 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); 4149 } 4150 else if (mcnt == 0) 4151 { 4152 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); 4153 p[2] = (unsigned char) no_op; 4154 p[3] = (unsigned char) no_op; 4155 goto on_failure; 4156 } 4157 break; 4158 4159 case jump_n: 4160 EXTRACT_NUMBER (mcnt, p + 2); 4161 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 4162 4163 /* Originally, this is how many times we CAN jump. */ 4164 if (mcnt) 4165 { 4166 mcnt--; 4167 STORE_NUMBER (p + 2, mcnt); 4168 goto unconditional_jump; 4169 } 4170 /* If don't have to jump any more, skip over the rest of command. */ 4171 else 4172 p += 4; 4173 break; 4174 4175 case set_number_at: 4176 { 4177 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 4178 4179 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4180 p1 = p + mcnt; 4181 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4182 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 4183 STORE_NUMBER (p1, mcnt); 4184 break; 4185 } 4186 4187 case wordbound: 4188 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 4189 if (AT_WORD_BOUNDARY (d)) 4190 break; 4191 goto fail; 4192 4193 case notwordbound: 4194 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 4195 if (AT_WORD_BOUNDARY (d)) 4196 goto fail; 4197 break; 4198 4199 case wordbeg: 4200 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 4201 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 4202 break; 4203 goto fail; 4204 4205 case wordend: 4206 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 4207 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 4208 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 4209 break; 4210 goto fail; 4211 4212 #ifdef emacs 4213 #ifdef emacs19 4214 case before_dot: 4215 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 4216 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 4217 goto fail; 4218 break; 4219 4220 case at_dot: 4221 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4222 if (PTR_CHAR_POS ((unsigned char *) d) != point) 4223 goto fail; 4224 break; 4225 4226 case after_dot: 4227 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 4228 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 4229 goto fail; 4230 break; 4231 #else /* not emacs19 */ 4232 case at_dot: 4233 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4234 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) 4235 goto fail; 4236 break; 4237 #endif /* not emacs19 */ 4238 4239 case syntaxspec: 4240 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 4241 mcnt = *p++; 4242 goto matchsyntax; 4243 4244 case wordchar: 4245 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 4246 mcnt = (int) Sword; 4247 matchsyntax: 4248 PREFETCH (); 4249 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) 4250 goto fail; 4251 SET_REGS_MATCHED (); 4252 break; 4253 4254 case notsyntaxspec: 4255 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 4256 mcnt = *p++; 4257 goto matchnotsyntax; 4258 4259 case notwordchar: 4260 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 4261 mcnt = (int) Sword; 4262 matchnotsyntax: 4263 PREFETCH (); 4264 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) 4265 goto fail; 4266 SET_REGS_MATCHED (); 4267 break; 4268 4269 #else /* not emacs */ 4270 case wordchar: 4271 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 4272 PREFETCH (); 4273 if (!WORDCHAR_P (d)) 4274 goto fail; 4275 SET_REGS_MATCHED (); 4276 d++; 4277 break; 4278 4279 case notwordchar: 4280 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 4281 PREFETCH (); 4282 if (WORDCHAR_P (d)) 4283 goto fail; 4284 SET_REGS_MATCHED (); 4285 d++; 4286 break; 4287 #endif /* not emacs */ 4288 4289 default: 4290 abort (); 4291 } 4292 continue; /* Successfully executed one pattern command; keep going. */ 4293 4294 4295 /* We goto here if a matching operation fails. */ 4296 fail: 4297 if (!FAIL_STACK_EMPTY ()) 4298 { /* A restart point is known. Restore to that state. */ 4299 DEBUG_PRINT1 ("\nFAIL:\n"); 4300 POP_FAILURE_POINT (d, p, 4301 lowest_active_reg, highest_active_reg, 4302 regstart, regend, reg_info); 4303 4304 /* If this failure point is a dummy, try the next one. */ 4305 if (!p) 4306 goto fail; 4307 4308 /* If we failed to the end of the pattern, don't examine *p. */ 4309 assert (p <= pend); 4310 if (p < pend) 4311 { 4312 boolean is_a_jump_n = false; 4313 4314 /* If failed to a backwards jump that's part of a repetition 4315 loop, need to pop this failure point and use the next one. */ 4316 switch ((re_opcode_t) *p) 4317 { 4318 case jump_n: 4319 is_a_jump_n = true; 4320 case maybe_pop_jump: 4321 case pop_failure_jump: 4322 case jump: 4323 p1 = p + 1; 4324 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4325 p1 += mcnt; 4326 4327 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 4328 || (!is_a_jump_n 4329 && (re_opcode_t) *p1 == on_failure_jump)) 4330 goto fail; 4331 break; 4332 default: 4333 /* do nothing */ ; 4334 } 4335 } 4336 4337 if (d >= string1 && d <= end1) 4338 dend = end_match_1; 4339 } 4340 else 4341 break; /* Matching at this starting point really fails. */ 4342 } /* for (;;) */ 4343 4344 if (best_regs_set) 4345 goto restore_best_regs; 4346 4347 FREE_VARIABLES (); 4348 4349 return -1; /* Failure to match. */ 4350 } /* re_match_2 */ 4351 4352 /* Subroutine definitions for re_match_2. */ 4353 4354 4355 /* We are passed P pointing to a register number after a start_memory. 4356 4357 Return true if the pattern up to the corresponding stop_memory can 4358 match the empty string, and false otherwise. 4359 4360 If we find the matching stop_memory, sets P to point to one past its number. 4361 Otherwise, sets P to an undefined byte less than or equal to END. 4362 4363 We don't handle duplicates properly (yet). */ 4364 4365 static boolean 4366 group_match_null_string_p (p, end, reg_info) 4367 unsigned char **p, *end; 4368 register_info_type *reg_info; 4369 { 4370 int mcnt; 4371 /* Point to after the args to the start_memory. */ 4372 unsigned char *p1 = *p + 2; 4373 4374 while (p1 < end) 4375 { 4376 /* Skip over opcodes that can match nothing, and return true or 4377 false, as appropriate, when we get to one that can't, or to the 4378 matching stop_memory. */ 4379 4380 switch ((re_opcode_t) *p1) 4381 { 4382 /* Could be either a loop or a series of alternatives. */ 4383 case on_failure_jump: 4384 p1++; 4385 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4386 4387 /* If the next operation is not a jump backwards in the 4388 pattern. */ 4389 4390 if (mcnt >= 0) 4391 { 4392 /* Go through the on_failure_jumps of the alternatives, 4393 seeing if any of the alternatives cannot match nothing. 4394 The last alternative starts with only a jump, 4395 whereas the rest start with on_failure_jump and end 4396 with a jump, e.g., here is the pattern for `a|b|c': 4397 4398 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 4399 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 4400 /exactn/1/c 4401 4402 So, we have to first go through the first (n-1) 4403 alternatives and then deal with the last one separately. */ 4404 4405 4406 /* Deal with the first (n-1) alternatives, which start 4407 with an on_failure_jump (see above) that jumps to right 4408 past a jump_past_alt. */ 4409 4410 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) 4411 { 4412 /* `mcnt' holds how many bytes long the alternative 4413 is, including the ending `jump_past_alt' and 4414 its number. */ 4415 4416 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, 4417 reg_info)) 4418 return false; 4419 4420 /* Move to right after this alternative, including the 4421 jump_past_alt. */ 4422 p1 += mcnt; 4423 4424 /* Break if it's the beginning of an n-th alternative 4425 that doesn't begin with an on_failure_jump. */ 4426 if ((re_opcode_t) *p1 != on_failure_jump) 4427 break; 4428 4429 /* Still have to check that it's not an n-th 4430 alternative that starts with an on_failure_jump. */ 4431 p1++; 4432 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4433 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) 4434 { 4435 /* Get to the beginning of the n-th alternative. */ 4436 p1 -= 3; 4437 break; 4438 } 4439 } 4440 4441 /* Deal with the last alternative: go back and get number 4442 of the `jump_past_alt' just before it. `mcnt' contains 4443 the length of the alternative. */ 4444 EXTRACT_NUMBER (mcnt, p1 - 2); 4445 4446 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 4447 return false; 4448 4449 p1 += mcnt; /* Get past the n-th alternative. */ 4450 } /* if mcnt > 0 */ 4451 break; 4452 4453 4454 case stop_memory: 4455 assert (p1[1] == **p); 4456 *p = p1 + 2; 4457 return true; 4458 4459 4460 default: 4461 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4462 return false; 4463 } 4464 } /* while p1 < end */ 4465 4466 return false; 4467 } /* group_match_null_string_p */ 4468 4469 4470 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: 4471 It expects P to be the first byte of a single alternative and END one 4472 byte past the last. The alternative can contain groups. */ 4473 4474 static boolean 4475 alt_match_null_string_p (p, end, reg_info) 4476 unsigned char *p, *end; 4477 register_info_type *reg_info; 4478 { 4479 int mcnt; 4480 unsigned char *p1 = p; 4481 4482 while (p1 < end) 4483 { 4484 /* Skip over opcodes that can match nothing, and break when we get 4485 to one that can't. */ 4486 4487 switch ((re_opcode_t) *p1) 4488 { 4489 /* It's a loop. */ 4490 case on_failure_jump: 4491 p1++; 4492 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4493 p1 += mcnt; 4494 break; 4495 4496 default: 4497 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4498 return false; 4499 } 4500 } /* while p1 < end */ 4501 4502 return true; 4503 } /* alt_match_null_string_p */ 4504 4505 4506 /* Deals with the ops common to group_match_null_string_p and 4507 alt_match_null_string_p. 4508 4509 Sets P to one after the op and its arguments, if any. */ 4510 4511 static boolean 4512 common_op_match_null_string_p (p, end, reg_info) 4513 unsigned char **p, *end; 4514 register_info_type *reg_info; 4515 { 4516 int mcnt; 4517 boolean ret; 4518 int reg_no; 4519 unsigned char *p1 = *p; 4520 4521 switch ((re_opcode_t) *p1++) 4522 { 4523 case no_op: 4524 case begline: 4525 case endline: 4526 case begbuf: 4527 case endbuf: 4528 case wordbeg: 4529 case wordend: 4530 case wordbound: 4531 case notwordbound: 4532 #ifdef emacs 4533 case before_dot: 4534 case at_dot: 4535 case after_dot: 4536 #endif 4537 break; 4538 4539 case start_memory: 4540 reg_no = *p1; 4541 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 4542 ret = group_match_null_string_p (&p1, end, reg_info); 4543 4544 /* Have to set this here in case we're checking a group which 4545 contains a group and a back reference to it. */ 4546 4547 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 4548 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 4549 4550 if (!ret) 4551 return false; 4552 break; 4553 4554 /* If this is an optimized succeed_n for zero times, make the jump. */ 4555 case jump: 4556 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4557 if (mcnt >= 0) 4558 p1 += mcnt; 4559 else 4560 return false; 4561 break; 4562 4563 case succeed_n: 4564 /* Get to the number of times to succeed. */ 4565 p1 += 2; 4566 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4567 4568 if (mcnt == 0) 4569 { 4570 p1 -= 4; 4571 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4572 p1 += mcnt; 4573 } 4574 else 4575 return false; 4576 break; 4577 4578 case duplicate: 4579 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 4580 return false; 4581 break; 4582 4583 case set_number_at: 4584 p1 += 4; 4585 4586 default: 4587 /* All other opcodes mean we cannot match the empty string. */ 4588 return false; 4589 } 4590 4591 *p = p1; 4592 return true; 4593 } /* common_op_match_null_string_p */ 4594 4595 4596 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 4597 bytes; nonzero otherwise. */ 4598 4599 static int 4600 bcmp_translate (s1, s2, len, translate) 4601 unsigned char *s1, *s2; 4602 register int len; 4603 char *translate; 4604 { 4605 register unsigned char *p1 = s1, *p2 = s2; 4606 while (len) 4607 { 4608 if (translate[*p1++] != translate[*p2++]) return 1; 4609 len--; 4610 } 4611 return 0; 4612 } 4613 4614 /* Entry points for GNU code. */ 4615 4616 /* re_compile_pattern is the GNU regular expression compiler: it 4617 compiles PATTERN (of length SIZE) and puts the result in BUFP. 4618 Returns 0 if the pattern was valid, otherwise an error string. 4619 4620 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 4621 are set in BUFP on entry. 4622 4623 We call regex_compile to do the actual compilation. */ 4624 4625 const char * 4626 re_compile_pattern (pattern, length, bufp) 4627 const char *pattern; 4628 int length; 4629 struct re_pattern_buffer *bufp; 4630 { 4631 reg_errcode_t ret; 4632 4633 /* GNU code is written to assume at least RE_NREGS registers will be set 4634 (and at least one extra will be -1). */ 4635 bufp->regs_allocated = REGS_UNALLOCATED; 4636 4637 /* And GNU code determines whether or not to get register information 4638 by passing null for the REGS argument to re_match, etc., not by 4639 setting no_sub. */ 4640 bufp->no_sub = 0; 4641 4642 /* Match anchors at newline. */ 4643 bufp->newline_anchor = 1; 4644 4645 ret = regex_compile (pattern, length, re_syntax_options, bufp); 4646 4647 return re_error_msg[(int) ret]; 4648 } 4649 4650 /* Entry points compatible with 4.2 BSD regex library. We don't define 4651 them if this is an Emacs or POSIX compilation. */ 4652 4653 #if !defined (emacs) 4654 4655 /* BSD has one and only one pattern buffer. */ 4656 static struct re_pattern_buffer re_comp_buf; 4657 4658 char * 4659 re_comp (s) 4660 const char *s; 4661 { 4662 reg_errcode_t ret; 4663 4664 if (!s) 4665 { 4666 if (!re_comp_buf.buffer) 4667 return "No previous regular expression"; 4668 return 0; 4669 } 4670 4671 if (!re_comp_buf.buffer) 4672 { 4673 re_comp_buf.buffer = (unsigned char *) malloc (200); 4674 if (re_comp_buf.buffer == NULL) 4675 return "Memory exhausted"; 4676 re_comp_buf.allocated = 200; 4677 4678 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 4679 if (re_comp_buf.fastmap == NULL) 4680 return "Memory exhausted"; 4681 } 4682 4683 /* Since `re_exec' always passes NULL for the `regs' argument, we 4684 don't need to initialize the pattern buffer fields which affect it. */ 4685 4686 /* Match anchors at newlines. */ 4687 re_comp_buf.newline_anchor = 1; 4688 4689 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 4690 4691 /* Yes, we're discarding `const' here. */ 4692 return (char *) re_error_msg[(int) ret]; 4693 } 4694 4695 4696 int 4697 re_exec (s) 4698 const char *s; 4699 { 4700 const int len = strlen (s); 4701 return 4702 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 4703 } 4704 #endif /* not emacs and not _POSIX_SOURCE */ 4705 4706 /* POSIX.2 functions. Don't define these for Emacs. */ 4707 4708 #ifndef emacs 4709 4710 /* regcomp takes a regular expression as a string and compiles it. 4711 4712 PREG is a regex_t *. We do not expect any fields to be initialized, 4713 since POSIX says we shouldn't. Thus, we set 4714 4715 `buffer' to the compiled pattern; 4716 `used' to the length of the compiled pattern; 4717 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 4718 REG_EXTENDED bit in CFLAGS is set; otherwise, to 4719 RE_SYNTAX_POSIX_BASIC; 4720 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 4721 `fastmap' and `fastmap_accurate' to zero; 4722 `re_nsub' to the number of subexpressions in PATTERN. 4723 4724 PATTERN is the address of the pattern string. 4725 4726 CFLAGS is a series of bits which affect compilation. 4727 4728 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 4729 use POSIX basic syntax. 4730 4731 If REG_NEWLINE is set, then . and [^...] don't match newline. 4732 Also, regexec will try a match beginning after every newline. 4733 4734 If REG_ICASE is set, then we considers upper- and lowercase 4735 versions of letters to be equivalent when matching. 4736 4737 If REG_NOSUB is set, then when PREG is passed to regexec, that 4738 routine will report only success or failure, and nothing about the 4739 registers. 4740 4741 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 4742 the return codes and their meanings.) */ 4743 4744 int 4745 regcomp (preg, pattern, cflags) 4746 regex_t *preg; 4747 const char *pattern; 4748 int cflags; 4749 { 4750 reg_errcode_t ret; 4751 unsigned syntax 4752 = (cflags & REG_EXTENDED) ? 4753 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 4754 4755 /* regex_compile will allocate the space for the compiled pattern. */ 4756 preg->buffer = 0; 4757 preg->allocated = 0; 4758 4759 /* Don't bother to use a fastmap when searching. This simplifies the 4760 REG_NEWLINE case: if we used a fastmap, we'd have to put all the 4761 characters after newlines into the fastmap. This way, we just try 4762 every character. */ 4763 preg->fastmap = 0; 4764 4765 if (cflags & REG_ICASE) 4766 { 4767 unsigned i; 4768 4769 preg->translate = (char *) malloc (CHAR_SET_SIZE); 4770 if (preg->translate == NULL) 4771 return (int) REG_ESPACE; 4772 4773 /* Map uppercase characters to corresponding lowercase ones. */ 4774 for (i = 0; i < CHAR_SET_SIZE; i++) 4775 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; 4776 } 4777 else 4778 preg->translate = NULL; 4779 4780 /* If REG_NEWLINE is set, newlines are treated differently. */ 4781 if (cflags & REG_NEWLINE) 4782 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 4783 syntax &= ~RE_DOT_NEWLINE; 4784 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 4785 /* It also changes the matching behavior. */ 4786 preg->newline_anchor = 1; 4787 } 4788 else 4789 preg->newline_anchor = 0; 4790 4791 preg->no_sub = !!(cflags & REG_NOSUB); 4792 4793 /* POSIX says a null character in the pattern terminates it, so we 4794 can use strlen here in compiling the pattern. */ 4795 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 4796 4797 /* POSIX doesn't distinguish between an unmatched open-group and an 4798 unmatched close-group: both are REG_EPAREN. */ 4799 if (ret == REG_ERPAREN) ret = REG_EPAREN; 4800 4801 return (int) ret; 4802 } 4803 4804 4805 /* regexec searches for a given pattern, specified by PREG, in the 4806 string STRING. 4807 4808 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 4809 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 4810 least NMATCH elements, and we set them to the offsets of the 4811 corresponding matched substrings. 4812 4813 EFLAGS specifies `execution flags' which affect matching: if 4814 REG_NOTBOL is set, then ^ does not match at the beginning of the 4815 string; if REG_NOTEOL is set, then $ does not match at the end. 4816 4817 We return 0 if we find a match and REG_NOMATCH if not. */ 4818 4819 int 4820 regexec (preg, string, nmatch, pmatch, eflags) 4821 const regex_t *preg; 4822 const char *string; 4823 size_t nmatch; 4824 regmatch_t pmatch[]; 4825 int eflags; 4826 { 4827 int ret; 4828 struct re_registers regs; 4829 regex_t private_preg; 4830 int len = strlen (string); 4831 boolean want_reg_info = !preg->no_sub && nmatch > 0; 4832 4833 private_preg = *preg; 4834 4835 private_preg.not_bol = !!(eflags & REG_NOTBOL); 4836 private_preg.not_eol = !!(eflags & REG_NOTEOL); 4837 4838 /* The user has told us exactly how many registers to return 4839 information about, via `nmatch'. We have to pass that on to the 4840 matching routines. */ 4841 private_preg.regs_allocated = REGS_FIXED; 4842 4843 if (want_reg_info) 4844 { 4845 regs.num_regs = nmatch; 4846 regs.start = TALLOC (nmatch, regoff_t); 4847 regs.end = TALLOC (nmatch, regoff_t); 4848 if (regs.start == NULL || regs.end == NULL) 4849 return (int) REG_NOMATCH; 4850 } 4851 4852 /* Perform the searching operation. */ 4853 ret = re_search (&private_preg, string, len, 4854 /* start: */ 0, /* range: */ len, 4855 want_reg_info ? ®s : (struct re_registers *) 0); 4856 4857 /* Copy the register information to the POSIX structure. */ 4858 if (want_reg_info) 4859 { 4860 if (ret >= 0) 4861 { 4862 unsigned r; 4863 4864 for (r = 0; r < nmatch; r++) 4865 { 4866 pmatch[r].rm_so = regs.start[r]; 4867 pmatch[r].rm_eo = regs.end[r]; 4868 } 4869 } 4870 4871 /* If we needed the temporary register info, free the space now. */ 4872 free (regs.start); 4873 free (regs.end); 4874 } 4875 4876 /* We want zero return to mean success, unlike `re_search'. */ 4877 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 4878 } 4879 4880 4881 /* Returns a message corresponding to an error code, ERRCODE, returned 4882 from either regcomp or regexec. We don't use PREG here. */ 4883 4884 size_t 4885 regerror (errcode, preg, errbuf, errbuf_size) 4886 int errcode; 4887 const regex_t *preg; 4888 char *errbuf; 4889 size_t errbuf_size; 4890 { 4891 const char *msg; 4892 size_t msg_size; 4893 4894 if (errcode < 0 4895 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) 4896 /* Only error codes returned by the rest of the code should be passed 4897 to this routine. If we are given anything else, or if other regex 4898 code generates an invalid error code, then the program has a bug. 4899 Dump core so we can fix it. */ 4900 abort (); 4901 4902 msg = re_error_msg[errcode]; 4903 4904 /* POSIX doesn't require that we do anything in this case, but why 4905 not be nice. */ 4906 if (! msg) 4907 msg = "Success"; 4908 4909 msg_size = strlen (msg) + 1; /* Includes the null. */ 4910 4911 if (errbuf_size != 0) 4912 { 4913 if (msg_size > errbuf_size) 4914 { 4915 strncpy (errbuf, msg, errbuf_size - 1); 4916 errbuf[errbuf_size - 1] = 0; 4917 } 4918 else 4919 strcpy (errbuf, msg); 4920 } 4921 4922 return msg_size; 4923 } 4924 4925 4926 /* Free dynamically allocated space used by PREG. */ 4927 4928 void 4929 regfree (preg) 4930 regex_t *preg; 4931 { 4932 if (preg->buffer != NULL) 4933 free (preg->buffer); 4934 preg->buffer = NULL; 4935 4936 preg->allocated = 0; 4937 preg->used = 0; 4938 4939 if (preg->fastmap != NULL) 4940 free (preg->fastmap); 4941 preg->fastmap = NULL; 4942 preg->fastmap_accurate = 0; 4943 4944 if (preg->translate != NULL) 4945 free (preg->translate); 4946 preg->translate = NULL; 4947 } 4948 4949 #endif /* not emacs */ 4950 4951 /* 4952 Local variables: 4953 make-backup-files: t 4954 version-control: t 4955 trim-versions-without-asking: nil 4956 End: 4957 */ 4958