1 /* Extended regular expression matching and search library, version 2 0.12. (Implements POSIX draft P10003.2/D11.2, except for 3 internationalization features.) 4 5 Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc. 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. */ 16 17 /* Trying to define this in the makefile would get hairy, unless we can 18 more gracefully do it for NT, OS/2, unix, etc. */ 19 #define REGEX_MALLOC 1 20 21 /* AIX requires this to be the first thing in the file. */ 22 #if defined (_AIX) && !defined (REGEX_MALLOC) 23 #pragma alloca 24 #endif 25 26 #define _GNU_SOURCE 27 28 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 29 #include <sys/types.h> 30 31 #ifdef HAVE_CONFIG_H 32 #include "config.h" 33 #endif 34 35 /* The `emacs' switch turns on certain matching commands 36 that make sense only in Emacs. */ 37 #ifdef emacs 38 39 #include "lisp.h" 40 #include "buffer.h" 41 42 /* Make syntax table lookup grant data in gl_state. */ 43 #define SYNTAX_ENTRY_VIA_PROPERTY 44 45 #include "syntax.h" 46 #include "charset.h" 47 #include "category.h" 48 49 #define malloc xmalloc 50 #define free xfree 51 52 #else /* not emacs */ 53 54 /* We used to test for `BSTRING' here, but only GCC and Emacs define 55 `BSTRING', as far as I know, and neither of them use this code. */ 56 #if HAVE_STRING_H || STDC_HEADERS 57 #include <string.h> 58 #ifndef bcmp 59 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 60 #endif 61 #ifndef bcopy 62 #define bcopy(s, d, n) memcpy ((d), (s), (n)) 63 #endif 64 #ifndef bzero 65 #define bzero(s, n) memset ((s), 0, (n)) 66 #endif 67 #else 68 #include <strings.h> 69 #endif 70 71 #ifdef STDC_HEADERS 72 #include <stdlib.h> 73 #else 74 char *malloc (); 75 char *realloc (); 76 #endif 77 78 79 /* Define the syntax stuff for \<, \>, etc. */ 80 81 /* This must be nonzero for the wordchar and notwordchar pattern 82 commands in re_match_2. */ 83 #ifndef Sword 84 #define Sword 1 85 #endif 86 87 #ifdef SYNTAX_TABLE 88 89 extern char *re_syntax_table; 90 91 #else /* not SYNTAX_TABLE */ 92 93 /* How many characters in the character set. */ 94 #define CHAR_SET_SIZE 256 95 96 static char re_syntax_table[CHAR_SET_SIZE]; 97 98 static void 99 init_syntax_once () 100 { 101 register int c; 102 static int done = 0; 103 104 if (done) 105 return; 106 107 bzero (re_syntax_table, sizeof re_syntax_table); 108 109 for (c = 'a'; c <= 'z'; c++) 110 re_syntax_table[c] = Sword; 111 112 for (c = 'A'; c <= 'Z'; c++) 113 re_syntax_table[c] = Sword; 114 115 for (c = '0'; c <= '9'; c++) 116 re_syntax_table[c] = Sword; 117 118 re_syntax_table['_'] = Sword; 119 120 done = 1; 121 } 122 123 #endif /* not SYNTAX_TABLE */ 124 125 #define SYNTAX(c) re_syntax_table[c] 126 127 #endif /* not emacs */ 128 129 /* Get the interface, including the syntax bits. */ 130 #include "regex.h" 131 132 /* isalpha etc. are used for the character classes. */ 133 #include <ctype.h> 134 135 #ifndef isascii 136 #define isascii(c) 1 137 #endif 138 139 #ifdef isblank 140 #define ISBLANK(c) (isascii (c) && isblank (c)) 141 #else 142 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 143 #endif 144 #ifdef isgraph 145 #define ISGRAPH(c) (isascii (c) && isgraph (c)) 146 #else 147 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) 148 #endif 149 150 #define ISPRINT(c) (isascii (c) && isprint (c)) 151 #define ISDIGIT(c) (isascii (c) && isdigit (c)) 152 #define ISALNUM(c) (isascii (c) && isalnum (c)) 153 #define ISALPHA(c) (isascii (c) && isalpha (c)) 154 #define ISCNTRL(c) (isascii (c) && iscntrl (c)) 155 #define ISLOWER(c) (isascii (c) && islower (c)) 156 #define ISPUNCT(c) (isascii (c) && ispunct (c)) 157 #define ISSPACE(c) (isascii (c) && isspace (c)) 158 #define ISUPPER(c) (isascii (c) && isupper (c)) 159 #define ISXDIGIT(c) (isascii (c) && isxdigit (c)) 160 161 #ifndef NULL 162 #define NULL 0 163 #endif 164 165 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 166 since ours (we hope) works properly with all combinations of 167 machines, compilers, `char' and `unsigned char' argument types. 168 (Per Bothner suggested the basic approach.) */ 169 #undef SIGN_EXTEND_CHAR 170 #if __STDC__ 171 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 172 #else /* not __STDC__ */ 173 /* As in Harbison and Steele. */ 174 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 175 #endif 176 177 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 178 use `alloca' instead of `malloc'. This is because using malloc in 179 re_search* or re_match* could cause memory leaks when C-g is used in 180 Emacs; also, malloc is slower and causes storage fragmentation. On 181 the other hand, malloc is more portable, and easier to debug. 182 183 Because we sometimes use alloca, some routines have to be macros, 184 not functions -- `alloca'-allocated space disappears at the end of the 185 function it is called in. */ 186 187 #ifdef REGEX_MALLOC 188 189 #define REGEX_ALLOCATE malloc 190 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 191 192 #else /* not REGEX_MALLOC */ 193 194 /* Emacs already defines alloca, sometimes. */ 195 #ifndef alloca 196 197 /* Make alloca work the best possible way. */ 198 #ifdef __GNUC__ 199 #define alloca __builtin_alloca 200 #else /* not __GNUC__ */ 201 #if HAVE_ALLOCA_H 202 #include <alloca.h> 203 #else /* not __GNUC__ or HAVE_ALLOCA_H */ 204 #ifndef _AIX /* Already did AIX, up at the top. */ 205 char *alloca (); 206 #endif /* not _AIX */ 207 #endif /* not HAVE_ALLOCA_H */ 208 #endif /* not __GNUC__ */ 209 210 #endif /* not alloca */ 211 212 #define REGEX_ALLOCATE alloca 213 214 /* Assumes a `char *destination' variable. */ 215 #define REGEX_REALLOCATE(source, osize, nsize) \ 216 (destination = (char *) alloca (nsize), \ 217 bcopy (source, destination, osize), \ 218 destination) 219 220 #endif /* not REGEX_MALLOC */ 221 222 223 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 224 `string1' or just past its end. This works if PTR is NULL, which is 225 a good thing. */ 226 #define FIRST_STRING_P(ptr) \ 227 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 228 229 /* (Re)Allocate N items of type T using malloc, or fail. */ 230 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 231 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 232 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 233 234 #define BYTEWIDTH 8 /* In bits. */ 235 236 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 237 238 #undef MAX 239 #undef MIN 240 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 241 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 242 243 typedef char boolean; 244 #define false 0 245 #define true 1 246 247 /* These are the command codes that appear in compiled regular 248 expressions. Some opcodes are followed by argument bytes. A 249 command code can specify any interpretation whatsoever for its 250 arguments. Zero bytes may appear in the compiled regular expression. 251 252 The value of `exactn' is needed in search.c (search_buffer) in Emacs. 253 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of 254 `exactn' we use here must also be 1. */ 255 256 typedef enum 257 { 258 no_op = 0, 259 260 /* Followed by one byte giving n, then by n literal bytes. */ 261 exactn = 1, 262 263 /* Matches any (more or less) character. */ 264 anychar, 265 266 /* Matches any one char belonging to specified set. First 267 following byte is number of bitmap bytes. Then come bytes 268 for a bitmap saying which chars are in. Bits in each byte 269 are ordered low-bit-first. A character is in the set if its 270 bit is 1. A character too large to have a bit in the map is 271 automatically not in the set. */ 272 charset, 273 274 /* Same parameters as charset, but match any character that is 275 not one of those specified. */ 276 charset_not, 277 278 /* Start remembering the text that is matched, for storing in a 279 register. Followed by one byte with the register number, in 280 the range 0 to one less than the pattern buffer's re_nsub 281 field. Then followed by one byte with the number of groups 282 inner to this one. (This last has to be part of the 283 start_memory only because we need it in the on_failure_jump 284 of re_match_2.) */ 285 start_memory, 286 287 /* Stop remembering the text that is matched and store it in a 288 memory register. Followed by one byte with the register 289 number, in the range 0 to one less than `re_nsub' in the 290 pattern buffer, and one byte with the number of inner groups, 291 just like `start_memory'. (We need the number of inner 292 groups here because we don't have any easy way of finding the 293 corresponding start_memory when we're at a stop_memory.) */ 294 stop_memory, 295 296 /* Match a duplicate of something remembered. Followed by one 297 byte containing the register number. */ 298 duplicate, 299 300 /* Fail unless at beginning of line. */ 301 begline, 302 303 /* Fail unless at end of line. */ 304 endline, 305 306 /* Succeeds if at beginning of buffer (if emacs) or at beginning 307 of string to be matched (if not). */ 308 begbuf, 309 310 /* Analogously, for end of buffer/string. */ 311 endbuf, 312 313 /* Followed by two byte relative address to which to jump. */ 314 jump, 315 316 /* Same as jump, but marks the end of an alternative. */ 317 jump_past_alt, 318 319 /* Followed by two-byte relative address of place to resume at 320 in case of failure. */ 321 on_failure_jump, 322 323 /* Like on_failure_jump, but pushes a placeholder instead of the 324 current string position when executed. */ 325 on_failure_keep_string_jump, 326 327 /* Throw away latest failure point and then jump to following 328 two-byte relative address. */ 329 pop_failure_jump, 330 331 /* Change to pop_failure_jump if know won't have to backtrack to 332 match; otherwise change to jump. This is used to jump 333 back to the beginning of a repeat. If what follows this jump 334 clearly won't match what the repeat does, such that we can be 335 sure that there is no use backtracking out of repetitions 336 already matched, then we change it to a pop_failure_jump. 337 Followed by two-byte address. */ 338 maybe_pop_jump, 339 340 /* Jump to following two-byte address, and push a dummy failure 341 point. This failure point will be thrown away if an attempt 342 is made to use it for a failure. A `+' construct makes this 343 before the first repeat. Also used as an intermediary kind 344 of jump when compiling an alternative. */ 345 dummy_failure_jump, 346 347 /* Push a dummy failure point and continue. Used at the end of 348 alternatives. */ 349 push_dummy_failure, 350 351 /* Followed by two-byte relative address and two-byte number n. 352 After matching N times, jump to the address upon failure. */ 353 succeed_n, 354 355 /* Followed by two-byte relative address, and two-byte number n. 356 Jump to the address N times, then fail. */ 357 jump_n, 358 359 /* Set the following two-byte relative address to the 360 subsequent two-byte number. The address *includes* the two 361 bytes of number. */ 362 set_number_at, 363 364 wordchar, /* Matches any word-constituent character. */ 365 notwordchar, /* Matches any char that is not a word-constituent. */ 366 367 wordbeg, /* Succeeds if at word beginning. */ 368 wordend, /* Succeeds if at word end. */ 369 370 wordbound, /* Succeeds if at a word boundary. */ 371 notwordbound /* Succeeds if not at a word boundary. */ 372 373 #ifdef emacs 374 ,before_dot, /* Succeeds if before point. */ 375 at_dot, /* Succeeds if at point. */ 376 after_dot, /* Succeeds if after point. */ 377 378 /* Matches any character whose syntax is specified. Followed by 379 a byte which contains a syntax code, e.g., Sword. */ 380 syntaxspec, 381 382 /* Matches any character whose syntax is not that specified. */ 383 notsyntaxspec 384 #endif /* emacs */ 385 } re_opcode_t; 386 387 /* Common operations on the compiled pattern. */ 388 389 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 390 391 #define STORE_NUMBER(destination, number) \ 392 do { \ 393 (destination)[0] = (number) & 0377; \ 394 (destination)[1] = (number) >> 8; \ 395 } while (0) 396 397 /* Same as STORE_NUMBER, except increment DESTINATION to 398 the byte after where the number is stored. Therefore, DESTINATION 399 must be an lvalue. */ 400 401 #define STORE_NUMBER_AND_INCR(destination, number) \ 402 do { \ 403 STORE_NUMBER (destination, number); \ 404 (destination) += 2; \ 405 } while (0) 406 407 /* Put into DESTINATION a number stored in two contiguous bytes starting 408 at SOURCE. */ 409 410 #define EXTRACT_NUMBER(destination, source) \ 411 do { \ 412 (destination) = *(source) & 0377; \ 413 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 414 } while (0) 415 416 #ifdef DEBUG 417 static void 418 extract_number (dest, source) 419 int *dest; 420 unsigned char *source; 421 { 422 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 423 *dest = *source & 0377; 424 *dest += temp << 8; 425 } 426 427 #ifndef EXTRACT_MACROS /* To debug the macros. */ 428 #undef EXTRACT_NUMBER 429 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 430 #endif /* not EXTRACT_MACROS */ 431 432 #endif /* DEBUG */ 433 434 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 435 SOURCE must be an lvalue. */ 436 437 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 438 do { \ 439 EXTRACT_NUMBER (destination, source); \ 440 (source) += 2; \ 441 } while (0) 442 443 #ifdef DEBUG 444 static void 445 extract_number_and_incr (destination, source) 446 int *destination; 447 unsigned char **source; 448 { 449 extract_number (destination, *source); 450 *source += 2; 451 } 452 453 #ifndef EXTRACT_MACROS 454 #undef EXTRACT_NUMBER_AND_INCR 455 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ 456 extract_number_and_incr (&dest, &src) 457 #endif /* not EXTRACT_MACROS */ 458 459 #endif /* DEBUG */ 460 461 /* If DEBUG is defined, Regex prints many voluminous messages about what 462 it is doing (if the variable `debug' is nonzero). If linked with the 463 main program in `iregex.c', you can enter patterns and strings 464 interactively. And if linked with the main program in `main.c' and 465 the other test files, you can run the already-written tests. */ 466 467 #ifdef DEBUG 468 469 /* We use standard I/O for debugging. */ 470 #include <stdio.h> 471 472 /* It is useful to test things that ``must'' be true when debugging. */ 473 #include <assert.h> 474 475 static int debug = 0; 476 477 #define DEBUG_STATEMENT(e) e 478 #define DEBUG_PRINT1(x) if (debug) printf (x) 479 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 480 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 481 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 482 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 483 if (debug) print_partial_compiled_pattern (s, e) 484 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 485 if (debug) print_double_string (w, s1, sz1, s2, sz2) 486 487 488 extern void printchar (); 489 490 /* Print the fastmap in human-readable form. */ 491 492 void 493 print_fastmap (fastmap) 494 char *fastmap; 495 { 496 unsigned was_a_range = 0; 497 unsigned i = 0; 498 499 while (i < (1 << BYTEWIDTH)) 500 { 501 if (fastmap[i++]) 502 { 503 was_a_range = 0; 504 printchar (i - 1); 505 while (i < (1 << BYTEWIDTH) && fastmap[i]) 506 { 507 was_a_range = 1; 508 i++; 509 } 510 if (was_a_range) 511 { 512 printf ("-"); 513 printchar (i - 1); 514 } 515 } 516 } 517 putchar ('\n'); 518 } 519 520 521 /* Print a compiled pattern string in human-readable form, starting at 522 the START pointer into it and ending just before the pointer END. */ 523 524 void 525 print_partial_compiled_pattern (start, end) 526 unsigned char *start; 527 unsigned char *end; 528 { 529 int mcnt, mcnt2; 530 unsigned char *p = start; 531 unsigned char *pend = end; 532 533 if (start == NULL) 534 { 535 printf ("(null)\n"); 536 return; 537 } 538 539 /* Loop over pattern commands. */ 540 while (p < pend) 541 { 542 switch ((re_opcode_t) *p++) 543 { 544 case no_op: 545 printf ("/no_op"); 546 break; 547 548 case exactn: 549 mcnt = *p++; 550 printf ("/exactn/%d", mcnt); 551 do 552 { 553 putchar ('/'); 554 printchar (*p++); 555 } 556 while (--mcnt); 557 break; 558 559 case start_memory: 560 mcnt = *p++; 561 printf ("/start_memory/%d/%d", mcnt, *p++); 562 break; 563 564 case stop_memory: 565 mcnt = *p++; 566 printf ("/stop_memory/%d/%d", mcnt, *p++); 567 break; 568 569 case duplicate: 570 printf ("/duplicate/%d", *p++); 571 break; 572 573 case anychar: 574 printf ("/anychar"); 575 break; 576 577 case charset: 578 case charset_not: 579 { 580 register int c; 581 582 printf ("/charset%s", 583 (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); 584 585 assert (p + *p < pend); 586 587 for (c = 0; c < *p; c++) 588 { 589 unsigned bit; 590 unsigned char map_byte = p[1 + c]; 591 592 putchar ('/'); 593 594 for (bit = 0; bit < BYTEWIDTH; bit++) 595 if (map_byte & (1 << bit)) 596 printchar (c * BYTEWIDTH + bit); 597 } 598 p += 1 + *p; 599 break; 600 } 601 602 case begline: 603 printf ("/begline"); 604 break; 605 606 case endline: 607 printf ("/endline"); 608 break; 609 610 case on_failure_jump: 611 extract_number_and_incr (&mcnt, &p); 612 printf ("/on_failure_jump/0/%d", mcnt); 613 break; 614 615 case on_failure_keep_string_jump: 616 extract_number_and_incr (&mcnt, &p); 617 printf ("/on_failure_keep_string_jump/0/%d", mcnt); 618 break; 619 620 case dummy_failure_jump: 621 extract_number_and_incr (&mcnt, &p); 622 printf ("/dummy_failure_jump/0/%d", mcnt); 623 break; 624 625 case push_dummy_failure: 626 printf ("/push_dummy_failure"); 627 break; 628 629 case maybe_pop_jump: 630 extract_number_and_incr (&mcnt, &p); 631 printf ("/maybe_pop_jump/0/%d", mcnt); 632 break; 633 634 case pop_failure_jump: 635 extract_number_and_incr (&mcnt, &p); 636 printf ("/pop_failure_jump/0/%d", mcnt); 637 break; 638 639 case jump_past_alt: 640 extract_number_and_incr (&mcnt, &p); 641 printf ("/jump_past_alt/0/%d", mcnt); 642 break; 643 644 case jump: 645 extract_number_and_incr (&mcnt, &p); 646 printf ("/jump/0/%d", mcnt); 647 break; 648 649 case succeed_n: 650 extract_number_and_incr (&mcnt, &p); 651 extract_number_and_incr (&mcnt2, &p); 652 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); 653 break; 654 655 case jump_n: 656 extract_number_and_incr (&mcnt, &p); 657 extract_number_and_incr (&mcnt2, &p); 658 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); 659 break; 660 661 case set_number_at: 662 extract_number_and_incr (&mcnt, &p); 663 extract_number_and_incr (&mcnt2, &p); 664 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); 665 break; 666 667 case wordbound: 668 printf ("/wordbound"); 669 break; 670 671 case notwordbound: 672 printf ("/notwordbound"); 673 break; 674 675 case wordbeg: 676 printf ("/wordbeg"); 677 break; 678 679 case wordend: 680 printf ("/wordend"); 681 682 #ifdef emacs 683 case before_dot: 684 printf ("/before_dot"); 685 break; 686 687 case at_dot: 688 printf ("/at_dot"); 689 break; 690 691 case after_dot: 692 printf ("/after_dot"); 693 break; 694 695 case syntaxspec: 696 printf ("/syntaxspec"); 697 mcnt = *p++; 698 printf ("/%d", mcnt); 699 break; 700 701 case notsyntaxspec: 702 printf ("/notsyntaxspec"); 703 mcnt = *p++; 704 printf ("/%d", mcnt); 705 break; 706 #endif /* emacs */ 707 708 case wordchar: 709 printf ("/wordchar"); 710 break; 711 712 case notwordchar: 713 printf ("/notwordchar"); 714 break; 715 716 case begbuf: 717 printf ("/begbuf"); 718 break; 719 720 case endbuf: 721 printf ("/endbuf"); 722 break; 723 724 default: 725 printf ("?%d", *(p-1)); 726 } 727 } 728 printf ("/\n"); 729 } 730 731 732 void 733 print_compiled_pattern (bufp) 734 struct re_pattern_buffer *bufp; 735 { 736 unsigned char *buffer = bufp->buffer; 737 738 print_partial_compiled_pattern (buffer, buffer + bufp->used); 739 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); 740 741 if (bufp->fastmap_accurate && bufp->fastmap) 742 { 743 printf ("fastmap: "); 744 print_fastmap (bufp->fastmap); 745 } 746 747 printf ("re_nsub: %d\t", bufp->re_nsub); 748 printf ("regs_alloc: %d\t", bufp->regs_allocated); 749 printf ("can_be_null: %d\t", bufp->can_be_null); 750 printf ("newline_anchor: %d\n", bufp->newline_anchor); 751 printf ("no_sub: %d\t", bufp->no_sub); 752 printf ("not_bol: %d\t", bufp->not_bol); 753 printf ("not_eol: %d\t", bufp->not_eol); 754 printf ("syntax: %d\n", bufp->syntax); 755 /* Perhaps we should print the translate table? */ 756 } 757 758 759 void 760 print_double_string (where, string1, size1, string2, size2) 761 const char *where; 762 const char *string1; 763 const char *string2; 764 int size1; 765 int size2; 766 { 767 unsigned this_char; 768 769 if (where == NULL) 770 printf ("(null)"); 771 else 772 { 773 if (FIRST_STRING_P (where)) 774 { 775 for (this_char = where - string1; this_char < size1; this_char++) 776 printchar (string1[this_char]); 777 778 where = string2; 779 } 780 781 for (this_char = where - string2; this_char < size2; this_char++) 782 printchar (string2[this_char]); 783 } 784 } 785 786 #else /* not DEBUG */ 787 788 #undef assert 789 #define assert(e) 790 791 #define DEBUG_STATEMENT(e) 792 #define DEBUG_PRINT1(x) 793 #define DEBUG_PRINT2(x1, x2) 794 #define DEBUG_PRINT3(x1, x2, x3) 795 #define DEBUG_PRINT4(x1, x2, x3, x4) 796 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 797 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 798 799 #endif /* not DEBUG */ 800 801 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 802 also be assigned to arbitrarily: each pattern buffer stores its own 803 syntax, so it can be changed between regex compilations. */ 804 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; 805 806 807 /* Specify the precise syntax of regexps for compilation. This provides 808 for compatibility for various utilities which historically have 809 different, incompatible syntaxes. 810 811 The argument SYNTAX is a bit mask comprised of the various bits 812 defined in regex.h. We return the old syntax. */ 813 814 reg_syntax_t 815 re_set_syntax (syntax) 816 reg_syntax_t syntax; 817 { 818 reg_syntax_t ret = re_syntax_options; 819 820 re_syntax_options = syntax; 821 return ret; 822 } 823 824 /* This table gives an error message for each of the error codes listed 825 in regex.h. Obviously the order here has to be same as there. */ 826 827 static const char *re_error_msg[] = 828 { NULL, /* REG_NOERROR */ 829 "No match", /* REG_NOMATCH */ 830 "Invalid regular expression", /* REG_BADPAT */ 831 "Invalid collation character", /* REG_ECOLLATE */ 832 "Invalid character class name", /* REG_ECTYPE */ 833 "Trailing backslash", /* REG_EESCAPE */ 834 "Invalid back reference", /* REG_ESUBREG */ 835 "Unmatched [ or [^", /* REG_EBRACK */ 836 "Unmatched ( or \\(", /* REG_EPAREN */ 837 "Unmatched \\{", /* REG_EBRACE */ 838 "Invalid content of \\{\\}", /* REG_BADBR */ 839 "Invalid range end", /* REG_ERANGE */ 840 "Memory exhausted", /* REG_ESPACE */ 841 "Invalid preceding regular expression", /* REG_BADRPT */ 842 "Premature end of regular expression", /* REG_EEND */ 843 "Regular expression too big", /* REG_ESIZE */ 844 "Unmatched ) or \\)", /* REG_ERPAREN */ 845 }; 846 847 /* Subroutine declarations and macros for regex_compile. */ 848 849 static void store_op1 (), store_op2 (); 850 static void insert_op1 (), insert_op2 (); 851 static boolean at_begline_loc_p (), at_endline_loc_p (); 852 static boolean group_in_compile_stack (); 853 static reg_errcode_t compile_range (); 854 855 /* Fetch the next character in the uncompiled pattern---translating it 856 if necessary. Also cast from a signed character in the constant 857 string passed to us by the user to an unsigned char that we can use 858 as an array index (in, e.g., `translate'). */ 859 #define PATFETCH(c) \ 860 do {if (p == pend) return REG_EEND; \ 861 c = (unsigned char) *p++; \ 862 if (translate) c = translate[c]; \ 863 } while (0) 864 865 /* Fetch the next character in the uncompiled pattern, with no 866 translation. */ 867 #define PATFETCH_RAW(c) \ 868 do {if (p == pend) return REG_EEND; \ 869 c = (unsigned char) *p++; \ 870 } while (0) 871 872 /* Go backwards one character in the pattern. */ 873 #define PATUNFETCH p-- 874 875 876 /* If `translate' is non-null, return translate[D], else just D. We 877 cast the subscript to translate because some data is declared as 878 `char *', to avoid warnings when a string constant is passed. But 879 when we use a character as a subscript we must make it unsigned. */ 880 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) 881 882 883 /* Macros for outputting the compiled pattern into `buffer'. */ 884 885 /* If the buffer isn't allocated when it comes in, use this. */ 886 #define INIT_BUF_SIZE 32 887 888 /* Make sure we have at least N more bytes of space in buffer. */ 889 #define GET_BUFFER_SPACE(n) \ 890 while (b - bufp->buffer + (n) > bufp->allocated) \ 891 EXTEND_BUFFER () 892 893 /* Make sure we have one more byte of buffer space and then add C to it. */ 894 #define BUF_PUSH(c) \ 895 do { \ 896 GET_BUFFER_SPACE (1); \ 897 *b++ = (unsigned char) (c); \ 898 } while (0) 899 900 901 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 902 #define BUF_PUSH_2(c1, c2) \ 903 do { \ 904 GET_BUFFER_SPACE (2); \ 905 *b++ = (unsigned char) (c1); \ 906 *b++ = (unsigned char) (c2); \ 907 } while (0) 908 909 910 /* As with BUF_PUSH_2, except for three bytes. */ 911 #define BUF_PUSH_3(c1, c2, c3) \ 912 do { \ 913 GET_BUFFER_SPACE (3); \ 914 *b++ = (unsigned char) (c1); \ 915 *b++ = (unsigned char) (c2); \ 916 *b++ = (unsigned char) (c3); \ 917 } while (0) 918 919 920 /* Store a jump with opcode OP at LOC to location TO. We store a 921 relative address offset by the three bytes the jump itself occupies. */ 922 #define STORE_JUMP(op, loc, to) \ 923 store_op1 (op, loc, (to) - (loc) - 3) 924 925 /* Likewise, for a two-argument jump. */ 926 #define STORE_JUMP2(op, loc, to, arg) \ 927 store_op2 (op, loc, (to) - (loc) - 3, arg) 928 929 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 930 #define INSERT_JUMP(op, loc, to) \ 931 insert_op1 (op, loc, (to) - (loc) - 3, b) 932 933 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 934 #define INSERT_JUMP2(op, loc, to, arg) \ 935 insert_op2 (op, loc, (to) - (loc) - 3, arg, b) 936 937 938 /* This is not an arbitrary limit: the arguments which represent offsets 939 into the pattern are two bytes long. So if 2^16 bytes turns out to 940 be too small, many things would have to change. */ 941 #define MAX_BUF_SIZE (1L << 16) 942 943 944 /* Extend the buffer by twice its current size via realloc and 945 reset the pointers that pointed into the old block to point to the 946 correct places in the new one. If extending the buffer results in it 947 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 948 #define EXTEND_BUFFER() \ 949 do { \ 950 unsigned char *old_buffer = bufp->buffer; \ 951 if (bufp->allocated == MAX_BUF_SIZE) \ 952 return REG_ESIZE; \ 953 bufp->allocated <<= 1; \ 954 if (bufp->allocated > MAX_BUF_SIZE) \ 955 bufp->allocated = MAX_BUF_SIZE; \ 956 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ 957 if (bufp->buffer == NULL) \ 958 return REG_ESPACE; \ 959 /* If the buffer moved, move all the pointers into it. */ \ 960 if (old_buffer != bufp->buffer) \ 961 { \ 962 b = (b - old_buffer) + bufp->buffer; \ 963 begalt = (begalt - old_buffer) + bufp->buffer; \ 964 if (fixup_alt_jump) \ 965 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ 966 if (laststart) \ 967 laststart = (laststart - old_buffer) + bufp->buffer; \ 968 if (pending_exact) \ 969 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 970 } \ 971 } while (0) 972 973 974 /* Since we have one byte reserved for the register number argument to 975 {start,stop}_memory, the maximum number of groups we can report 976 things about is what fits in that byte. */ 977 #define MAX_REGNUM 255 978 979 /* But patterns can have more than `MAX_REGNUM' registers. We just 980 ignore the excess. */ 981 typedef unsigned regnum_t; 982 983 984 /* Macros for the compile stack. */ 985 986 /* Since offsets can go either forwards or backwards, this type needs to 987 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 988 typedef int pattern_offset_t; 989 990 typedef struct 991 { 992 pattern_offset_t begalt_offset; 993 pattern_offset_t fixup_alt_jump; 994 pattern_offset_t inner_group_offset; 995 pattern_offset_t laststart_offset; 996 regnum_t regnum; 997 } compile_stack_elt_t; 998 999 1000 typedef struct 1001 { 1002 compile_stack_elt_t *stack; 1003 unsigned size; 1004 unsigned avail; /* Offset of next open position. */ 1005 } compile_stack_type; 1006 1007 1008 #define INIT_COMPILE_STACK_SIZE 32 1009 1010 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 1011 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 1012 1013 /* The next available element. */ 1014 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 1015 1016 1017 /* Set the bit for character C in a list. */ 1018 #define SET_LIST_BIT(c) \ 1019 (b[((unsigned char) (c)) / BYTEWIDTH] \ 1020 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 1021 1022 1023 /* Get the next unsigned number in the uncompiled pattern. */ 1024 #define GET_UNSIGNED_NUMBER(num) \ 1025 { if (p != pend) \ 1026 { \ 1027 PATFETCH (c); \ 1028 while (ISDIGIT (c)) \ 1029 { \ 1030 if (num < 0) \ 1031 num = 0; \ 1032 num = num * 10 + c - '0'; \ 1033 if (p == pend) \ 1034 break; \ 1035 PATFETCH (c); \ 1036 } \ 1037 } \ 1038 } 1039 1040 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 1041 1042 #define IS_CHAR_CLASS(string) \ 1043 (STREQ (string, "alpha") || STREQ (string, "upper") \ 1044 || STREQ (string, "lower") || STREQ (string, "digit") \ 1045 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 1046 || STREQ (string, "space") || STREQ (string, "print") \ 1047 || STREQ (string, "punct") || STREQ (string, "graph") \ 1048 || STREQ (string, "cntrl") || STREQ (string, "blank")) 1049 1050 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 1051 Returns one of error codes defined in `regex.h', or zero for success. 1052 1053 Assumes the `allocated' (and perhaps `buffer') and `translate' 1054 fields are set in BUFP on entry. 1055 1056 If it succeeds, results are put in BUFP (if it returns an error, the 1057 contents of BUFP are undefined): 1058 `buffer' is the compiled pattern; 1059 `syntax' is set to SYNTAX; 1060 `used' is set to the length of the compiled pattern; 1061 `fastmap_accurate' is zero; 1062 `re_nsub' is the number of subexpressions in PATTERN; 1063 `not_bol' and `not_eol' are zero; 1064 1065 The `fastmap' and `newline_anchor' fields are neither 1066 examined nor set. */ 1067 1068 static reg_errcode_t 1069 regex_compile (pattern, size, syntax, bufp) 1070 const char *pattern; 1071 int size; 1072 reg_syntax_t syntax; 1073 struct re_pattern_buffer *bufp; 1074 { 1075 /* We fetch characters from PATTERN here. Even though PATTERN is 1076 `char *' (i.e., signed), we declare these variables as unsigned, so 1077 they can be reliably used as array indices. */ 1078 register unsigned char c, c1; 1079 1080 /* A random temporary spot in PATTERN. */ 1081 const char *p1; 1082 1083 /* Points to the end of the buffer, where we should append. */ 1084 register unsigned char *b; 1085 1086 /* Keeps track of unclosed groups. */ 1087 compile_stack_type compile_stack; 1088 1089 /* Points to the current (ending) position in the pattern. */ 1090 const char *p = pattern; 1091 const char *pend = pattern + size; 1092 1093 /* How to translate the characters in the pattern. */ 1094 char *translate = bufp->translate; 1095 1096 /* Address of the count-byte of the most recently inserted `exactn' 1097 command. This makes it possible to tell if a new exact-match 1098 character can be added to that command or if the character requires 1099 a new `exactn' command. */ 1100 unsigned char *pending_exact = 0; 1101 1102 /* Address of start of the most recently finished expression. 1103 This tells, e.g., postfix * where to find the start of its 1104 operand. Reset at the beginning of groups and alternatives. */ 1105 unsigned char *laststart = 0; 1106 1107 /* Address of beginning of regexp, or inside of last group. */ 1108 unsigned char *begalt; 1109 1110 /* Place in the uncompiled pattern (i.e., the {) to 1111 which to go back if the interval is invalid. */ 1112 const char *beg_interval; 1113 1114 /* Address of the place where a forward jump should go to the end of 1115 the containing expression. Each alternative of an `or' -- except the 1116 last -- ends with a forward jump of this sort. */ 1117 unsigned char *fixup_alt_jump = 0; 1118 1119 /* Counts open-groups as they are encountered. Remembered for the 1120 matching close-group on the compile stack, so the same register 1121 number is put in the stop_memory as the start_memory. */ 1122 regnum_t regnum = 0; 1123 1124 #ifdef DEBUG 1125 DEBUG_PRINT1 ("\nCompiling pattern: "); 1126 if (debug) 1127 { 1128 unsigned debug_count; 1129 1130 for (debug_count = 0; debug_count < size; debug_count++) 1131 printchar (pattern[debug_count]); 1132 putchar ('\n'); 1133 } 1134 #endif /* DEBUG */ 1135 1136 /* Initialize the compile stack. */ 1137 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 1138 if (compile_stack.stack == NULL) 1139 return REG_ESPACE; 1140 1141 compile_stack.size = INIT_COMPILE_STACK_SIZE; 1142 compile_stack.avail = 0; 1143 1144 /* Initialize the pattern buffer. */ 1145 bufp->syntax = syntax; 1146 bufp->fastmap_accurate = 0; 1147 bufp->not_bol = bufp->not_eol = 0; 1148 1149 /* Set `used' to zero, so that if we return an error, the pattern 1150 printer (for debugging) will think there's no pattern. We reset it 1151 at the end. */ 1152 bufp->used = 0; 1153 1154 /* Always count groups, whether or not bufp->no_sub is set. */ 1155 bufp->re_nsub = 0; 1156 1157 #if !defined (emacs) && !defined (SYNTAX_TABLE) 1158 /* Initialize the syntax table. */ 1159 init_syntax_once (); 1160 #endif 1161 1162 if (bufp->allocated == 0) 1163 { 1164 if (bufp->buffer) 1165 { /* If zero allocated, but buffer is non-null, try to realloc 1166 enough space. This loses if buffer's address is bogus, but 1167 that is the user's responsibility. */ 1168 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); 1169 } 1170 else 1171 { /* Caller did not allocate a buffer. Do it for them. */ 1172 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); 1173 } 1174 if (!bufp->buffer) return REG_ESPACE; 1175 1176 bufp->allocated = INIT_BUF_SIZE; 1177 } 1178 1179 begalt = b = bufp->buffer; 1180 1181 /* Loop through the uncompiled pattern until we're at the end. */ 1182 while (p != pend) 1183 { 1184 PATFETCH (c); 1185 1186 switch (c) 1187 { 1188 case '^': 1189 { 1190 if ( /* If at start of pattern, it's an operator. */ 1191 p == pattern + 1 1192 /* If context independent, it's an operator. */ 1193 || syntax & RE_CONTEXT_INDEP_ANCHORS 1194 /* Otherwise, depends on what's come before. */ 1195 || at_begline_loc_p (pattern, p, syntax)) 1196 BUF_PUSH (begline); 1197 else 1198 goto normal_char; 1199 } 1200 break; 1201 1202 1203 case '$': 1204 { 1205 if ( /* If at end of pattern, it's an operator. */ 1206 p == pend 1207 /* If context independent, it's an operator. */ 1208 || syntax & RE_CONTEXT_INDEP_ANCHORS 1209 /* Otherwise, depends on what's next. */ 1210 || at_endline_loc_p (p, pend, syntax)) 1211 BUF_PUSH (endline); 1212 else 1213 goto normal_char; 1214 } 1215 break; 1216 1217 1218 case '+': 1219 case '?': 1220 if ((syntax & RE_BK_PLUS_QM) 1221 || (syntax & RE_LIMITED_OPS)) 1222 goto normal_char; 1223 handle_plus: 1224 case '*': 1225 /* If there is no previous pattern... */ 1226 if (!laststart) 1227 { 1228 if (syntax & RE_CONTEXT_INVALID_OPS) 1229 return REG_BADRPT; 1230 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 1231 goto normal_char; 1232 } 1233 1234 { 1235 /* Are we optimizing this jump? */ 1236 boolean keep_string_p = false; 1237 1238 /* 1 means zero (many) matches is allowed. */ 1239 char zero_times_ok = 0, many_times_ok = 0; 1240 1241 /* If there is a sequence of repetition chars, collapse it 1242 down to just one (the right one). We can't combine 1243 interval operators with these because of, e.g., `a{2}*', 1244 which should only match an even number of `a's. */ 1245 1246 for (;;) 1247 { 1248 zero_times_ok |= c != '+'; 1249 many_times_ok |= c != '?'; 1250 1251 if (p == pend) 1252 break; 1253 1254 PATFETCH (c); 1255 1256 if (c == '*' 1257 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 1258 ; 1259 1260 else if (syntax & RE_BK_PLUS_QM && c == '\\') 1261 { 1262 if (p == pend) return REG_EESCAPE; 1263 1264 PATFETCH (c1); 1265 if (!(c1 == '+' || c1 == '?')) 1266 { 1267 PATUNFETCH; 1268 PATUNFETCH; 1269 break; 1270 } 1271 1272 c = c1; 1273 } 1274 else 1275 { 1276 PATUNFETCH; 1277 break; 1278 } 1279 1280 /* If we get here, we found another repeat character. */ 1281 } 1282 1283 /* Star, etc. applied to an empty pattern is equivalent 1284 to an empty pattern. */ 1285 if (!laststart) 1286 break; 1287 1288 /* Now we know whether or not zero matches is allowed 1289 and also whether or not two or more matches is allowed. */ 1290 if (many_times_ok) 1291 { /* More than one repetition is allowed, so put in at the 1292 end a backward relative jump from `b' to before the next 1293 jump we're going to put in below (which jumps from 1294 laststart to after this jump). 1295 1296 But if we are at the `*' in the exact sequence `.*\n', 1297 insert an unconditional jump backwards to the ., 1298 instead of the beginning of the loop. This way we only 1299 push a failure point once, instead of every time 1300 through the loop. */ 1301 assert (p - 1 > pattern); 1302 1303 /* Allocate the space for the jump. */ 1304 GET_BUFFER_SPACE (3); 1305 1306 /* We know we are not at the first character of the pattern, 1307 because laststart was nonzero. And we've already 1308 incremented `p', by the way, to be the character after 1309 the `*'. Do we have to do something analogous here 1310 for null bytes, because of RE_DOT_NOT_NULL? */ 1311 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 1312 && zero_times_ok 1313 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 1314 && !(syntax & RE_DOT_NEWLINE)) 1315 { /* We have .*\n. */ 1316 STORE_JUMP (jump, b, laststart); 1317 keep_string_p = true; 1318 } 1319 else 1320 /* Anything else. */ 1321 STORE_JUMP (maybe_pop_jump, b, laststart - 3); 1322 1323 /* We've added more stuff to the buffer. */ 1324 b += 3; 1325 } 1326 1327 /* On failure, jump from laststart to b + 3, which will be the 1328 end of the buffer after this jump is inserted. */ 1329 GET_BUFFER_SPACE (3); 1330 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 1331 : on_failure_jump, 1332 laststart, b + 3); 1333 pending_exact = 0; 1334 b += 3; 1335 1336 if (!zero_times_ok) 1337 { 1338 /* At least one repetition is required, so insert a 1339 `dummy_failure_jump' before the initial 1340 `on_failure_jump' instruction of the loop. This 1341 effects a skip over that instruction the first time 1342 we hit that loop. */ 1343 GET_BUFFER_SPACE (3); 1344 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); 1345 b += 3; 1346 } 1347 } 1348 break; 1349 1350 1351 case '.': 1352 laststart = b; 1353 BUF_PUSH (anychar); 1354 break; 1355 1356 1357 case '[': 1358 { 1359 boolean had_char_class = false; 1360 1361 if (p == pend) return REG_EBRACK; 1362 1363 /* Ensure that we have enough space to push a charset: the 1364 opcode, the length count, and the bitset; 34 bytes in all. */ 1365 GET_BUFFER_SPACE (34); 1366 1367 laststart = b; 1368 1369 /* We test `*p == '^' twice, instead of using an if 1370 statement, so we only need one BUF_PUSH. */ 1371 BUF_PUSH (*p == '^' ? charset_not : charset); 1372 if (*p == '^') 1373 p++; 1374 1375 /* Remember the first position in the bracket expression. */ 1376 p1 = p; 1377 1378 /* Push the number of bytes in the bitmap. */ 1379 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 1380 1381 /* Clear the whole map. */ 1382 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 1383 1384 /* charset_not matches newline according to a syntax bit. */ 1385 if ((re_opcode_t) b[-2] == charset_not 1386 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 1387 SET_LIST_BIT ('\n'); 1388 1389 /* Read in characters and ranges, setting map bits. */ 1390 for (;;) 1391 { 1392 if (p == pend) return REG_EBRACK; 1393 1394 PATFETCH (c); 1395 1396 /* \ might escape characters inside [...] and [^...]. */ 1397 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 1398 { 1399 if (p == pend) return REG_EESCAPE; 1400 1401 PATFETCH (c1); 1402 SET_LIST_BIT (c1); 1403 continue; 1404 } 1405 1406 /* Could be the end of the bracket expression. If it's 1407 not (i.e., when the bracket expression is `[]' so 1408 far), the ']' character bit gets set way below. */ 1409 if (c == ']' && p != p1 + 1) 1410 break; 1411 1412 /* Look ahead to see if it's a range when the last thing 1413 was a character class. */ 1414 if (had_char_class && c == '-' && *p != ']') 1415 return REG_ERANGE; 1416 1417 /* Look ahead to see if it's a range when the last thing 1418 was a character: if this is a hyphen not at the 1419 beginning or the end of a list, then it's the range 1420 operator. */ 1421 if (c == '-' 1422 && !(p - 2 >= pattern && p[-2] == '[') 1423 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 1424 && *p != ']') 1425 { 1426 reg_errcode_t ret 1427 = compile_range (&p, pend, translate, syntax, b); 1428 if (ret != REG_NOERROR) return ret; 1429 } 1430 1431 else if (p[0] == '-' && p[1] != ']') 1432 { /* This handles ranges made up of characters only. */ 1433 reg_errcode_t ret; 1434 1435 /* Move past the `-'. */ 1436 PATFETCH (c1); 1437 1438 ret = compile_range (&p, pend, translate, syntax, b); 1439 if (ret != REG_NOERROR) return ret; 1440 } 1441 1442 /* See if we're at the beginning of a possible character 1443 class. */ 1444 1445 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 1446 { /* Leave room for the null. */ 1447 char str[CHAR_CLASS_MAX_LENGTH + 1]; 1448 1449 PATFETCH (c); 1450 c1 = 0; 1451 1452 /* If pattern is `[[:'. */ 1453 if (p == pend) return REG_EBRACK; 1454 1455 for (;;) 1456 { 1457 PATFETCH (c); 1458 if (c == ':' || c == ']' || p == pend 1459 || c1 == CHAR_CLASS_MAX_LENGTH) 1460 break; 1461 str[c1++] = c; 1462 } 1463 str[c1] = '\0'; 1464 1465 /* If isn't a word bracketed by `[:' and:`]': 1466 undo the ending character, the letters, and leave 1467 the leading `:' and `[' (but set bits for them). */ 1468 if (c == ':' && *p == ']') 1469 { 1470 int ch; 1471 boolean is_alnum = STREQ (str, "alnum"); 1472 boolean is_alpha = STREQ (str, "alpha"); 1473 boolean is_blank = STREQ (str, "blank"); 1474 boolean is_cntrl = STREQ (str, "cntrl"); 1475 boolean is_digit = STREQ (str, "digit"); 1476 boolean is_graph = STREQ (str, "graph"); 1477 boolean is_lower = STREQ (str, "lower"); 1478 boolean is_print = STREQ (str, "print"); 1479 boolean is_punct = STREQ (str, "punct"); 1480 boolean is_space = STREQ (str, "space"); 1481 boolean is_upper = STREQ (str, "upper"); 1482 boolean is_xdigit = STREQ (str, "xdigit"); 1483 1484 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; 1485 1486 /* Throw away the ] at the end of the character 1487 class. */ 1488 PATFETCH (c); 1489 1490 if (p == pend) return REG_EBRACK; 1491 1492 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 1493 { 1494 if ( (is_alnum && ISALNUM (ch)) 1495 || (is_alpha && ISALPHA (ch)) 1496 || (is_blank && ISBLANK (ch)) 1497 || (is_cntrl && ISCNTRL (ch)) 1498 || (is_digit && ISDIGIT (ch)) 1499 || (is_graph && ISGRAPH (ch)) 1500 || (is_lower && ISLOWER (ch)) 1501 || (is_print && ISPRINT (ch)) 1502 || (is_punct && ISPUNCT (ch)) 1503 || (is_space && ISSPACE (ch)) 1504 || (is_upper && ISUPPER (ch)) 1505 || (is_xdigit && ISXDIGIT (ch))) 1506 SET_LIST_BIT (ch); 1507 } 1508 had_char_class = true; 1509 } 1510 else 1511 { 1512 c1++; 1513 while (c1--) 1514 PATUNFETCH; 1515 SET_LIST_BIT ('['); 1516 SET_LIST_BIT (':'); 1517 had_char_class = false; 1518 } 1519 } 1520 else 1521 { 1522 had_char_class = false; 1523 SET_LIST_BIT (c); 1524 } 1525 } 1526 1527 /* Discard any (non)matching list bytes that are all 0 at the 1528 end of the map. Decrease the map-length byte too. */ 1529 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 1530 b[-1]--; 1531 b += b[-1]; 1532 } 1533 break; 1534 1535 1536 case '(': 1537 if (syntax & RE_NO_BK_PARENS) 1538 goto handle_open; 1539 else 1540 goto normal_char; 1541 1542 1543 case ')': 1544 if (syntax & RE_NO_BK_PARENS) 1545 goto handle_close; 1546 else 1547 goto normal_char; 1548 1549 1550 case '\n': 1551 if (syntax & RE_NEWLINE_ALT) 1552 goto handle_alt; 1553 else 1554 goto normal_char; 1555 1556 1557 case '|': 1558 if (syntax & RE_NO_BK_VBAR) 1559 goto handle_alt; 1560 else 1561 goto normal_char; 1562 1563 1564 case '{': 1565 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 1566 goto handle_interval; 1567 else 1568 goto normal_char; 1569 1570 1571 case '\\': 1572 if (p == pend) return REG_EESCAPE; 1573 1574 /* Do not translate the character after the \, so that we can 1575 distinguish, e.g., \B from \b, even if we normally would 1576 translate, e.g., B to b. */ 1577 PATFETCH_RAW (c); 1578 1579 switch (c) 1580 { 1581 case '(': 1582 if (syntax & RE_NO_BK_PARENS) 1583 goto normal_backslash; 1584 1585 handle_open: 1586 bufp->re_nsub++; 1587 regnum++; 1588 1589 if (COMPILE_STACK_FULL) 1590 { 1591 RETALLOC (compile_stack.stack, compile_stack.size << 1, 1592 compile_stack_elt_t); 1593 if (compile_stack.stack == NULL) return REG_ESPACE; 1594 1595 compile_stack.size <<= 1; 1596 } 1597 1598 /* These are the values to restore when we hit end of this 1599 group. They are all relative offsets, so that if the 1600 whole pattern moves because of realloc, they will still 1601 be valid. */ 1602 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; 1603 COMPILE_STACK_TOP.fixup_alt_jump 1604 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 1605 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 1606 COMPILE_STACK_TOP.regnum = regnum; 1607 1608 /* We will eventually replace the 0 with the number of 1609 groups inner to this one. But do not push a 1610 start_memory for groups beyond the last one we can 1611 represent in the compiled pattern. */ 1612 if (regnum <= MAX_REGNUM) 1613 { 1614 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; 1615 BUF_PUSH_3 (start_memory, regnum, 0); 1616 } 1617 1618 compile_stack.avail++; 1619 1620 fixup_alt_jump = 0; 1621 laststart = 0; 1622 begalt = b; 1623 /* If we've reached MAX_REGNUM groups, then this open 1624 won't actually generate any code, so we'll have to 1625 clear pending_exact explicitly. */ 1626 pending_exact = 0; 1627 break; 1628 1629 1630 case ')': 1631 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 1632 1633 if (COMPILE_STACK_EMPTY) 1634 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1635 goto normal_backslash; 1636 else 1637 return REG_ERPAREN; 1638 1639 handle_close: 1640 if (fixup_alt_jump) 1641 { /* Push a dummy failure point at the end of the 1642 alternative for a possible future 1643 `pop_failure_jump' to pop. See comments at 1644 `push_dummy_failure' in `re_match_2'. */ 1645 BUF_PUSH (push_dummy_failure); 1646 1647 /* We allocated space for this jump when we assigned 1648 to `fixup_alt_jump', in the `handle_alt' case below. */ 1649 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 1650 } 1651 1652 /* See similar code for backslashed left paren above. */ 1653 if (COMPILE_STACK_EMPTY) 1654 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1655 goto normal_char; 1656 else 1657 return REG_ERPAREN; 1658 1659 /* Since we just checked for an empty stack above, this 1660 ``can't happen''. */ 1661 assert (compile_stack.avail != 0); 1662 { 1663 /* We don't just want to restore into `regnum', because 1664 later groups should continue to be numbered higher, 1665 as in `(ab)c(de)' -- the second group is #2. */ 1666 regnum_t this_group_regnum; 1667 1668 compile_stack.avail--; 1669 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 1670 fixup_alt_jump 1671 = COMPILE_STACK_TOP.fixup_alt_jump 1672 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 1673 : 0; 1674 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; 1675 this_group_regnum = COMPILE_STACK_TOP.regnum; 1676 /* If we've reached MAX_REGNUM groups, then this open 1677 won't actually generate any code, so we'll have to 1678 clear pending_exact explicitly. */ 1679 pending_exact = 0; 1680 1681 /* We're at the end of the group, so now we know how many 1682 groups were inside this one. */ 1683 if (this_group_regnum <= MAX_REGNUM) 1684 { 1685 unsigned char *inner_group_loc 1686 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; 1687 1688 *inner_group_loc = regnum - this_group_regnum; 1689 BUF_PUSH_3 (stop_memory, this_group_regnum, 1690 regnum - this_group_regnum); 1691 } 1692 } 1693 break; 1694 1695 1696 case '|': /* `\|'. */ 1697 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 1698 goto normal_backslash; 1699 handle_alt: 1700 if (syntax & RE_LIMITED_OPS) 1701 goto normal_char; 1702 1703 /* Insert before the previous alternative a jump which 1704 jumps to this alternative if the former fails. */ 1705 GET_BUFFER_SPACE (3); 1706 INSERT_JUMP (on_failure_jump, begalt, b + 6); 1707 pending_exact = 0; 1708 b += 3; 1709 1710 /* The alternative before this one has a jump after it 1711 which gets executed if it gets matched. Adjust that 1712 jump so it will jump to this alternative's analogous 1713 jump (put in below, which in turn will jump to the next 1714 (if any) alternative's such jump, etc.). The last such 1715 jump jumps to the correct final destination. A picture: 1716 _____ _____ 1717 | | | | 1718 | v | v 1719 a | b | c 1720 1721 If we are at `b', then fixup_alt_jump right now points to a 1722 three-byte space after `a'. We'll put in the jump, set 1723 fixup_alt_jump to right after `b', and leave behind three 1724 bytes which we'll fill in when we get to after `c'. */ 1725 1726 if (fixup_alt_jump) 1727 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 1728 1729 /* Mark and leave space for a jump after this alternative, 1730 to be filled in later either by next alternative or 1731 when know we're at the end of a series of alternatives. */ 1732 fixup_alt_jump = b; 1733 GET_BUFFER_SPACE (3); 1734 b += 3; 1735 1736 laststart = 0; 1737 begalt = b; 1738 break; 1739 1740 1741 case '{': 1742 /* If \{ is a literal. */ 1743 if (!(syntax & RE_INTERVALS) 1744 /* If we're at `\{' and it's not the open-interval 1745 operator. */ 1746 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1747 || (p - 2 == pattern && p == pend)) 1748 goto normal_backslash; 1749 1750 handle_interval: 1751 { 1752 /* If got here, then the syntax allows intervals. */ 1753 1754 /* At least (most) this many matches must be made. */ 1755 int lower_bound = -1, upper_bound = -1; 1756 1757 beg_interval = p - 1; 1758 1759 if (p == pend) 1760 { 1761 if (syntax & RE_NO_BK_BRACES) 1762 goto unfetch_interval; 1763 else 1764 return REG_EBRACE; 1765 } 1766 1767 GET_UNSIGNED_NUMBER (lower_bound); 1768 1769 if (c == ',') 1770 { 1771 GET_UNSIGNED_NUMBER (upper_bound); 1772 if (upper_bound < 0) upper_bound = RE_DUP_MAX; 1773 } 1774 else 1775 /* Interval such as `{1}' => match exactly once. */ 1776 upper_bound = lower_bound; 1777 1778 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 1779 || lower_bound > upper_bound) 1780 { 1781 if (syntax & RE_NO_BK_BRACES) 1782 goto unfetch_interval; 1783 else 1784 return REG_BADBR; 1785 } 1786 1787 if (!(syntax & RE_NO_BK_BRACES)) 1788 { 1789 if (c != '\\') return REG_EBRACE; 1790 1791 PATFETCH (c); 1792 } 1793 1794 if (c != '}') 1795 { 1796 if (syntax & RE_NO_BK_BRACES) 1797 goto unfetch_interval; 1798 else 1799 return REG_BADBR; 1800 } 1801 1802 /* We just parsed a valid interval. */ 1803 1804 /* If it's invalid to have no preceding re. */ 1805 if (!laststart) 1806 { 1807 if (syntax & RE_CONTEXT_INVALID_OPS) 1808 return REG_BADRPT; 1809 else if (syntax & RE_CONTEXT_INDEP_OPS) 1810 laststart = b; 1811 else 1812 goto unfetch_interval; 1813 } 1814 1815 /* If the upper bound is zero, don't want to succeed at 1816 all; jump from `laststart' to `b + 3', which will be 1817 the end of the buffer after we insert the jump. */ 1818 if (upper_bound == 0) 1819 { 1820 GET_BUFFER_SPACE (3); 1821 INSERT_JUMP (jump, laststart, b + 3); 1822 b += 3; 1823 } 1824 1825 /* Otherwise, we have a nontrivial interval. When 1826 we're all done, the pattern will look like: 1827 set_number_at <jump count> <upper bound> 1828 set_number_at <succeed_n count> <lower bound> 1829 succeed_n <after jump addr> <succeed_n count> 1830 <body of loop> 1831 jump_n <succeed_n addr> <jump count> 1832 (The upper bound and `jump_n' are omitted if 1833 `upper_bound' is 1, though.) */ 1834 else 1835 { /* If the upper bound is > 1, we need to insert 1836 more at the end of the loop. */ 1837 unsigned nbytes = 10 + (upper_bound > 1) * 10; 1838 1839 GET_BUFFER_SPACE (nbytes); 1840 1841 /* Initialize lower bound of the `succeed_n', even 1842 though it will be set during matching by its 1843 attendant `set_number_at' (inserted next), 1844 because `re_compile_fastmap' needs to know. 1845 Jump to the `jump_n' we might insert below. */ 1846 INSERT_JUMP2 (succeed_n, laststart, 1847 b + 5 + (upper_bound > 1) * 5, 1848 lower_bound); 1849 b += 5; 1850 1851 /* Code to initialize the lower bound. Insert 1852 before the `succeed_n'. The `5' is the last two 1853 bytes of this `set_number_at', plus 3 bytes of 1854 the following `succeed_n'. */ 1855 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 1856 b += 5; 1857 1858 if (upper_bound > 1) 1859 { /* More than one repetition is allowed, so 1860 append a backward jump to the `succeed_n' 1861 that starts this interval. 1862 1863 When we've reached this during matching, 1864 we'll have matched the interval once, so 1865 jump back only `upper_bound - 1' times. */ 1866 STORE_JUMP2 (jump_n, b, laststart + 5, 1867 upper_bound - 1); 1868 b += 5; 1869 1870 /* The location we want to set is the second 1871 parameter of the `jump_n'; that is `b-2' as 1872 an absolute address. `laststart' will be 1873 the `set_number_at' we're about to insert; 1874 `laststart+3' the number to set, the source 1875 for the relative address. But we are 1876 inserting into the middle of the pattern -- 1877 so everything is getting moved up by 5. 1878 Conclusion: (b - 2) - (laststart + 3) + 5, 1879 i.e., b - laststart. 1880 1881 We insert this at the beginning of the loop 1882 so that if we fail during matching, we'll 1883 reinitialize the bounds. */ 1884 insert_op2 (set_number_at, laststart, b - laststart, 1885 upper_bound - 1, b); 1886 b += 5; 1887 } 1888 } 1889 pending_exact = 0; 1890 beg_interval = NULL; 1891 } 1892 break; 1893 1894 unfetch_interval: 1895 /* If an invalid interval, match the characters as literals. */ 1896 assert (beg_interval); 1897 p = beg_interval; 1898 beg_interval = NULL; 1899 1900 /* normal_char and normal_backslash need `c'. */ 1901 PATFETCH (c); 1902 1903 if (!(syntax & RE_NO_BK_BRACES)) 1904 { 1905 if (p > pattern && p[-1] == '\\') 1906 goto normal_backslash; 1907 } 1908 goto normal_char; 1909 1910 #ifdef emacs 1911 /* There is no way to specify the before_dot and after_dot 1912 operators. rms says this is ok. --karl */ 1913 case '=': 1914 BUF_PUSH (at_dot); 1915 break; 1916 1917 case 's': 1918 laststart = b; 1919 PATFETCH (c); 1920 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 1921 break; 1922 1923 case 'S': 1924 laststart = b; 1925 PATFETCH (c); 1926 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 1927 break; 1928 #endif /* emacs */ 1929 1930 1931 case 'w': 1932 laststart = b; 1933 BUF_PUSH (wordchar); 1934 break; 1935 1936 1937 case 'W': 1938 laststart = b; 1939 BUF_PUSH (notwordchar); 1940 break; 1941 1942 1943 case '<': 1944 BUF_PUSH (wordbeg); 1945 break; 1946 1947 case '>': 1948 BUF_PUSH (wordend); 1949 break; 1950 1951 case 'b': 1952 BUF_PUSH (wordbound); 1953 break; 1954 1955 case 'B': 1956 BUF_PUSH (notwordbound); 1957 break; 1958 1959 case '`': 1960 BUF_PUSH (begbuf); 1961 break; 1962 1963 case '\'': 1964 BUF_PUSH (endbuf); 1965 break; 1966 1967 case '1': case '2': case '3': case '4': case '5': 1968 case '6': case '7': case '8': case '9': 1969 if (syntax & RE_NO_BK_REFS) 1970 goto normal_char; 1971 1972 c1 = c - '0'; 1973 1974 if (c1 > regnum) 1975 return REG_ESUBREG; 1976 1977 /* Can't back reference to a subexpression if inside of it. */ 1978 if (group_in_compile_stack (compile_stack, c1)) 1979 goto normal_char; 1980 1981 laststart = b; 1982 BUF_PUSH_2 (duplicate, c1); 1983 break; 1984 1985 1986 case '+': 1987 case '?': 1988 if (syntax & RE_BK_PLUS_QM) 1989 goto handle_plus; 1990 else 1991 goto normal_backslash; 1992 1993 default: 1994 normal_backslash: 1995 /* You might think it would be useful for \ to mean 1996 not to translate; but if we don't translate it 1997 it will never match anything. */ 1998 c = TRANSLATE (c); 1999 goto normal_char; 2000 } 2001 break; 2002 2003 2004 default: 2005 /* Expects the character in `c'. */ 2006 normal_char: 2007 /* If no exactn currently being built. */ 2008 if (!pending_exact 2009 2010 /* If last exactn not at current position. */ 2011 || pending_exact + *pending_exact + 1 != b 2012 2013 /* We have only one byte following the exactn for the count. */ 2014 || *pending_exact == (1 << BYTEWIDTH) - 1 2015 2016 /* If followed by a repetition operator. */ 2017 || *p == '*' || *p == '^' 2018 || ((syntax & RE_BK_PLUS_QM) 2019 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 2020 : (*p == '+' || *p == '?')) 2021 || ((syntax & RE_INTERVALS) 2022 && ((syntax & RE_NO_BK_BRACES) 2023 ? *p == '{' 2024 : (p[0] == '\\' && p[1] == '{')))) 2025 { 2026 /* Start building a new exactn. */ 2027 2028 laststart = b; 2029 2030 BUF_PUSH_2 (exactn, 0); 2031 pending_exact = b - 1; 2032 } 2033 2034 BUF_PUSH (c); 2035 (*pending_exact)++; 2036 break; 2037 } /* switch (c) */ 2038 } /* while p != pend */ 2039 2040 2041 /* Through the pattern now. */ 2042 2043 if (fixup_alt_jump) 2044 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2045 2046 if (!COMPILE_STACK_EMPTY) 2047 return REG_EPAREN; 2048 2049 free (compile_stack.stack); 2050 2051 /* We have succeeded; set the length of the buffer. */ 2052 bufp->used = b - bufp->buffer; 2053 2054 #ifdef DEBUG 2055 if (debug) 2056 { 2057 DEBUG_PRINT1 ("\nCompiled pattern: "); 2058 print_compiled_pattern (bufp); 2059 } 2060 #endif /* DEBUG */ 2061 2062 return REG_NOERROR; 2063 } /* regex_compile */ 2064 2065 /* Subroutines for `regex_compile'. */ 2066 2067 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 2068 2069 static void 2070 store_op1 (op, loc, arg) 2071 re_opcode_t op; 2072 unsigned char *loc; 2073 int arg; 2074 { 2075 *loc = (unsigned char) op; 2076 STORE_NUMBER (loc + 1, arg); 2077 } 2078 2079 2080 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 2081 2082 static void 2083 store_op2 (op, loc, arg1, arg2) 2084 re_opcode_t op; 2085 unsigned char *loc; 2086 int arg1, arg2; 2087 { 2088 *loc = (unsigned char) op; 2089 STORE_NUMBER (loc + 1, arg1); 2090 STORE_NUMBER (loc + 3, arg2); 2091 } 2092 2093 2094 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 2095 for OP followed by two-byte integer parameter ARG. */ 2096 2097 static void 2098 insert_op1 (op, loc, arg, end) 2099 re_opcode_t op; 2100 unsigned char *loc; 2101 int arg; 2102 unsigned char *end; 2103 { 2104 register unsigned char *pfrom = end; 2105 register unsigned char *pto = end + 3; 2106 2107 while (pfrom != loc) 2108 *--pto = *--pfrom; 2109 2110 store_op1 (op, loc, arg); 2111 } 2112 2113 2114 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 2115 2116 static void 2117 insert_op2 (op, loc, arg1, arg2, end) 2118 re_opcode_t op; 2119 unsigned char *loc; 2120 int arg1, arg2; 2121 unsigned char *end; 2122 { 2123 register unsigned char *pfrom = end; 2124 register unsigned char *pto = end + 5; 2125 2126 while (pfrom != loc) 2127 *--pto = *--pfrom; 2128 2129 store_op2 (op, loc, arg1, arg2); 2130 } 2131 2132 2133 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 2134 after an alternative or a begin-subexpression. We assume there is at 2135 least one character before the ^. */ 2136 2137 static boolean 2138 at_begline_loc_p (pattern, p, syntax) 2139 const char *pattern, *p; 2140 reg_syntax_t syntax; 2141 { 2142 const char *prev = p - 2; 2143 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 2144 2145 return 2146 /* After a subexpression? */ 2147 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 2148 /* After an alternative? */ 2149 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 2150 } 2151 2152 2153 /* The dual of at_begline_loc_p. This one is for $. We assume there is 2154 at least one character after the $, i.e., `P < PEND'. */ 2155 2156 static boolean 2157 at_endline_loc_p (p, pend, syntax) 2158 const char *p, *pend; 2159 int syntax; 2160 { 2161 const char *next = p; 2162 boolean next_backslash = *next == '\\'; 2163 const char *next_next = p + 1 < pend ? p + 1 : NULL; 2164 2165 return 2166 /* Before a subexpression? */ 2167 (syntax & RE_NO_BK_PARENS ? *next == ')' 2168 : next_backslash && next_next && *next_next == ')') 2169 /* Before an alternative? */ 2170 || (syntax & RE_NO_BK_VBAR ? *next == '|' 2171 : next_backslash && next_next && *next_next == '|'); 2172 } 2173 2174 2175 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 2176 false if it's not. */ 2177 2178 static boolean 2179 group_in_compile_stack (compile_stack, regnum) 2180 compile_stack_type compile_stack; 2181 regnum_t regnum; 2182 { 2183 int this_element; 2184 2185 for (this_element = compile_stack.avail - 1; 2186 this_element >= 0; 2187 this_element--) 2188 if (compile_stack.stack[this_element].regnum == regnum) 2189 return true; 2190 2191 return false; 2192 } 2193 2194 2195 /* Read the ending character of a range (in a bracket expression) from the 2196 uncompiled pattern *P_PTR (which ends at PEND). We assume the 2197 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 2198 Then we set the translation of all bits between the starting and 2199 ending characters (inclusive) in the compiled pattern B. 2200 2201 Return an error code. 2202 2203 We use these short variable names so we can use the same macros as 2204 `regex_compile' itself. */ 2205 2206 static reg_errcode_t 2207 compile_range (p_ptr, pend, translate, syntax, b) 2208 const char **p_ptr, *pend; 2209 char *translate; 2210 reg_syntax_t syntax; 2211 unsigned char *b; 2212 { 2213 unsigned this_char; 2214 2215 const char *p = *p_ptr; 2216 int range_start, range_end; 2217 2218 if (p == pend) 2219 return REG_ERANGE; 2220 2221 /* Even though the pattern is a signed `char *', we need to fetch 2222 with unsigned char *'s; if the high bit of the pattern character 2223 is set, the range endpoints will be negative if we fetch using a 2224 signed char *. 2225 2226 We also want to fetch the endpoints without translating them; the 2227 appropriate translation is done in the bit-setting loop below. */ 2228 range_start = ((unsigned char *) p)[-2]; 2229 range_end = ((unsigned char *) p)[0]; 2230 2231 /* Have to increment the pointer into the pattern string, so the 2232 caller isn't still at the ending character. */ 2233 (*p_ptr)++; 2234 2235 /* If the start is after the end, the range is empty. */ 2236 if (range_start > range_end) 2237 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 2238 2239 /* Here we see why `this_char' has to be larger than an `unsigned 2240 char' -- the range is inclusive, so if `range_end' == 0xff 2241 (assuming 8-bit characters), we would otherwise go into an infinite 2242 loop, since all characters <= 0xff. */ 2243 for (this_char = range_start; this_char <= range_end; this_char++) 2244 { 2245 SET_LIST_BIT (TRANSLATE (this_char)); 2246 } 2247 2248 return REG_NOERROR; 2249 } 2250 2251 /* Failure stack declarations and macros; both re_compile_fastmap and 2252 re_match_2 use a failure stack. These have to be macros because of 2253 REGEX_ALLOCATE. */ 2254 2255 2256 /* Number of failure points for which to initially allocate space 2257 when matching. If this number is exceeded, we allocate more 2258 space, so it is not a hard limit. */ 2259 #ifndef INIT_FAILURE_ALLOC 2260 #define INIT_FAILURE_ALLOC 5 2261 #endif 2262 2263 /* Roughly the maximum number of failure points on the stack. Would be 2264 exactly that if always used MAX_FAILURE_SPACE each time we failed. 2265 This is a variable only so users of regex can assign to it; we never 2266 change it ourselves. */ 2267 int re_max_failures = 2000; 2268 2269 union fail_stack_elt 2270 { 2271 unsigned char *pointer; 2272 int integer; 2273 }; 2274 2275 typedef union fail_stack_elt fail_stack_elt_t; 2276 2277 typedef struct 2278 { 2279 fail_stack_elt_t *stack; 2280 unsigned size; 2281 unsigned avail; /* Offset of next open position. */ 2282 } fail_stack_type; 2283 2284 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 2285 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 2286 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 2287 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) 2288 2289 2290 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */ 2291 2292 #define INIT_FAIL_STACK() \ 2293 do { \ 2294 fail_stack.stack = (fail_stack_elt_t *) \ 2295 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 2296 \ 2297 if (fail_stack.stack == NULL) \ 2298 return -2; \ 2299 \ 2300 fail_stack.size = INIT_FAILURE_ALLOC; \ 2301 fail_stack.avail = 0; \ 2302 } while (0) 2303 2304 2305 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 2306 2307 Return 1 if succeeds, and 0 if either ran out of memory 2308 allocating space for it or it was already too large. 2309 2310 REGEX_REALLOCATE requires `destination' be declared. */ 2311 2312 #define DOUBLE_FAIL_STACK(fail_stack) \ 2313 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ 2314 ? 0 \ 2315 : ((fail_stack).stack = (fail_stack_elt_t *) \ 2316 REGEX_REALLOCATE ((fail_stack).stack, \ 2317 (fail_stack).size * sizeof (fail_stack_elt_t), \ 2318 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 2319 \ 2320 (fail_stack).stack == NULL \ 2321 ? 0 \ 2322 : ((fail_stack).size <<= 1, \ 2323 1))) 2324 2325 2326 /* Push PATTERN_OP on FAIL_STACK. 2327 2328 Return 1 if was able to do so and 0 if ran out of memory allocating 2329 space to do so. */ 2330 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ 2331 ((FAIL_STACK_FULL () \ 2332 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ 2333 ? 0 \ 2334 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 2335 1)) 2336 2337 /* Push a pointer value onto the failure stack. 2338 Assumes the variable `fail_stack'. Probably should only 2339 be called from within `PUSH_FAILURE_POINT'. */ 2340 #define PUSH_FAILURE_POINTER(item) \ 2341 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) 2342 2343 /* This pushes an integer-valued item onto the failure stack. 2344 Assumes the variable `fail_stack'. Probably should only 2345 be called from within `PUSH_FAILURE_POINT'. */ 2346 #define PUSH_FAILURE_INT(item) \ 2347 fail_stack.stack[fail_stack.avail++].integer = (item) 2348 2349 /* Push a fail_stack_elt_t value onto the failure stack. 2350 Assumes the variable `fail_stack'. Probably should only 2351 be called from within `PUSH_FAILURE_POINT'. */ 2352 #define PUSH_FAILURE_ELT(item) \ 2353 fail_stack.stack[fail_stack.avail++] = (item) 2354 2355 /* These three POP... operations complement the three PUSH... operations. 2356 All assume that `fail_stack' is nonempty. */ 2357 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer 2358 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer 2359 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] 2360 2361 /* Used to omit pushing failure point id's when we're not debugging. */ 2362 #ifdef DEBUG 2363 #define DEBUG_PUSH PUSH_FAILURE_INT 2364 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () 2365 #else 2366 #define DEBUG_PUSH(item) 2367 #define DEBUG_POP(item_addr) 2368 #endif 2369 2370 2371 /* Push the information about the state we will need 2372 if we ever fail back to it. 2373 2374 Requires variables fail_stack, regstart, regend, reg_info, and 2375 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be 2376 declared. 2377 2378 Does `return FAILURE_CODE' if runs out of memory. */ 2379 2380 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 2381 do { \ 2382 char *destination; \ 2383 /* Must be int, so when we don't save any registers, the arithmetic \ 2384 of 0 + -1 isn't done as unsigned. */ \ 2385 int this_reg; \ 2386 \ 2387 DEBUG_STATEMENT (failure_id++); \ 2388 DEBUG_STATEMENT (nfailure_points_pushed++); \ 2389 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 2390 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 2391 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 2392 \ 2393 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ 2394 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 2395 \ 2396 /* Ensure we have enough space allocated for what we will push. */ \ 2397 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 2398 { \ 2399 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 2400 return failure_code; \ 2401 \ 2402 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 2403 (fail_stack).size); \ 2404 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 2405 } \ 2406 \ 2407 /* Push the info, starting with the registers. */ \ 2408 DEBUG_PRINT1 ("\n"); \ 2409 \ 2410 if (1) \ 2411 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 2412 this_reg++) \ 2413 { \ 2414 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ 2415 DEBUG_STATEMENT (num_regs_pushed++); \ 2416 \ 2417 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2418 PUSH_FAILURE_POINTER (regstart[this_reg]); \ 2419 \ 2420 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2421 PUSH_FAILURE_POINTER (regend[this_reg]); \ 2422 \ 2423 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ 2424 DEBUG_PRINT2 (" match_null=%d", \ 2425 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 2426 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 2427 DEBUG_PRINT2 (" matched_something=%d", \ 2428 MATCHED_SOMETHING (reg_info[this_reg])); \ 2429 DEBUG_PRINT2 (" ever_matched=%d", \ 2430 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 2431 DEBUG_PRINT1 ("\n"); \ 2432 PUSH_FAILURE_ELT (reg_info[this_reg].word); \ 2433 } \ 2434 \ 2435 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ 2436 PUSH_FAILURE_INT (lowest_active_reg); \ 2437 \ 2438 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ 2439 PUSH_FAILURE_INT (highest_active_reg); \ 2440 \ 2441 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ 2442 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 2443 PUSH_FAILURE_POINTER (pattern_place); \ 2444 \ 2445 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ 2446 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 2447 size2); \ 2448 DEBUG_PRINT1 ("'\n"); \ 2449 PUSH_FAILURE_POINTER (string_place); \ 2450 \ 2451 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 2452 DEBUG_PUSH (failure_id); \ 2453 } while (0) 2454 2455 /* This is the number of items that are pushed and popped on the stack 2456 for each register. */ 2457 #define NUM_REG_ITEMS 3 2458 2459 /* Individual items aside from the registers. */ 2460 #ifdef DEBUG 2461 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 2462 #else 2463 #define NUM_NONREG_ITEMS 4 2464 #endif 2465 2466 /* We push at most this many items on the stack. */ 2467 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 2468 2469 /* We actually push this many items. */ 2470 #define NUM_FAILURE_ITEMS \ 2471 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ 2472 + NUM_NONREG_ITEMS) 2473 2474 /* How many items can still be added to the stack without overflowing it. */ 2475 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 2476 2477 2478 /* Pops what PUSH_FAIL_STACK pushes. 2479 2480 We restore into the parameters, all of which should be lvalues: 2481 STR -- the saved data position. 2482 PAT -- the saved pattern position. 2483 LOW_REG, HIGH_REG -- the highest and lowest active registers. 2484 REGSTART, REGEND -- arrays of string positions. 2485 REG_INFO -- array of information about each subexpression. 2486 2487 Also assumes the variables `fail_stack' and (if debugging), `bufp', 2488 `pend', `string1', `size1', `string2', and `size2'. */ 2489 2490 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 2491 { \ 2492 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ 2493 int this_reg; \ 2494 const unsigned char *string_temp; \ 2495 \ 2496 assert (!FAIL_STACK_EMPTY ()); \ 2497 \ 2498 /* Remove failure points and point to how many regs pushed. */ \ 2499 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 2500 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 2501 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 2502 \ 2503 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 2504 \ 2505 DEBUG_POP (&failure_id); \ 2506 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 2507 \ 2508 /* If the saved string location is NULL, it came from an \ 2509 on_failure_keep_string_jump opcode, and we want to throw away the \ 2510 saved NULL, thus retaining our current position in the string. */ \ 2511 string_temp = POP_FAILURE_POINTER (); \ 2512 if (string_temp != NULL) \ 2513 str = (const char *) string_temp; \ 2514 \ 2515 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ 2516 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 2517 DEBUG_PRINT1 ("'\n"); \ 2518 \ 2519 pat = (unsigned char *) POP_FAILURE_POINTER (); \ 2520 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ 2521 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 2522 \ 2523 /* Restore register info. */ \ 2524 high_reg = (unsigned) POP_FAILURE_INT (); \ 2525 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ 2526 \ 2527 low_reg = (unsigned) POP_FAILURE_INT (); \ 2528 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ 2529 \ 2530 if (1) \ 2531 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 2532 { \ 2533 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ 2534 \ 2535 reg_info[this_reg].word = POP_FAILURE_ELT (); \ 2536 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ 2537 \ 2538 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 2539 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2540 \ 2541 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 2542 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2543 } \ 2544 else \ 2545 { \ 2546 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ 2547 { \ 2548 reg_info[this_reg].word.integer = 0; \ 2549 regend[this_reg] = 0; \ 2550 regstart[this_reg] = 0; \ 2551 } \ 2552 highest_active_reg = high_reg; \ 2553 } \ 2554 \ 2555 DEBUG_STATEMENT (nfailure_points_popped++); \ 2556 } /* POP_FAILURE_POINT */ 2557 2558 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 2559 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 2560 characters can start a string that matches the pattern. This fastmap 2561 is used by re_search to skip quickly over impossible starting points. 2562 2563 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 2564 area as BUFP->fastmap. 2565 2566 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 2567 the pattern buffer. 2568 2569 Returns 0 if we succeed, -2 if an internal error. */ 2570 2571 int 2572 re_compile_fastmap (bufp) 2573 struct re_pattern_buffer *bufp; 2574 { 2575 int j, k; 2576 fail_stack_type fail_stack; 2577 #ifndef REGEX_MALLOC 2578 char *destination; 2579 #endif 2580 /* We don't push any register information onto the failure stack. */ 2581 unsigned num_regs = 0; 2582 2583 register char *fastmap = bufp->fastmap; 2584 unsigned char *pattern = bufp->buffer; 2585 unsigned long size = bufp->used; 2586 unsigned char *p = pattern; 2587 register unsigned char *pend = pattern + size; 2588 2589 /* Assume that each path through the pattern can be null until 2590 proven otherwise. We set this false at the bottom of switch 2591 statement, to which we get only if a particular path doesn't 2592 match the empty string. */ 2593 boolean path_can_be_null = true; 2594 2595 /* We aren't doing a `succeed_n' to begin with. */ 2596 boolean succeed_n_p = false; 2597 2598 assert (fastmap != NULL && p != NULL); 2599 2600 INIT_FAIL_STACK (); 2601 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 2602 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 2603 bufp->can_be_null = 0; 2604 2605 while (p != pend || !FAIL_STACK_EMPTY ()) 2606 { 2607 if (p == pend) 2608 { 2609 bufp->can_be_null |= path_can_be_null; 2610 2611 /* Reset for next path. */ 2612 path_can_be_null = true; 2613 2614 p = fail_stack.stack[--fail_stack.avail].pointer; 2615 } 2616 2617 /* We should never be about to go beyond the end of the pattern. */ 2618 assert (p < pend); 2619 2620 #ifdef SWITCH_ENUM_BUG 2621 switch ((int) ((re_opcode_t) *p++)) 2622 #else 2623 switch ((re_opcode_t) *p++) 2624 #endif 2625 { 2626 2627 /* I guess the idea here is to simply not bother with a fastmap 2628 if a backreference is used, since it's too hard to figure out 2629 the fastmap for the corresponding group. Setting 2630 `can_be_null' stops `re_search_2' from using the fastmap, so 2631 that is all we do. */ 2632 case duplicate: 2633 bufp->can_be_null = 1; 2634 return 0; 2635 2636 2637 /* Following are the cases which match a character. These end 2638 with `break'. */ 2639 2640 case exactn: 2641 fastmap[p[1]] = 1; 2642 break; 2643 2644 2645 case charset: 2646 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2647 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 2648 fastmap[j] = 1; 2649 break; 2650 2651 2652 case charset_not: 2653 /* Chars beyond end of map must be allowed. */ 2654 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 2655 fastmap[j] = 1; 2656 2657 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2658 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 2659 fastmap[j] = 1; 2660 break; 2661 2662 2663 case wordchar: 2664 for (j = 0; j < (1 << BYTEWIDTH); j++) 2665 if (SYNTAX (j) == Sword) 2666 fastmap[j] = 1; 2667 break; 2668 2669 2670 case notwordchar: 2671 for (j = 0; j < (1 << BYTEWIDTH); j++) 2672 if (SYNTAX (j) != Sword) 2673 fastmap[j] = 1; 2674 break; 2675 2676 2677 case anychar: 2678 /* `.' matches anything ... */ 2679 for (j = 0; j < (1 << BYTEWIDTH); j++) 2680 fastmap[j] = 1; 2681 2682 /* ... except perhaps newline. */ 2683 if (!(bufp->syntax & RE_DOT_NEWLINE)) 2684 fastmap['\n'] = 0; 2685 2686 /* Return if we have already set `can_be_null'; if we have, 2687 then the fastmap is irrelevant. Something's wrong here. */ 2688 else if (bufp->can_be_null) 2689 return 0; 2690 2691 /* Otherwise, have to check alternative paths. */ 2692 break; 2693 2694 2695 #ifdef emacs 2696 case syntaxspec: 2697 k = *p++; 2698 for (j = 0; j < (1 << BYTEWIDTH); j++) 2699 if (SYNTAX (j) == (enum syntaxcode) k) 2700 fastmap[j] = 1; 2701 break; 2702 2703 2704 case notsyntaxspec: 2705 k = *p++; 2706 for (j = 0; j < (1 << BYTEWIDTH); j++) 2707 if (SYNTAX (j) != (enum syntaxcode) k) 2708 fastmap[j] = 1; 2709 break; 2710 2711 2712 /* All cases after this match the empty string. These end with 2713 `continue'. */ 2714 2715 2716 case before_dot: 2717 case at_dot: 2718 case after_dot: 2719 continue; 2720 #endif /* not emacs */ 2721 2722 2723 case no_op: 2724 case begline: 2725 case endline: 2726 case begbuf: 2727 case endbuf: 2728 case wordbound: 2729 case notwordbound: 2730 case wordbeg: 2731 case wordend: 2732 case push_dummy_failure: 2733 continue; 2734 2735 2736 case jump_n: 2737 case pop_failure_jump: 2738 case maybe_pop_jump: 2739 case jump: 2740 case jump_past_alt: 2741 case dummy_failure_jump: 2742 EXTRACT_NUMBER_AND_INCR (j, p); 2743 p += j; 2744 if (j > 0) 2745 continue; 2746 2747 /* Jump backward implies we just went through the body of a 2748 loop and matched nothing. Opcode jumped to should be 2749 `on_failure_jump' or `succeed_n'. Just treat it like an 2750 ordinary jump. For a * loop, it has pushed its failure 2751 point already; if so, discard that as redundant. */ 2752 if ((re_opcode_t) *p != on_failure_jump 2753 && (re_opcode_t) *p != succeed_n) 2754 continue; 2755 2756 p++; 2757 EXTRACT_NUMBER_AND_INCR (j, p); 2758 p += j; 2759 2760 /* If what's on the stack is where we are now, pop it. */ 2761 if (!FAIL_STACK_EMPTY () 2762 && fail_stack.stack[fail_stack.avail - 1].pointer == p) 2763 fail_stack.avail--; 2764 2765 continue; 2766 2767 2768 case on_failure_jump: 2769 case on_failure_keep_string_jump: 2770 handle_on_failure_jump: 2771 EXTRACT_NUMBER_AND_INCR (j, p); 2772 2773 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 2774 end of the pattern. We don't want to push such a point, 2775 since when we restore it above, entering the switch will 2776 increment `p' past the end of the pattern. We don't need 2777 to push such a point since we obviously won't find any more 2778 fastmap entries beyond `pend'. Such a pattern can match 2779 the null string, though. */ 2780 if (p + j < pend) 2781 { 2782 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 2783 return -2; 2784 } 2785 else 2786 bufp->can_be_null = 1; 2787 2788 if (succeed_n_p) 2789 { 2790 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 2791 succeed_n_p = false; 2792 } 2793 2794 continue; 2795 2796 2797 case succeed_n: 2798 /* Get to the number of times to succeed. */ 2799 p += 2; 2800 2801 /* Increment p past the n for when k != 0. */ 2802 EXTRACT_NUMBER_AND_INCR (k, p); 2803 if (k == 0) 2804 { 2805 p -= 4; 2806 succeed_n_p = true; /* Spaghetti code alert. */ 2807 goto handle_on_failure_jump; 2808 } 2809 continue; 2810 2811 2812 case set_number_at: 2813 p += 4; 2814 continue; 2815 2816 2817 case start_memory: 2818 case stop_memory: 2819 p += 2; 2820 continue; 2821 2822 2823 default: 2824 abort (); /* We have listed all the cases. */ 2825 } /* switch *p++ */ 2826 2827 /* Getting here means we have found the possible starting 2828 characters for one path of the pattern -- and that the empty 2829 string does not match. We need not follow this path further. 2830 Instead, look at the next alternative (remembered on the 2831 stack), or quit if no more. The test at the top of the loop 2832 does these things. */ 2833 path_can_be_null = false; 2834 p = pend; 2835 } /* while p */ 2836 2837 /* Set `can_be_null' for the last path (also the first path, if the 2838 pattern is empty). */ 2839 bufp->can_be_null |= path_can_be_null; 2840 return 0; 2841 } /* re_compile_fastmap */ 2842 2843 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 2844 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 2845 this memory for recording register information. STARTS and ENDS 2846 must be allocated using the malloc library routine, and must each 2847 be at least NUM_REGS * sizeof (regoff_t) bytes long. 2848 2849 If NUM_REGS == 0, then subsequent matches should allocate their own 2850 register data. 2851 2852 Unless this function is called, the first search or match using 2853 PATTERN_BUFFER will allocate its own register data, without 2854 freeing the old data. */ 2855 2856 void 2857 re_set_registers (bufp, regs, num_regs, starts, ends) 2858 struct re_pattern_buffer *bufp; 2859 struct re_registers *regs; 2860 unsigned num_regs; 2861 regoff_t *starts, *ends; 2862 { 2863 if (num_regs) 2864 { 2865 bufp->regs_allocated = REGS_REALLOCATE; 2866 regs->num_regs = num_regs; 2867 regs->start = starts; 2868 regs->end = ends; 2869 } 2870 else 2871 { 2872 bufp->regs_allocated = REGS_UNALLOCATED; 2873 regs->num_regs = 0; 2874 regs->start = regs->end = (regoff_t *) 0; 2875 } 2876 } 2877 2878 /* Searching routines. */ 2879 2880 /* Like re_search_2, below, but only one string is specified, and 2881 doesn't let you say where to stop matching. */ 2882 2883 int 2884 re_search (bufp, string, size, startpos, range, regs) 2885 struct re_pattern_buffer *bufp; 2886 const char *string; 2887 int size, startpos, range; 2888 struct re_registers *regs; 2889 { 2890 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 2891 regs, size); 2892 } 2893 2894 2895 /* Using the compiled pattern in BUFP->buffer, first tries to match the 2896 virtual concatenation of STRING1 and STRING2, starting first at index 2897 STARTPOS, then at STARTPOS + 1, and so on. 2898 2899 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 2900 2901 RANGE is how far to scan while trying to match. RANGE = 0 means try 2902 only at STARTPOS; in general, the last start tried is STARTPOS + 2903 RANGE. 2904 2905 In REGS, return the indices of the virtual concatenation of STRING1 2906 and STRING2 that matched the entire BUFP->buffer and its contained 2907 subexpressions. 2908 2909 Do not consider matching one past the index STOP in the virtual 2910 concatenation of STRING1 and STRING2. 2911 2912 We return either the position in the strings at which the match was 2913 found, -1 if no match, or -2 if error (such as failure 2914 stack overflow). */ 2915 2916 int 2917 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) 2918 struct re_pattern_buffer *bufp; 2919 const char *string1, *string2; 2920 int size1, size2; 2921 int startpos; 2922 int range; 2923 struct re_registers *regs; 2924 int stop; 2925 { 2926 int val; 2927 register char *fastmap = bufp->fastmap; 2928 register char *translate = bufp->translate; 2929 int total_size = size1 + size2; 2930 int endpos = startpos + range; 2931 2932 /* Check for out-of-range STARTPOS. */ 2933 if (startpos < 0 || startpos > total_size) 2934 return -1; 2935 2936 /* Fix up RANGE if it might eventually take us outside 2937 the virtual concatenation of STRING1 and STRING2. */ 2938 if (endpos < -1) 2939 range = -1 - startpos; 2940 else if (endpos > total_size) 2941 range = total_size - startpos; 2942 2943 /* If the search isn't to be a backwards one, don't waste time in a 2944 search for a pattern that must be anchored. */ 2945 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) 2946 { 2947 if (startpos > 0) 2948 return -1; 2949 else 2950 range = 1; 2951 } 2952 2953 #ifdef emacs 2954 /* In a forward search for something that starts with \=. 2955 don't keep searching past point. */ 2956 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) 2957 { 2958 range = PT - startpos; 2959 if (range <= 0) 2960 return -1; 2961 } 2962 #endif /* emacs */ 2963 2964 /* Update the fastmap now if not correct already. */ 2965 if (fastmap && !bufp->fastmap_accurate) 2966 if (re_compile_fastmap (bufp) == -2) 2967 return -2; 2968 2969 /* Loop through the string, looking for a place to start matching. */ 2970 for (;;) 2971 { 2972 /* If a fastmap is supplied, skip quickly over characters that 2973 cannot be the start of a match. If the pattern can match the 2974 null string, however, we don't need to skip characters; we want 2975 the first null string. */ 2976 if (fastmap && startpos < total_size && !bufp->can_be_null) 2977 { 2978 if (range > 0) /* Searching forwards. */ 2979 { 2980 register const char *d; 2981 register int lim = 0; 2982 int irange = range; 2983 2984 if (startpos < size1 && startpos + range >= size1) 2985 lim = range - (size1 - startpos); 2986 2987 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 2988 2989 /* Written out as an if-else to avoid testing `translate' 2990 inside the loop. */ 2991 if (translate) 2992 while (range > lim 2993 && !fastmap[(unsigned char) 2994 translate[(unsigned char) *d++]]) 2995 range--; 2996 else 2997 while (range > lim && !fastmap[(unsigned char) *d++]) 2998 range--; 2999 3000 startpos += irange - range; 3001 } 3002 else /* Searching backwards. */ 3003 { 3004 register char c = (size1 == 0 || startpos >= size1 3005 ? string2[startpos - size1] 3006 : string1[startpos]); 3007 3008 if (!fastmap[(unsigned char) TRANSLATE (c)]) 3009 goto advance; 3010 } 3011 } 3012 3013 /* If can't match the null string, and that's all we have left, fail. */ 3014 if (range >= 0 && startpos == total_size && fastmap 3015 && !bufp->can_be_null) 3016 return -1; 3017 3018 val = re_match_2 (bufp, string1, size1, string2, size2, 3019 startpos, regs, stop); 3020 if (val >= 0) 3021 return startpos; 3022 3023 if (val == -2) 3024 return -2; 3025 3026 advance: 3027 if (!range) 3028 break; 3029 else if (range > 0) 3030 { 3031 range--; 3032 startpos++; 3033 } 3034 else 3035 { 3036 range++; 3037 startpos--; 3038 } 3039 } 3040 return -1; 3041 } /* re_search_2 */ 3042 3043 /* Declarations and macros for re_match_2. */ 3044 3045 static int bcmp_translate (); 3046 static boolean alt_match_null_string_p (), 3047 common_op_match_null_string_p (), 3048 group_match_null_string_p (); 3049 3050 /* Structure for per-register (a.k.a. per-group) information. 3051 Other register information, such as the 3052 starting and ending positions (which are addresses), and the list of 3053 inner groups (which is a bits list) are maintained in separate 3054 variables. 3055 3056 We are making a (strictly speaking) nonportable assumption here: that 3057 the compiler will pack our bit fields into something that fits into 3058 the type of `word', i.e., is something that fits into one item on the 3059 failure stack. */ 3060 3061 typedef union 3062 { 3063 fail_stack_elt_t word; 3064 struct 3065 { 3066 /* This field is one if this group can match the empty string, 3067 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 3068 #define MATCH_NULL_UNSET_VALUE 3 3069 unsigned match_null_string_p : 2; 3070 unsigned is_active : 1; 3071 unsigned matched_something : 1; 3072 unsigned ever_matched_something : 1; 3073 } bits; 3074 } register_info_type; 3075 3076 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 3077 #define IS_ACTIVE(R) ((R).bits.is_active) 3078 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) 3079 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 3080 3081 3082 /* Call this when have matched a real character; it sets `matched' flags 3083 for the subexpressions which we are currently inside. Also records 3084 that those subexprs have matched. */ 3085 #define SET_REGS_MATCHED() \ 3086 do \ 3087 { \ 3088 unsigned r; \ 3089 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 3090 { \ 3091 MATCHED_SOMETHING (reg_info[r]) \ 3092 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 3093 = 1; \ 3094 } \ 3095 } \ 3096 while (0) 3097 3098 3099 /* This converts PTR, a pointer into one of the search strings `string1' 3100 and `string2' into an offset from the beginning of that string. */ 3101 #define POINTER_TO_OFFSET(ptr) \ 3102 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) 3103 3104 /* Registers are set to a sentinel when they haven't yet matched. */ 3105 #define REG_UNSET_VALUE ((char *) -1) 3106 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 3107 3108 3109 /* Macros for dealing with the split strings in re_match_2. */ 3110 3111 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 3112 3113 /* Call before fetching a character with *d. This switches over to 3114 string2 if necessary. */ 3115 #define PREFETCH() \ 3116 while (d == dend) \ 3117 { \ 3118 /* End of string2 => fail. */ \ 3119 if (dend == end_match_2) \ 3120 goto fail; \ 3121 /* End of string1 => advance to string2. */ \ 3122 d = string2; \ 3123 dend = end_match_2; \ 3124 } 3125 3126 3127 /* Test if at very beginning or at very end of the virtual concatenation 3128 of `string1' and `string2'. If only one string, it's `string2'. */ 3129 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 3130 #define AT_STRINGS_END(d) ((d) == end2) 3131 3132 3133 /* Test if D points to a character which is word-constituent. We have 3134 two special cases to check for: if past the end of string1, look at 3135 the first character in string2; and if before the beginning of 3136 string2, look at the last character in string1. */ 3137 #define WORDCHAR_P(d) \ 3138 (SYNTAX ((d) == end1 ? *string2 \ 3139 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 3140 == Sword) 3141 3142 /* Test if the character before D and the one at D differ with respect 3143 to being word-constituent. */ 3144 #define AT_WORD_BOUNDARY(d) \ 3145 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 3146 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 3147 3148 3149 /* Free everything we malloc. */ 3150 #ifdef REGEX_MALLOC 3151 #define FREE_VAR(var) if (var) free (var); var = NULL 3152 #define FREE_VARIABLES() \ 3153 do { \ 3154 FREE_VAR (fail_stack.stack); \ 3155 FREE_VAR (regstart); \ 3156 FREE_VAR (regend); \ 3157 FREE_VAR (old_regstart); \ 3158 FREE_VAR (old_regend); \ 3159 FREE_VAR (best_regstart); \ 3160 FREE_VAR (best_regend); \ 3161 FREE_VAR (reg_info); \ 3162 FREE_VAR (reg_dummy); \ 3163 FREE_VAR (reg_info_dummy); \ 3164 } while (0) 3165 #else /* not REGEX_MALLOC */ 3166 /* Some MIPS systems (at least) want this to free alloca'd storage. */ 3167 #define FREE_VARIABLES() alloca (0) 3168 #endif /* not REGEX_MALLOC */ 3169 3170 3171 /* These values must meet several constraints. They must not be valid 3172 register values; since we have a limit of 255 registers (because 3173 we use only one byte in the pattern for the register number), we can 3174 use numbers larger than 255. They must differ by 1, because of 3175 NUM_FAILURE_ITEMS above. And the value for the lowest register must 3176 be larger than the value for the highest register, so we do not try 3177 to actually save any registers when none are active. */ 3178 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 3179 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 3180 3181 /* Matching routines. */ 3182 3183 #ifndef emacs /* Emacs never uses this. */ 3184 /* re_match is like re_match_2 except it takes only a single string. */ 3185 3186 int 3187 re_match (bufp, string, size, pos, regs) 3188 struct re_pattern_buffer *bufp; 3189 const char *string; 3190 int size, pos; 3191 struct re_registers *regs; 3192 { 3193 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); 3194 } 3195 #endif /* not emacs */ 3196 3197 3198 /* re_match_2 matches the compiled pattern in BUFP against the 3199 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 3200 and SIZE2, respectively). We start matching at POS, and stop 3201 matching at STOP. 3202 3203 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 3204 store offsets for the substring each group matched in REGS. See the 3205 documentation for exactly how many groups we fill. 3206 3207 We return -1 if no match, -2 if an internal error (such as the 3208 failure stack overflowing). Otherwise, we return the length of the 3209 matched substring. */ 3210 3211 int 3212 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 3213 struct re_pattern_buffer *bufp; 3214 const char *string1, *string2; 3215 int size1, size2; 3216 int pos; 3217 struct re_registers *regs; 3218 int stop; 3219 { 3220 /* General temporaries. */ 3221 int mcnt; 3222 unsigned char *p1; 3223 3224 /* Just past the end of the corresponding string. */ 3225 const char *end1, *end2; 3226 3227 /* Pointers into string1 and string2, just past the last characters in 3228 each to consider matching. */ 3229 const char *end_match_1, *end_match_2; 3230 3231 /* Where we are in the data, and the end of the current string. */ 3232 const char *d, *dend; 3233 3234 /* Where we are in the pattern, and the end of the pattern. */ 3235 unsigned char *p = bufp->buffer; 3236 register unsigned char *pend = p + bufp->used; 3237 3238 /* We use this to map every character in the string. */ 3239 char *translate = bufp->translate; 3240 3241 /* Failure point stack. Each place that can handle a failure further 3242 down the line pushes a failure point on this stack. It consists of 3243 restart, regend, and reg_info for all registers corresponding to 3244 the subexpressions we're currently inside, plus the number of such 3245 registers, and, finally, two char *'s. The first char * is where 3246 to resume scanning the pattern; the second one is where to resume 3247 scanning the strings. If the latter is zero, the failure point is 3248 a ``dummy''; if a failure happens and the failure point is a dummy, 3249 it gets discarded and the next next one is tried. */ 3250 fail_stack_type fail_stack; 3251 #ifdef DEBUG 3252 static unsigned failure_id = 0; 3253 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 3254 #endif 3255 3256 /* We fill all the registers internally, independent of what we 3257 return, for use in backreferences. The number here includes 3258 an element for register zero. */ 3259 unsigned num_regs = bufp->re_nsub + 1; 3260 3261 /* The currently active registers. */ 3262 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3263 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3264 3265 /* Information on the contents of registers. These are pointers into 3266 the input strings; they record just what was matched (on this 3267 attempt) by a subexpression part of the pattern, that is, the 3268 regnum-th regstart pointer points to where in the pattern we began 3269 matching and the regnum-th regend points to right after where we 3270 stopped matching the regnum-th subexpression. (The zeroth register 3271 keeps track of what the whole pattern matches.) */ 3272 const char **regstart, **regend; 3273 3274 /* If a group that's operated upon by a repetition operator fails to 3275 match anything, then the register for its start will need to be 3276 restored because it will have been set to wherever in the string we 3277 are when we last see its open-group operator. Similarly for a 3278 register's end. */ 3279 const char **old_regstart, **old_regend; 3280 3281 /* The is_active field of reg_info helps us keep track of which (possibly 3282 nested) subexpressions we are currently in. The matched_something 3283 field of reg_info[reg_num] helps us tell whether or not we have 3284 matched any of the pattern so far this time through the reg_num-th 3285 subexpression. These two fields get reset each time through any 3286 loop their register is in. */ 3287 register_info_type *reg_info; 3288 3289 /* The following record the register info as found in the above 3290 variables when we find a match better than any we've seen before. 3291 This happens as we backtrack through the failure points, which in 3292 turn happens only if we have not yet matched the entire string. */ 3293 unsigned best_regs_set = false; 3294 const char **best_regstart, **best_regend; 3295 3296 /* Logically, this is `best_regend[0]'. But we don't want to have to 3297 allocate space for that if we're not allocating space for anything 3298 else (see below). Also, we never need info about register 0 for 3299 any of the other register vectors, and it seems rather a kludge to 3300 treat `best_regend' differently than the rest. So we keep track of 3301 the end of the best match so far in a separate variable. We 3302 initialize this to NULL so that when we backtrack the first time 3303 and need to test it, it's not garbage. */ 3304 const char *match_end = NULL; 3305 3306 /* Used when we pop values we don't care about. */ 3307 const char **reg_dummy; 3308 register_info_type *reg_info_dummy; 3309 3310 #ifdef DEBUG 3311 /* Counts the total number of registers pushed. */ 3312 unsigned num_regs_pushed = 0; 3313 #endif 3314 3315 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 3316 3317 INIT_FAIL_STACK (); 3318 3319 /* Do not bother to initialize all the register variables if there are 3320 no groups in the pattern, as it takes a fair amount of time. If 3321 there are groups, we include space for register 0 (the whole 3322 pattern), even though we never use it, since it simplifies the 3323 array indexing. We should fix this. */ 3324 if (bufp->re_nsub) 3325 { 3326 regstart = REGEX_TALLOC (num_regs, const char *); 3327 regend = REGEX_TALLOC (num_regs, const char *); 3328 old_regstart = REGEX_TALLOC (num_regs, const char *); 3329 old_regend = REGEX_TALLOC (num_regs, const char *); 3330 best_regstart = REGEX_TALLOC (num_regs, const char *); 3331 best_regend = REGEX_TALLOC (num_regs, const char *); 3332 reg_info = REGEX_TALLOC (num_regs, register_info_type); 3333 reg_dummy = REGEX_TALLOC (num_regs, const char *); 3334 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 3335 3336 if (!(regstart && regend && old_regstart && old_regend && reg_info 3337 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 3338 { 3339 FREE_VARIABLES (); 3340 return -2; 3341 } 3342 } 3343 #ifdef REGEX_MALLOC 3344 else 3345 { 3346 /* We must initialize all our variables to NULL, so that 3347 `FREE_VARIABLES' doesn't try to free them. */ 3348 regstart = regend = old_regstart = old_regend = best_regstart 3349 = best_regend = reg_dummy = NULL; 3350 reg_info = reg_info_dummy = (register_info_type *) NULL; 3351 } 3352 #endif /* REGEX_MALLOC */ 3353 3354 /* The starting position is bogus. */ 3355 if (pos < 0 || pos > size1 + size2) 3356 { 3357 FREE_VARIABLES (); 3358 return -1; 3359 } 3360 3361 /* Initialize subexpression text positions to -1 to mark ones that no 3362 start_memory/stop_memory has been seen for. Also initialize the 3363 register information struct. */ 3364 for (mcnt = 1; mcnt < num_regs; mcnt++) 3365 { 3366 regstart[mcnt] = regend[mcnt] 3367 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 3368 3369 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 3370 IS_ACTIVE (reg_info[mcnt]) = 0; 3371 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3372 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3373 } 3374 3375 /* We move `string1' into `string2' if the latter's empty -- but not if 3376 `string1' is null. */ 3377 if (size2 == 0 && string1 != NULL) 3378 { 3379 string2 = string1; 3380 size2 = size1; 3381 string1 = 0; 3382 size1 = 0; 3383 } 3384 end1 = string1 + size1; 3385 end2 = string2 + size2; 3386 3387 /* Compute where to stop matching, within the two strings. */ 3388 if (stop <= size1) 3389 { 3390 end_match_1 = string1 + stop; 3391 end_match_2 = string2; 3392 } 3393 else 3394 { 3395 end_match_1 = end1; 3396 end_match_2 = string2 + stop - size1; 3397 } 3398 3399 /* `p' scans through the pattern as `d' scans through the data. 3400 `dend' is the end of the input string that `d' points within. `d' 3401 is advanced into the following input string whenever necessary, but 3402 this happens before fetching; therefore, at the beginning of the 3403 loop, `d' can be pointing at the end of a string, but it cannot 3404 equal `string2'. */ 3405 if (size1 > 0 && pos <= size1) 3406 { 3407 d = string1 + pos; 3408 dend = end_match_1; 3409 } 3410 else 3411 { 3412 d = string2 + pos - size1; 3413 dend = end_match_2; 3414 } 3415 3416 DEBUG_PRINT1 ("The compiled pattern is: "); 3417 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 3418 DEBUG_PRINT1 ("The string to match is: `"); 3419 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 3420 DEBUG_PRINT1 ("'\n"); 3421 3422 /* This loops over pattern commands. It exits by returning from the 3423 function if the match is complete, or it drops through if the match 3424 fails at this starting point in the input data. */ 3425 for (;;) 3426 { 3427 DEBUG_PRINT2 ("\n0x%x: ", p); 3428 3429 if (p == pend) 3430 { /* End of pattern means we might have succeeded. */ 3431 DEBUG_PRINT1 ("end of pattern ... "); 3432 3433 /* If we haven't matched the entire string, and we want the 3434 longest match, try backtracking. */ 3435 if (d != end_match_2) 3436 { 3437 DEBUG_PRINT1 ("backtracking.\n"); 3438 3439 if (!FAIL_STACK_EMPTY ()) 3440 { /* More failure points to try. */ 3441 boolean same_str_p = (FIRST_STRING_P (match_end) 3442 == MATCHING_IN_FIRST_STRING); 3443 3444 /* If exceeds best match so far, save it. */ 3445 if (!best_regs_set 3446 || (same_str_p && d > match_end) 3447 || (!same_str_p && !MATCHING_IN_FIRST_STRING)) 3448 { 3449 best_regs_set = true; 3450 match_end = d; 3451 3452 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 3453 3454 for (mcnt = 1; mcnt < num_regs; mcnt++) 3455 { 3456 best_regstart[mcnt] = regstart[mcnt]; 3457 best_regend[mcnt] = regend[mcnt]; 3458 } 3459 } 3460 goto fail; 3461 } 3462 3463 /* If no failure points, don't restore garbage. */ 3464 else if (best_regs_set) 3465 { 3466 restore_best_regs: 3467 /* Restore best match. It may happen that `dend == 3468 end_match_1' while the restored d is in string2. 3469 For example, the pattern `x.*y.*z' against the 3470 strings `x-' and `y-z-', if the two strings are 3471 not consecutive in memory. */ 3472 DEBUG_PRINT1 ("Restoring best registers.\n"); 3473 3474 d = match_end; 3475 dend = ((d >= string1 && d <= end1) 3476 ? end_match_1 : end_match_2); 3477 3478 for (mcnt = 1; mcnt < num_regs; mcnt++) 3479 { 3480 regstart[mcnt] = best_regstart[mcnt]; 3481 regend[mcnt] = best_regend[mcnt]; 3482 } 3483 } 3484 } /* d != end_match_2 */ 3485 3486 DEBUG_PRINT1 ("Accepting match.\n"); 3487 3488 /* If caller wants register contents data back, do it. */ 3489 if (regs && !bufp->no_sub) 3490 { 3491 /* Have the register data arrays been allocated? */ 3492 if (bufp->regs_allocated == REGS_UNALLOCATED) 3493 { /* No. So allocate them with malloc. We need one 3494 extra element beyond `num_regs' for the `-1' marker 3495 GNU code uses. */ 3496 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 3497 regs->start = TALLOC (regs->num_regs, regoff_t); 3498 regs->end = TALLOC (regs->num_regs, regoff_t); 3499 if (regs->start == NULL || regs->end == NULL) 3500 return -2; 3501 bufp->regs_allocated = REGS_REALLOCATE; 3502 } 3503 else if (bufp->regs_allocated == REGS_REALLOCATE) 3504 { /* Yes. If we need more elements than were already 3505 allocated, reallocate them. If we need fewer, just 3506 leave it alone. */ 3507 if (regs->num_regs < num_regs + 1) 3508 { 3509 regs->num_regs = num_regs + 1; 3510 RETALLOC (regs->start, regs->num_regs, regoff_t); 3511 RETALLOC (regs->end, regs->num_regs, regoff_t); 3512 if (regs->start == NULL || regs->end == NULL) 3513 return -2; 3514 } 3515 } 3516 else 3517 { 3518 /* These braces fend off a "empty body in an else-statement" 3519 warning under GCC when assert expands to nothing. */ 3520 assert (bufp->regs_allocated == REGS_FIXED); 3521 } 3522 3523 /* Convert the pointer data in `regstart' and `regend' to 3524 indices. Register zero has to be set differently, 3525 since we haven't kept track of any info for it. */ 3526 if (regs->num_regs > 0) 3527 { 3528 regs->start[0] = pos; 3529 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 3530 : d - string2 + size1); 3531 } 3532 3533 /* Go through the first `min (num_regs, regs->num_regs)' 3534 registers, since that is all we initialized. */ 3535 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) 3536 { 3537 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 3538 regs->start[mcnt] = regs->end[mcnt] = -1; 3539 else 3540 { 3541 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); 3542 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); 3543 } 3544 } 3545 3546 /* If the regs structure we return has more elements than 3547 were in the pattern, set the extra elements to -1. If 3548 we (re)allocated the registers, this is the case, 3549 because we always allocate enough to have at least one 3550 -1 at the end. */ 3551 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) 3552 regs->start[mcnt] = regs->end[mcnt] = -1; 3553 } /* regs && !bufp->no_sub */ 3554 3555 FREE_VARIABLES (); 3556 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 3557 nfailure_points_pushed, nfailure_points_popped, 3558 nfailure_points_pushed - nfailure_points_popped); 3559 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 3560 3561 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 3562 ? string1 3563 : string2 - size1); 3564 3565 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 3566 3567 return mcnt; 3568 } 3569 3570 /* Otherwise match next pattern command. */ 3571 #ifdef SWITCH_ENUM_BUG 3572 switch ((int) ((re_opcode_t) *p++)) 3573 #else 3574 switch ((re_opcode_t) *p++) 3575 #endif 3576 { 3577 /* Ignore these. Used to ignore the n of succeed_n's which 3578 currently have n == 0. */ 3579 case no_op: 3580 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 3581 break; 3582 3583 3584 /* Match the next n pattern characters exactly. The following 3585 byte in the pattern defines n, and the n bytes after that 3586 are the characters to match. */ 3587 case exactn: 3588 mcnt = *p++; 3589 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 3590 3591 /* This is written out as an if-else so we don't waste time 3592 testing `translate' inside the loop. */ 3593 if (translate) 3594 { 3595 do 3596 { 3597 PREFETCH (); 3598 if (translate[(unsigned char) *d++] != (char) *p++) 3599 goto fail; 3600 } 3601 while (--mcnt); 3602 } 3603 else 3604 { 3605 do 3606 { 3607 PREFETCH (); 3608 if (*d++ != (char) *p++) goto fail; 3609 } 3610 while (--mcnt); 3611 } 3612 SET_REGS_MATCHED (); 3613 break; 3614 3615 3616 /* Match any character except possibly a newline or a null. */ 3617 case anychar: 3618 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 3619 3620 PREFETCH (); 3621 3622 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 3623 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 3624 goto fail; 3625 3626 SET_REGS_MATCHED (); 3627 DEBUG_PRINT2 (" Matched `%d'.\n", *d); 3628 d++; 3629 break; 3630 3631 3632 case charset: 3633 case charset_not: 3634 { 3635 register unsigned char c; 3636 boolean not = (re_opcode_t) *(p - 1) == charset_not; 3637 3638 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 3639 3640 PREFETCH (); 3641 c = TRANSLATE (*d); /* The character to match. */ 3642 3643 /* Cast to `unsigned' instead of `unsigned char' in case the 3644 bit list is a full 32 bytes long. */ 3645 if (c < (unsigned) (*p * BYTEWIDTH) 3646 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 3647 not = !not; 3648 3649 p += 1 + *p; 3650 3651 if (!not) goto fail; 3652 3653 SET_REGS_MATCHED (); 3654 d++; 3655 break; 3656 } 3657 3658 3659 /* The beginning of a group is represented by start_memory. 3660 The arguments are the register number in the next byte, and the 3661 number of groups inner to this one in the next. The text 3662 matched within the group is recorded (in the internal 3663 registers data structure) under the register number. */ 3664 case start_memory: 3665 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); 3666 3667 /* Find out if this group can match the empty string. */ 3668 p1 = p; /* To send to group_match_null_string_p. */ 3669 3670 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 3671 REG_MATCH_NULL_STRING_P (reg_info[*p]) 3672 = group_match_null_string_p (&p1, pend, reg_info); 3673 3674 /* Save the position in the string where we were the last time 3675 we were at this open-group operator in case the group is 3676 operated upon by a repetition operator, e.g., with `(a*)*b' 3677 against `ab'; then we want to ignore where we are now in 3678 the string in case this attempt to match fails. */ 3679 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3680 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 3681 : regstart[*p]; 3682 DEBUG_PRINT2 (" old_regstart: %d\n", 3683 POINTER_TO_OFFSET (old_regstart[*p])); 3684 3685 regstart[*p] = d; 3686 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 3687 3688 IS_ACTIVE (reg_info[*p]) = 1; 3689 MATCHED_SOMETHING (reg_info[*p]) = 0; 3690 3691 /* This is the new highest active register. */ 3692 highest_active_reg = *p; 3693 3694 /* If nothing was active before, this is the new lowest active 3695 register. */ 3696 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 3697 lowest_active_reg = *p; 3698 3699 /* Move past the register number and inner group count. */ 3700 p += 2; 3701 break; 3702 3703 3704 /* The stop_memory opcode represents the end of a group. Its 3705 arguments are the same as start_memory's: the register 3706 number, and the number of inner groups. */ 3707 case stop_memory: 3708 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); 3709 3710 /* We need to save the string position the last time we were at 3711 this close-group operator in case the group is operated 3712 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 3713 against `aba'; then we want to ignore where we are now in 3714 the string in case this attempt to match fails. */ 3715 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3716 ? REG_UNSET (regend[*p]) ? d : regend[*p] 3717 : regend[*p]; 3718 DEBUG_PRINT2 (" old_regend: %d\n", 3719 POINTER_TO_OFFSET (old_regend[*p])); 3720 3721 regend[*p] = d; 3722 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 3723 3724 /* This register isn't active anymore. */ 3725 IS_ACTIVE (reg_info[*p]) = 0; 3726 3727 /* If this was the only register active, nothing is active 3728 anymore. */ 3729 if (lowest_active_reg == highest_active_reg) 3730 { 3731 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3732 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3733 } 3734 else 3735 { /* We must scan for the new highest active register, since 3736 it isn't necessarily one less than now: consider 3737 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 3738 new highest active register is 1. */ 3739 unsigned char r = *p - 1; 3740 while (r > 0 && !IS_ACTIVE (reg_info[r])) 3741 r--; 3742 3743 /* If we end up at register zero, that means that we saved 3744 the registers as the result of an `on_failure_jump', not 3745 a `start_memory', and we jumped to past the innermost 3746 `stop_memory'. For example, in ((.)*) we save 3747 registers 1 and 2 as a result of the *, but when we pop 3748 back to the second ), we are at the stop_memory 1. 3749 Thus, nothing is active. */ 3750 if (r == 0) 3751 { 3752 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3753 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3754 } 3755 else 3756 highest_active_reg = r; 3757 } 3758 3759 /* If just failed to match something this time around with a 3760 group that's operated on by a repetition operator, try to 3761 force exit from the ``loop'', and restore the register 3762 information for this group that we had before trying this 3763 last match. */ 3764 if ((!MATCHED_SOMETHING (reg_info[*p]) 3765 || (re_opcode_t) p[-3] == start_memory) 3766 && (p + 2) < pend) 3767 { 3768 boolean is_a_jump_n = false; 3769 3770 p1 = p + 2; 3771 mcnt = 0; 3772 switch ((re_opcode_t) *p1++) 3773 { 3774 case jump_n: 3775 is_a_jump_n = true; 3776 case pop_failure_jump: 3777 case maybe_pop_jump: 3778 case jump: 3779 case dummy_failure_jump: 3780 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3781 if (is_a_jump_n) 3782 p1 += 2; 3783 break; 3784 3785 default: 3786 /* do nothing */ ; 3787 } 3788 p1 += mcnt; 3789 3790 /* If the next operation is a jump backwards in the pattern 3791 to an on_failure_jump right before the start_memory 3792 corresponding to this stop_memory, exit from the loop 3793 by forcing a failure after pushing on the stack the 3794 on_failure_jump's jump in the pattern, and d. */ 3795 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 3796 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) 3797 { 3798 /* If this group ever matched anything, then restore 3799 what its registers were before trying this last 3800 failed match, e.g., with `(a*)*b' against `ab' for 3801 regstart[1], and, e.g., with `((a*)*(b*)*)*' 3802 against `aba' for regend[3]. 3803 3804 Also restore the registers for inner groups for, 3805 e.g., `((a*)(b*))*' against `aba' (register 3 would 3806 otherwise get trashed). */ 3807 3808 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 3809 { 3810 unsigned r; 3811 3812 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 3813 3814 /* Restore this and inner groups' (if any) registers. */ 3815 for (r = *p; r < *p + *(p + 1); r++) 3816 { 3817 regstart[r] = old_regstart[r]; 3818 3819 /* xx why this test? */ 3820 if (old_regend[r] >= regstart[r]) 3821 regend[r] = old_regend[r]; 3822 } 3823 } 3824 p1++; 3825 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3826 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 3827 3828 goto fail; 3829 } 3830 } 3831 3832 /* Move past the register number and the inner group count. */ 3833 p += 2; 3834 break; 3835 3836 3837 /* \<digit> has been turned into a `duplicate' command which is 3838 followed by the numeric value of <digit> as the register number. */ 3839 case duplicate: 3840 { 3841 register const char *d2, *dend2; 3842 int regno = *p++; /* Get which register to match against. */ 3843 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 3844 3845 /* Can't back reference a group which we've never matched. */ 3846 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 3847 goto fail; 3848 3849 /* Where in input to try to start matching. */ 3850 d2 = regstart[regno]; 3851 3852 /* Where to stop matching; if both the place to start and 3853 the place to stop matching are in the same string, then 3854 set to the place to stop, otherwise, for now have to use 3855 the end of the first string. */ 3856 3857 dend2 = ((FIRST_STRING_P (regstart[regno]) 3858 == FIRST_STRING_P (regend[regno])) 3859 ? regend[regno] : end_match_1); 3860 for (;;) 3861 { 3862 /* If necessary, advance to next segment in register 3863 contents. */ 3864 while (d2 == dend2) 3865 { 3866 if (dend2 == end_match_2) break; 3867 if (dend2 == regend[regno]) break; 3868 3869 /* End of string1 => advance to string2. */ 3870 d2 = string2; 3871 dend2 = regend[regno]; 3872 } 3873 /* At end of register contents => success */ 3874 if (d2 == dend2) break; 3875 3876 /* If necessary, advance to next segment in data. */ 3877 PREFETCH (); 3878 3879 /* How many characters left in this segment to match. */ 3880 mcnt = dend - d; 3881 3882 /* Want how many consecutive characters we can match in 3883 one shot, so, if necessary, adjust the count. */ 3884 if (mcnt > dend2 - d2) 3885 mcnt = dend2 - d2; 3886 3887 /* Compare that many; failure if mismatch, else move 3888 past them. */ 3889 if (translate 3890 ? bcmp_translate (d, d2, mcnt, translate) 3891 : bcmp (d, d2, mcnt)) 3892 goto fail; 3893 d += mcnt, d2 += mcnt; 3894 } 3895 } 3896 break; 3897 3898 3899 /* begline matches the empty string at the beginning of the string 3900 (unless `not_bol' is set in `bufp'), and, if 3901 `newline_anchor' is set, after newlines. */ 3902 case begline: 3903 DEBUG_PRINT1 ("EXECUTING begline.\n"); 3904 3905 if (AT_STRINGS_BEG (d)) 3906 { 3907 if (!bufp->not_bol) break; 3908 } 3909 else if (d[-1] == '\n' && bufp->newline_anchor) 3910 { 3911 break; 3912 } 3913 /* In all other cases, we fail. */ 3914 goto fail; 3915 3916 3917 /* endline is the dual of begline. */ 3918 case endline: 3919 DEBUG_PRINT1 ("EXECUTING endline.\n"); 3920 3921 if (AT_STRINGS_END (d)) 3922 { 3923 if (!bufp->not_eol) break; 3924 } 3925 3926 /* We have to ``prefetch'' the next character. */ 3927 else if ((d == end1 ? *string2 : *d) == '\n' 3928 && bufp->newline_anchor) 3929 { 3930 break; 3931 } 3932 goto fail; 3933 3934 3935 /* Match at the very beginning of the data. */ 3936 case begbuf: 3937 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 3938 if (AT_STRINGS_BEG (d)) 3939 break; 3940 goto fail; 3941 3942 3943 /* Match at the very end of the data. */ 3944 case endbuf: 3945 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 3946 if (AT_STRINGS_END (d)) 3947 break; 3948 goto fail; 3949 3950 3951 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 3952 pushes NULL as the value for the string on the stack. Then 3953 `pop_failure_point' will keep the current value for the 3954 string, instead of restoring it. To see why, consider 3955 matching `foo\nbar' against `.*\n'. The .* matches the foo; 3956 then the . fails against the \n. But the next thing we want 3957 to do is match the \n against the \n; if we restored the 3958 string value, we would be back at the foo. 3959 3960 Because this is used only in specific cases, we don't need to 3961 check all the things that `on_failure_jump' does, to make 3962 sure the right things get saved on the stack. Hence we don't 3963 share its code. The only reason to push anything on the 3964 stack at all is that otherwise we would have to change 3965 `anychar's code to do something besides goto fail in this 3966 case; that seems worse than this. */ 3967 case on_failure_keep_string_jump: 3968 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 3969 3970 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3971 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 3972 3973 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 3974 break; 3975 3976 3977 /* Uses of on_failure_jump: 3978 3979 Each alternative starts with an on_failure_jump that points 3980 to the beginning of the next alternative. Each alternative 3981 except the last ends with a jump that in effect jumps past 3982 the rest of the alternatives. (They really jump to the 3983 ending jump of the following alternative, because tensioning 3984 these jumps is a hassle.) 3985 3986 Repeats start with an on_failure_jump that points past both 3987 the repetition text and either the following jump or 3988 pop_failure_jump back to this on_failure_jump. */ 3989 case on_failure_jump: 3990 on_failure: 3991 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 3992 3993 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3994 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 3995 3996 /* If this on_failure_jump comes right before a group (i.e., 3997 the original * applied to a group), save the information 3998 for that group and all inner ones, so that if we fail back 3999 to this point, the group's information will be correct. 4000 For example, in \(a*\)*\1, we need the preceding group, 4001 and in \(\(a*\)b*\)\2, we need the inner group. */ 4002 4003 /* We can't use `p' to check ahead because we push 4004 a failure point to `p + mcnt' after we do this. */ 4005 p1 = p; 4006 4007 /* We need to skip no_op's before we look for the 4008 start_memory in case this on_failure_jump is happening as 4009 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 4010 against aba. */ 4011 while (p1 < pend && (re_opcode_t) *p1 == no_op) 4012 p1++; 4013 4014 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 4015 { 4016 /* We have a new highest active register now. This will 4017 get reset at the start_memory we are about to get to, 4018 but we will have saved all the registers relevant to 4019 this repetition op, as described above. */ 4020 highest_active_reg = *(p1 + 1) + *(p1 + 2); 4021 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 4022 lowest_active_reg = *(p1 + 1); 4023 } 4024 4025 DEBUG_PRINT1 (":\n"); 4026 PUSH_FAILURE_POINT (p + mcnt, d, -2); 4027 break; 4028 4029 4030 /* A smart repeat ends with `maybe_pop_jump'. 4031 We change it to either `pop_failure_jump' or `jump'. */ 4032 case maybe_pop_jump: 4033 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4034 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 4035 { 4036 register unsigned char *p2 = p; 4037 4038 /* Compare the beginning of the repeat with what in the 4039 pattern follows its end. If we can establish that there 4040 is nothing that they would both match, i.e., that we 4041 would have to backtrack because of (as in, e.g., `a*a') 4042 then we can change to pop_failure_jump, because we'll 4043 never have to backtrack. 4044 4045 This is not true in the case of alternatives: in 4046 `(a|ab)*' we do need to backtrack to the `ab' alternative 4047 (e.g., if the string was `ab'). But instead of trying to 4048 detect that here, the alternative has put on a dummy 4049 failure point which is what we will end up popping. */ 4050 4051 /* Skip over open/close-group commands. */ 4052 while (p2 + 2 < pend 4053 && ((re_opcode_t) *p2 == stop_memory 4054 || (re_opcode_t) *p2 == start_memory)) 4055 p2 += 3; /* Skip over args, too. */ 4056 4057 /* If we're at the end of the pattern, we can change. */ 4058 if (p2 == pend) 4059 { 4060 /* Consider what happens when matching ":\(.*\)" 4061 against ":/". I don't really understand this code 4062 yet. */ 4063 p[-3] = (unsigned char) pop_failure_jump; 4064 DEBUG_PRINT1 4065 (" End of pattern: change to `pop_failure_jump'.\n"); 4066 } 4067 4068 else if ((re_opcode_t) *p2 == exactn 4069 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 4070 { 4071 register unsigned char c 4072 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 4073 p1 = p + mcnt; 4074 4075 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 4076 to the `maybe_finalize_jump' of this case. Examine what 4077 follows. */ 4078 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) 4079 { 4080 p[-3] = (unsigned char) pop_failure_jump; 4081 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 4082 c, p1[5]); 4083 } 4084 4085 else if ((re_opcode_t) p1[3] == charset 4086 || (re_opcode_t) p1[3] == charset_not) 4087 { 4088 int not = (re_opcode_t) p1[3] == charset_not; 4089 4090 if (c < (unsigned char) (p1[4] * BYTEWIDTH) 4091 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4092 not = !not; 4093 4094 /* `not' is equal to 1 if c would match, which means 4095 that we can't change to pop_failure_jump. */ 4096 if (!not) 4097 { 4098 p[-3] = (unsigned char) pop_failure_jump; 4099 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4100 } 4101 } 4102 } 4103 } 4104 p -= 2; /* Point at relative address again. */ 4105 if ((re_opcode_t) p[-1] != pop_failure_jump) 4106 { 4107 p[-1] = (unsigned char) jump; 4108 DEBUG_PRINT1 (" Match => jump.\n"); 4109 goto unconditional_jump; 4110 } 4111 /* Note fall through. */ 4112 4113 4114 /* The end of a simple repeat has a pop_failure_jump back to 4115 its matching on_failure_jump, where the latter will push a 4116 failure point. The pop_failure_jump takes off failure 4117 points put on by this pop_failure_jump's matching 4118 on_failure_jump; we got through the pattern to here from the 4119 matching on_failure_jump, so didn't fail. */ 4120 case pop_failure_jump: 4121 { 4122 /* We need to pass separate storage for the lowest and 4123 highest registers, even though we don't care about the 4124 actual values. Otherwise, we will restore only one 4125 register from the stack, since lowest will == highest in 4126 `pop_failure_point'. */ 4127 unsigned dummy_low_reg, dummy_high_reg; 4128 unsigned char *pdummy; 4129 const char *sdummy; 4130 4131 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 4132 POP_FAILURE_POINT (sdummy, pdummy, 4133 dummy_low_reg, dummy_high_reg, 4134 reg_dummy, reg_dummy, reg_info_dummy); 4135 } 4136 /* Note fall through. */ 4137 4138 4139 /* Unconditionally jump (without popping any failure points). */ 4140 case jump: 4141 unconditional_jump: 4142 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 4143 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 4144 p += mcnt; /* Do the jump. */ 4145 DEBUG_PRINT2 ("(to 0x%x).\n", p); 4146 break; 4147 4148 4149 /* We need this opcode so we can detect where alternatives end 4150 in `group_match_null_string_p' et al. */ 4151 case jump_past_alt: 4152 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 4153 goto unconditional_jump; 4154 4155 4156 /* Normally, the on_failure_jump pushes a failure point, which 4157 then gets popped at pop_failure_jump. We will end up at 4158 pop_failure_jump, also, and with a pattern of, say, `a+', we 4159 are skipping over the on_failure_jump, so we have to push 4160 something meaningless for pop_failure_jump to pop. */ 4161 case dummy_failure_jump: 4162 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 4163 /* It doesn't matter what we push for the string here. What 4164 the code at `fail' tests is the value for the pattern. */ 4165 PUSH_FAILURE_POINT (0, 0, -2); 4166 goto unconditional_jump; 4167 4168 4169 /* At the end of an alternative, we need to push a dummy failure 4170 point in case we are followed by a `pop_failure_jump', because 4171 we don't want the failure point for the alternative to be 4172 popped. For example, matching `(a|ab)*' against `aab' 4173 requires that we match the `ab' alternative. */ 4174 case push_dummy_failure: 4175 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 4176 /* See comments just above at `dummy_failure_jump' about the 4177 two zeroes. */ 4178 PUSH_FAILURE_POINT (0, 0, -2); 4179 break; 4180 4181 /* Have to succeed matching what follows at least n times. 4182 After that, handle like `on_failure_jump'. */ 4183 case succeed_n: 4184 EXTRACT_NUMBER (mcnt, p + 2); 4185 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 4186 4187 assert (mcnt >= 0); 4188 /* Originally, this is how many times we HAVE to succeed. */ 4189 if (mcnt > 0) 4190 { 4191 mcnt--; 4192 p += 2; 4193 STORE_NUMBER_AND_INCR (p, mcnt); 4194 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); 4195 } 4196 else if (mcnt == 0) 4197 { 4198 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); 4199 p[2] = (unsigned char) no_op; 4200 p[3] = (unsigned char) no_op; 4201 goto on_failure; 4202 } 4203 break; 4204 4205 case jump_n: 4206 EXTRACT_NUMBER (mcnt, p + 2); 4207 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 4208 4209 /* Originally, this is how many times we CAN jump. */ 4210 if (mcnt) 4211 { 4212 mcnt--; 4213 STORE_NUMBER (p + 2, mcnt); 4214 goto unconditional_jump; 4215 } 4216 /* If don't have to jump any more, skip over the rest of command. */ 4217 else 4218 p += 4; 4219 break; 4220 4221 case set_number_at: 4222 { 4223 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 4224 4225 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4226 p1 = p + mcnt; 4227 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4228 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 4229 STORE_NUMBER (p1, mcnt); 4230 break; 4231 } 4232 4233 case wordbound: 4234 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 4235 if (AT_WORD_BOUNDARY (d)) 4236 break; 4237 goto fail; 4238 4239 case notwordbound: 4240 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 4241 if (AT_WORD_BOUNDARY (d)) 4242 goto fail; 4243 break; 4244 4245 case wordbeg: 4246 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 4247 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 4248 break; 4249 goto fail; 4250 4251 case wordend: 4252 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 4253 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 4254 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 4255 break; 4256 goto fail; 4257 4258 #ifdef emacs 4259 case before_dot: 4260 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 4261 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 4262 goto fail; 4263 break; 4264 4265 case at_dot: 4266 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4267 if (PTR_CHAR_POS ((unsigned char *) d) != point) 4268 goto fail; 4269 break; 4270 4271 case after_dot: 4272 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 4273 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 4274 goto fail; 4275 break; 4276 4277 case syntaxspec: 4278 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 4279 mcnt = *p++; 4280 goto matchsyntax; 4281 4282 case wordchar: 4283 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 4284 mcnt = (int) Sword; 4285 matchsyntax: 4286 PREFETCH (); 4287 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) 4288 goto fail; 4289 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 4290 d++; 4291 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) 4292 goto fail; 4293 SET_REGS_MATCHED (); 4294 break; 4295 4296 case notsyntaxspec: 4297 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 4298 mcnt = *p++; 4299 goto matchnotsyntax; 4300 4301 case notwordchar: 4302 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 4303 mcnt = (int) Sword; 4304 matchnotsyntax: 4305 PREFETCH (); 4306 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 4307 d++; 4308 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) 4309 goto fail; 4310 SET_REGS_MATCHED (); 4311 break; 4312 4313 #else /* not emacs */ 4314 case wordchar: 4315 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 4316 PREFETCH (); 4317 if (!WORDCHAR_P (d)) 4318 goto fail; 4319 SET_REGS_MATCHED (); 4320 d++; 4321 break; 4322 4323 case notwordchar: 4324 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 4325 PREFETCH (); 4326 if (WORDCHAR_P (d)) 4327 goto fail; 4328 SET_REGS_MATCHED (); 4329 d++; 4330 break; 4331 #endif /* not emacs */ 4332 4333 default: 4334 abort (); 4335 } 4336 continue; /* Successfully executed one pattern command; keep going. */ 4337 4338 4339 /* We goto here if a matching operation fails. */ 4340 fail: 4341 if (!FAIL_STACK_EMPTY ()) 4342 { /* A restart point is known. Restore to that state. */ 4343 DEBUG_PRINT1 ("\nFAIL:\n"); 4344 POP_FAILURE_POINT (d, p, 4345 lowest_active_reg, highest_active_reg, 4346 regstart, regend, reg_info); 4347 4348 /* If this failure point is a dummy, try the next one. */ 4349 if (!p) 4350 goto fail; 4351 4352 /* If we failed to the end of the pattern, don't examine *p. */ 4353 assert (p <= pend); 4354 if (p < pend) 4355 { 4356 boolean is_a_jump_n = false; 4357 4358 /* If failed to a backwards jump that's part of a repetition 4359 loop, need to pop this failure point and use the next one. */ 4360 switch ((re_opcode_t) *p) 4361 { 4362 case jump_n: 4363 is_a_jump_n = true; 4364 case maybe_pop_jump: 4365 case pop_failure_jump: 4366 case jump: 4367 p1 = p + 1; 4368 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4369 p1 += mcnt; 4370 4371 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 4372 || (!is_a_jump_n 4373 && (re_opcode_t) *p1 == on_failure_jump)) 4374 goto fail; 4375 break; 4376 default: 4377 /* do nothing */ ; 4378 } 4379 } 4380 4381 if (d >= string1 && d <= end1) 4382 dend = end_match_1; 4383 } 4384 else 4385 break; /* Matching at this starting point really fails. */ 4386 } /* for (;;) */ 4387 4388 if (best_regs_set) 4389 goto restore_best_regs; 4390 4391 FREE_VARIABLES (); 4392 4393 return -1; /* Failure to match. */ 4394 } /* re_match_2 */ 4395 4396 /* Subroutine definitions for re_match_2. */ 4397 4398 4399 /* We are passed P pointing to a register number after a start_memory. 4400 4401 Return true if the pattern up to the corresponding stop_memory can 4402 match the empty string, and false otherwise. 4403 4404 If we find the matching stop_memory, sets P to point to one past its number. 4405 Otherwise, sets P to an undefined byte less than or equal to END. 4406 4407 We don't handle duplicates properly (yet). */ 4408 4409 static boolean 4410 group_match_null_string_p (p, end, reg_info) 4411 unsigned char **p, *end; 4412 register_info_type *reg_info; 4413 { 4414 int mcnt; 4415 /* Point to after the args to the start_memory. */ 4416 unsigned char *p1 = *p + 2; 4417 4418 while (p1 < end) 4419 { 4420 /* Skip over opcodes that can match nothing, and return true or 4421 false, as appropriate, when we get to one that can't, or to the 4422 matching stop_memory. */ 4423 4424 switch ((re_opcode_t) *p1) 4425 { 4426 /* Could be either a loop or a series of alternatives. */ 4427 case on_failure_jump: 4428 p1++; 4429 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4430 4431 /* If the next operation is not a jump backwards in the 4432 pattern. */ 4433 4434 if (mcnt >= 0) 4435 { 4436 /* Go through the on_failure_jumps of the alternatives, 4437 seeing if any of the alternatives cannot match nothing. 4438 The last alternative starts with only a jump, 4439 whereas the rest start with on_failure_jump and end 4440 with a jump, e.g., here is the pattern for `a|b|c': 4441 4442 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 4443 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 4444 /exactn/1/c 4445 4446 So, we have to first go through the first (n-1) 4447 alternatives and then deal with the last one separately. */ 4448 4449 4450 /* Deal with the first (n-1) alternatives, which start 4451 with an on_failure_jump (see above) that jumps to right 4452 past a jump_past_alt. */ 4453 4454 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) 4455 { 4456 /* `mcnt' holds how many bytes long the alternative 4457 is, including the ending `jump_past_alt' and 4458 its number. */ 4459 4460 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, 4461 reg_info)) 4462 return false; 4463 4464 /* Move to right after this alternative, including the 4465 jump_past_alt. */ 4466 p1 += mcnt; 4467 4468 /* Break if it's the beginning of an n-th alternative 4469 that doesn't begin with an on_failure_jump. */ 4470 if ((re_opcode_t) *p1 != on_failure_jump) 4471 break; 4472 4473 /* Still have to check that it's not an n-th 4474 alternative that starts with an on_failure_jump. */ 4475 p1++; 4476 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4477 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) 4478 { 4479 /* Get to the beginning of the n-th alternative. */ 4480 p1 -= 3; 4481 break; 4482 } 4483 } 4484 4485 /* Deal with the last alternative: go back and get number 4486 of the `jump_past_alt' just before it. `mcnt' contains 4487 the length of the alternative. */ 4488 EXTRACT_NUMBER (mcnt, p1 - 2); 4489 4490 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 4491 return false; 4492 4493 p1 += mcnt; /* Get past the n-th alternative. */ 4494 } /* if mcnt > 0 */ 4495 break; 4496 4497 4498 case stop_memory: 4499 assert (p1[1] == **p); 4500 *p = p1 + 2; 4501 return true; 4502 4503 4504 default: 4505 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4506 return false; 4507 } 4508 } /* while p1 < end */ 4509 4510 return false; 4511 } /* group_match_null_string_p */ 4512 4513 4514 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: 4515 It expects P to be the first byte of a single alternative and END one 4516 byte past the last. The alternative can contain groups. */ 4517 4518 static boolean 4519 alt_match_null_string_p (p, end, reg_info) 4520 unsigned char *p, *end; 4521 register_info_type *reg_info; 4522 { 4523 int mcnt; 4524 unsigned char *p1 = p; 4525 4526 while (p1 < end) 4527 { 4528 /* Skip over opcodes that can match nothing, and break when we get 4529 to one that can't. */ 4530 4531 switch ((re_opcode_t) *p1) 4532 { 4533 /* It's a loop. */ 4534 case on_failure_jump: 4535 p1++; 4536 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4537 p1 += mcnt; 4538 break; 4539 4540 default: 4541 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4542 return false; 4543 } 4544 } /* while p1 < end */ 4545 4546 return true; 4547 } /* alt_match_null_string_p */ 4548 4549 4550 /* Deals with the ops common to group_match_null_string_p and 4551 alt_match_null_string_p. 4552 4553 Sets P to one after the op and its arguments, if any. */ 4554 4555 static boolean 4556 common_op_match_null_string_p (p, end, reg_info) 4557 unsigned char **p, *end; 4558 register_info_type *reg_info; 4559 { 4560 int mcnt; 4561 boolean ret; 4562 int reg_no; 4563 unsigned char *p1 = *p; 4564 4565 switch ((re_opcode_t) *p1++) 4566 { 4567 case no_op: 4568 case begline: 4569 case endline: 4570 case begbuf: 4571 case endbuf: 4572 case wordbeg: 4573 case wordend: 4574 case wordbound: 4575 case notwordbound: 4576 #ifdef emacs 4577 case before_dot: 4578 case at_dot: 4579 case after_dot: 4580 #endif 4581 break; 4582 4583 case start_memory: 4584 reg_no = *p1; 4585 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 4586 ret = group_match_null_string_p (&p1, end, reg_info); 4587 4588 /* Have to set this here in case we're checking a group which 4589 contains a group and a back reference to it. */ 4590 4591 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 4592 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 4593 4594 if (!ret) 4595 return false; 4596 break; 4597 4598 /* If this is an optimized succeed_n for zero times, make the jump. */ 4599 case jump: 4600 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4601 if (mcnt >= 0) 4602 p1 += mcnt; 4603 else 4604 return false; 4605 break; 4606 4607 case succeed_n: 4608 /* Get to the number of times to succeed. */ 4609 p1 += 2; 4610 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4611 4612 if (mcnt == 0) 4613 { 4614 p1 -= 4; 4615 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4616 p1 += mcnt; 4617 } 4618 else 4619 return false; 4620 break; 4621 4622 case duplicate: 4623 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 4624 return false; 4625 break; 4626 4627 case set_number_at: 4628 p1 += 4; 4629 4630 default: 4631 /* All other opcodes mean we cannot match the empty string. */ 4632 return false; 4633 } 4634 4635 *p = p1; 4636 return true; 4637 } /* common_op_match_null_string_p */ 4638 4639 4640 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 4641 bytes; nonzero otherwise. */ 4642 4643 static int 4644 bcmp_translate (s1, s2, len, translate) 4645 unsigned char *s1, *s2; 4646 register int len; 4647 char *translate; 4648 { 4649 register unsigned char *p1 = s1, *p2 = s2; 4650 while (len) 4651 { 4652 if (translate[*p1++] != translate[*p2++]) return 1; 4653 len--; 4654 } 4655 return 0; 4656 } 4657 4658 /* Entry points for GNU code. */ 4659 4660 /* re_compile_pattern is the GNU regular expression compiler: it 4661 compiles PATTERN (of length SIZE) and puts the result in BUFP. 4662 Returns 0 if the pattern was valid, otherwise an error string. 4663 4664 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 4665 are set in BUFP on entry. 4666 4667 We call regex_compile to do the actual compilation. */ 4668 4669 const char * 4670 re_compile_pattern (pattern, length, bufp) 4671 const char *pattern; 4672 int length; 4673 struct re_pattern_buffer *bufp; 4674 { 4675 reg_errcode_t ret; 4676 4677 /* GNU code is written to assume at least RE_NREGS registers will be set 4678 (and at least one extra will be -1). */ 4679 bufp->regs_allocated = REGS_UNALLOCATED; 4680 4681 /* And GNU code determines whether or not to get register information 4682 by passing null for the REGS argument to re_match, etc., not by 4683 setting no_sub. */ 4684 bufp->no_sub = 0; 4685 4686 /* Match anchors at newline. */ 4687 bufp->newline_anchor = 1; 4688 4689 ret = regex_compile (pattern, length, re_syntax_options, bufp); 4690 4691 return re_error_msg[(int) ret]; 4692 } 4693 4694 /* Entry points compatible with 4.2 BSD regex library. We don't define 4695 them if this is an Emacs or POSIX compilation. */ 4696 4697 #if !defined (emacs) 4698 4699 /* BSD has one and only one pattern buffer. */ 4700 static struct re_pattern_buffer re_comp_buf; 4701 4702 char * 4703 re_comp (s) 4704 const char *s; 4705 { 4706 reg_errcode_t ret; 4707 4708 if (!s) 4709 { 4710 if (!re_comp_buf.buffer) 4711 return "No previous regular expression"; 4712 return 0; 4713 } 4714 4715 if (!re_comp_buf.buffer) 4716 { 4717 re_comp_buf.buffer = (unsigned char *) malloc (200); 4718 if (re_comp_buf.buffer == NULL) 4719 return "Memory exhausted"; 4720 re_comp_buf.allocated = 200; 4721 4722 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 4723 if (re_comp_buf.fastmap == NULL) 4724 return "Memory exhausted"; 4725 } 4726 4727 /* Since `re_exec' always passes NULL for the `regs' argument, we 4728 don't need to initialize the pattern buffer fields which affect it. */ 4729 4730 /* Match anchors at newlines. */ 4731 re_comp_buf.newline_anchor = 1; 4732 4733 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 4734 4735 /* Yes, we're discarding `const' here. */ 4736 return (char *) re_error_msg[(int) ret]; 4737 } 4738 4739 4740 int 4741 re_exec (s) 4742 const char *s; 4743 { 4744 const int len = strlen (s); 4745 return 4746 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 4747 } 4748 #endif /* not emacs and not _POSIX_SOURCE */ 4749 4750 /* POSIX.2 functions. Don't define these for Emacs. */ 4751 4752 #ifndef emacs 4753 4754 /* regcomp takes a regular expression as a string and compiles it. 4755 4756 PREG is a regex_t *. We do not expect any fields to be initialized, 4757 since POSIX says we shouldn't. Thus, we set 4758 4759 `buffer' to the compiled pattern; 4760 `used' to the length of the compiled pattern; 4761 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 4762 REG_EXTENDED bit in CFLAGS is set; otherwise, to 4763 RE_SYNTAX_POSIX_BASIC; 4764 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 4765 `fastmap' and `fastmap_accurate' to zero; 4766 `re_nsub' to the number of subexpressions in PATTERN. 4767 4768 PATTERN is the address of the pattern string. 4769 4770 CFLAGS is a series of bits which affect compilation. 4771 4772 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 4773 use POSIX basic syntax. 4774 4775 If REG_NEWLINE is set, then . and [^...] don't match newline. 4776 Also, regexec will try a match beginning after every newline. 4777 4778 If REG_ICASE is set, then we considers upper- and lowercase 4779 versions of letters to be equivalent when matching. 4780 4781 If REG_NOSUB is set, then when PREG is passed to regexec, that 4782 routine will report only success or failure, and nothing about the 4783 registers. 4784 4785 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 4786 the return codes and their meanings.) */ 4787 4788 int 4789 regcomp (preg, pattern, cflags) 4790 regex_t *preg; 4791 const char *pattern; 4792 int cflags; 4793 { 4794 reg_errcode_t ret; 4795 unsigned syntax 4796 = (cflags & REG_EXTENDED) ? 4797 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 4798 4799 /* regex_compile will allocate the space for the compiled pattern. */ 4800 preg->buffer = 0; 4801 preg->allocated = 0; 4802 4803 /* Don't bother to use a fastmap when searching. This simplifies the 4804 REG_NEWLINE case: if we used a fastmap, we'd have to put all the 4805 characters after newlines into the fastmap. This way, we just try 4806 every character. */ 4807 preg->fastmap = 0; 4808 4809 if (cflags & REG_ICASE) 4810 { 4811 unsigned i; 4812 4813 preg->translate = (char *) malloc (CHAR_SET_SIZE); 4814 if (preg->translate == NULL) 4815 return (int) REG_ESPACE; 4816 4817 /* Map uppercase characters to corresponding lowercase ones. */ 4818 for (i = 0; i < CHAR_SET_SIZE; i++) 4819 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; 4820 } 4821 else 4822 preg->translate = NULL; 4823 4824 /* If REG_NEWLINE is set, newlines are treated differently. */ 4825 if (cflags & REG_NEWLINE) 4826 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 4827 syntax &= ~RE_DOT_NEWLINE; 4828 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 4829 /* It also changes the matching behavior. */ 4830 preg->newline_anchor = 1; 4831 } 4832 else 4833 preg->newline_anchor = 0; 4834 4835 preg->no_sub = !!(cflags & REG_NOSUB); 4836 4837 /* POSIX says a null character in the pattern terminates it, so we 4838 can use strlen here in compiling the pattern. */ 4839 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 4840 4841 /* POSIX doesn't distinguish between an unmatched open-group and an 4842 unmatched close-group: both are REG_EPAREN. */ 4843 if (ret == REG_ERPAREN) ret = REG_EPAREN; 4844 4845 return (int) ret; 4846 } 4847 4848 4849 /* regexec searches for a given pattern, specified by PREG, in the 4850 string STRING. 4851 4852 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 4853 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 4854 least NMATCH elements, and we set them to the offsets of the 4855 corresponding matched substrings. 4856 4857 EFLAGS specifies `execution flags' which affect matching: if 4858 REG_NOTBOL is set, then ^ does not match at the beginning of the 4859 string; if REG_NOTEOL is set, then $ does not match at the end. 4860 4861 We return 0 if we find a match and REG_NOMATCH if not. */ 4862 4863 int 4864 regexec (preg, string, nmatch, pmatch, eflags) 4865 const regex_t *preg; 4866 const char *string; 4867 size_t nmatch; 4868 regmatch_t pmatch[]; 4869 int eflags; 4870 { 4871 int ret; 4872 struct re_registers regs; 4873 regex_t private_preg; 4874 int len = strlen (string); 4875 boolean want_reg_info = !preg->no_sub && nmatch > 0; 4876 4877 private_preg = *preg; 4878 4879 private_preg.not_bol = !!(eflags & REG_NOTBOL); 4880 private_preg.not_eol = !!(eflags & REG_NOTEOL); 4881 4882 /* The user has told us exactly how many registers to return 4883 information about, via `nmatch'. We have to pass that on to the 4884 matching routines. */ 4885 private_preg.regs_allocated = REGS_FIXED; 4886 4887 if (want_reg_info) 4888 { 4889 regs.num_regs = nmatch; 4890 regs.start = TALLOC (nmatch, regoff_t); 4891 regs.end = TALLOC (nmatch, regoff_t); 4892 if (regs.start == NULL || regs.end == NULL) 4893 return (int) REG_NOMATCH; 4894 } 4895 4896 /* Perform the searching operation. */ 4897 ret = re_search (&private_preg, string, len, 4898 /* start: */ 0, /* range: */ len, 4899 want_reg_info ? ®s : (struct re_registers *) 0); 4900 4901 /* Copy the register information to the POSIX structure. */ 4902 if (want_reg_info) 4903 { 4904 if (ret >= 0) 4905 { 4906 unsigned r; 4907 4908 for (r = 0; r < nmatch; r++) 4909 { 4910 pmatch[r].rm_so = regs.start[r]; 4911 pmatch[r].rm_eo = regs.end[r]; 4912 } 4913 } 4914 4915 /* If we needed the temporary register info, free the space now. */ 4916 free (regs.start); 4917 free (regs.end); 4918 } 4919 4920 /* We want zero return to mean success, unlike `re_search'. */ 4921 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 4922 } 4923 4924 4925 /* Returns a message corresponding to an error code, ERRCODE, returned 4926 from either regcomp or regexec. We don't use PREG here. */ 4927 4928 size_t 4929 regerror (errcode, preg, errbuf, errbuf_size) 4930 int errcode; 4931 const regex_t *preg; 4932 char *errbuf; 4933 size_t errbuf_size; 4934 { 4935 const char *msg; 4936 size_t msg_size; 4937 4938 if (errcode < 0 4939 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) 4940 /* Only error codes returned by the rest of the code should be passed 4941 to this routine. If we are given anything else, or if other regex 4942 code generates an invalid error code, then the program has a bug. 4943 Dump core so we can fix it. */ 4944 abort (); 4945 4946 msg = re_error_msg[errcode]; 4947 4948 /* POSIX doesn't require that we do anything in this case, but why 4949 not be nice. */ 4950 if (! msg) 4951 msg = "Success"; 4952 4953 msg_size = strlen (msg) + 1; /* Includes the null. */ 4954 4955 if (errbuf_size != 0) 4956 { 4957 if (msg_size > errbuf_size) 4958 { 4959 strncpy (errbuf, msg, errbuf_size - 1); 4960 errbuf[errbuf_size - 1] = 0; 4961 } 4962 else 4963 strcpy (errbuf, msg); 4964 } 4965 4966 return msg_size; 4967 } 4968 4969 4970 /* Free dynamically allocated space used by PREG. */ 4971 4972 void 4973 regfree (preg) 4974 regex_t *preg; 4975 { 4976 if (preg->buffer != NULL) 4977 free (preg->buffer); 4978 preg->buffer = NULL; 4979 4980 preg->allocated = 0; 4981 preg->used = 0; 4982 4983 if (preg->fastmap != NULL) 4984 free (preg->fastmap); 4985 preg->fastmap = NULL; 4986 preg->fastmap_accurate = 0; 4987 4988 if (preg->translate != NULL) 4989 free (preg->translate); 4990 preg->translate = NULL; 4991 } 4992 4993 #endif /* not emacs */ 4994