1 /* Extended regular expression matching and search library, 2 version 0.12. 3 (Implements POSIX draft P10003.2/D11.2, except for 4 internationalization features.) 5 6 Copyright (C) 1993 Free Software Foundation, Inc. 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, write to the Free Software 20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ 21 22 /* Trying to define this in the makefile would get hairy, unless we can 23 more gracefully do it for NT, OS/2, unix, etc. */ 24 #define REGEX_MALLOC 1 25 26 /* AIX requires this to be the first thing in the file. */ 27 #if defined (_AIX) && !defined (REGEX_MALLOC) 28 #pragma alloca 29 #endif 30 31 #define _GNU_SOURCE 32 33 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 34 #include <sys/types.h> 35 36 #ifdef HAVE_CONFIG_H 37 #include "config.h" 38 #endif 39 40 /* The `emacs' switch turns on certain matching commands 41 that make sense only in Emacs. */ 42 #ifdef emacs 43 44 #include "lisp.h" 45 #include "buffer.h" 46 #include "syntax.h" 47 48 /* Emacs uses `NULL' as a predicate. */ 49 #undef NULL 50 51 #else /* not emacs */ 52 53 /* We used to test for `BSTRING' here, but only GCC and Emacs define 54 `BSTRING', as far as I know, and neither of them use this code. */ 55 #if HAVE_STRING_H || STDC_HEADERS 56 #include <string.h> 57 #ifndef bcmp 58 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 59 #endif 60 #ifndef bcopy 61 #define bcopy(s, d, n) memcpy ((d), (s), (n)) 62 #endif 63 #ifndef bzero 64 #define bzero(s, n) memset ((s), 0, (n)) 65 #endif 66 #else 67 #include <strings.h> 68 #endif 69 70 #ifdef STDC_HEADERS 71 #include <stdlib.h> 72 #else 73 char *malloc (); 74 char *realloc (); 75 #endif 76 77 78 /* Define the syntax stuff for \<, \>, etc. */ 79 80 /* This must be nonzero for the wordchar and notwordchar pattern 81 commands in re_match_2. */ 82 #ifndef Sword 83 #define Sword 1 84 #endif 85 86 #ifdef SYNTAX_TABLE 87 88 extern char *re_syntax_table; 89 90 #else /* not SYNTAX_TABLE */ 91 92 /* How many characters in the character set. */ 93 #define CHAR_SET_SIZE 256 94 95 static char re_syntax_table[CHAR_SET_SIZE]; 96 97 static void 98 init_syntax_once () 99 { 100 register int c; 101 static int done = 0; 102 103 if (done) 104 return; 105 106 bzero (re_syntax_table, sizeof re_syntax_table); 107 108 for (c = 'a'; c <= 'z'; c++) 109 re_syntax_table[c] = Sword; 110 111 for (c = 'A'; c <= 'Z'; c++) 112 re_syntax_table[c] = Sword; 113 114 for (c = '0'; c <= '9'; c++) 115 re_syntax_table[c] = Sword; 116 117 re_syntax_table['_'] = Sword; 118 119 done = 1; 120 } 121 122 #endif /* not SYNTAX_TABLE */ 123 124 #define SYNTAX(c) re_syntax_table[c] 125 126 #endif /* not emacs */ 127 128 /* Get the interface, including the syntax bits. */ 129 #include "regex.h" 130 131 /* isalpha etc. are used for the character classes. */ 132 #include <ctype.h> 133 134 #ifndef isascii 135 #define isascii(c) 1 136 #endif 137 138 #ifdef isblank 139 #define ISBLANK(c) (isascii (c) && isblank (c)) 140 #else 141 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 142 #endif 143 #ifdef isgraph 144 #define ISGRAPH(c) (isascii (c) && isgraph (c)) 145 #else 146 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) 147 #endif 148 149 #define ISPRINT(c) (isascii (c) && isprint (c)) 150 #define ISDIGIT(c) (isascii (c) && isdigit (c)) 151 #define ISALNUM(c) (isascii (c) && isalnum (c)) 152 #define ISALPHA(c) (isascii (c) && isalpha (c)) 153 #define ISCNTRL(c) (isascii (c) && iscntrl (c)) 154 #define ISLOWER(c) (isascii (c) && islower (c)) 155 #define ISPUNCT(c) (isascii (c) && ispunct (c)) 156 #define ISSPACE(c) (isascii (c) && isspace (c)) 157 #define ISUPPER(c) (isascii (c) && isupper (c)) 158 #define ISXDIGIT(c) (isascii (c) && isxdigit (c)) 159 160 #ifndef NULL 161 #define NULL 0 162 #endif 163 164 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 165 since ours (we hope) works properly with all combinations of 166 machines, compilers, `char' and `unsigned char' argument types. 167 (Per Bothner suggested the basic approach.) */ 168 #undef SIGN_EXTEND_CHAR 169 #if __STDC__ 170 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 171 #else /* not __STDC__ */ 172 /* As in Harbison and Steele. */ 173 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 174 #endif 175 176 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 177 use `alloca' instead of `malloc'. This is because using malloc in 178 re_search* or re_match* could cause memory leaks when C-g is used in 179 Emacs; also, malloc is slower and causes storage fragmentation. On 180 the other hand, malloc is more portable, and easier to debug. 181 182 Because we sometimes use alloca, some routines have to be macros, 183 not functions -- `alloca'-allocated space disappears at the end of the 184 function it is called in. */ 185 186 #ifdef REGEX_MALLOC 187 188 #define REGEX_ALLOCATE malloc 189 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 190 191 #else /* not REGEX_MALLOC */ 192 193 /* Emacs already defines alloca, sometimes. */ 194 #ifndef alloca 195 196 /* Make alloca work the best possible way. */ 197 #ifdef __GNUC__ 198 #define alloca __builtin_alloca 199 #else /* not __GNUC__ */ 200 #if HAVE_ALLOCA_H 201 #include <alloca.h> 202 #else /* not __GNUC__ or HAVE_ALLOCA_H */ 203 #ifndef _AIX /* Already did AIX, up at the top. */ 204 char *alloca (); 205 #endif /* not _AIX */ 206 #endif /* not HAVE_ALLOCA_H */ 207 #endif /* not __GNUC__ */ 208 209 #endif /* not alloca */ 210 211 #define REGEX_ALLOCATE alloca 212 213 /* Assumes a `char *destination' variable. */ 214 #define REGEX_REALLOCATE(source, osize, nsize) \ 215 (destination = (char *) alloca (nsize), \ 216 bcopy (source, destination, osize), \ 217 destination) 218 219 #endif /* not REGEX_MALLOC */ 220 221 222 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 223 `string1' or just past its end. This works if PTR is NULL, which is 224 a good thing. */ 225 #define FIRST_STRING_P(ptr) \ 226 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 227 228 /* (Re)Allocate N items of type T using malloc, or fail. */ 229 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 230 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 231 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 232 233 #define BYTEWIDTH 8 /* In bits. */ 234 235 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 236 237 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 238 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 239 240 typedef char boolean; 241 #define false 0 242 #define true 1 243 244 /* These are the command codes that appear in compiled regular 245 expressions. Some opcodes are followed by argument bytes. A 246 command code can specify any interpretation whatsoever for its 247 arguments. Zero bytes may appear in the compiled regular expression. 248 249 The value of `exactn' is needed in search.c (search_buffer) in Emacs. 250 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of 251 `exactn' we use here must also be 1. */ 252 253 typedef enum 254 { 255 no_op = 0, 256 257 /* Followed by one byte giving n, then by n literal bytes. */ 258 exactn = 1, 259 260 /* Matches any (more or less) character. */ 261 anychar, 262 263 /* Matches any one char belonging to specified set. First 264 following byte is number of bitmap bytes. Then come bytes 265 for a bitmap saying which chars are in. Bits in each byte 266 are ordered low-bit-first. A character is in the set if its 267 bit is 1. A character too large to have a bit in the map is 268 automatically not in the set. */ 269 charset, 270 271 /* Same parameters as charset, but match any character that is 272 not one of those specified. */ 273 charset_not, 274 275 /* Start remembering the text that is matched, for storing in a 276 register. Followed by one byte with the register number, in 277 the range 0 to one less than the pattern buffer's re_nsub 278 field. Then followed by one byte with the number of groups 279 inner to this one. (This last has to be part of the 280 start_memory only because we need it in the on_failure_jump 281 of re_match_2.) */ 282 start_memory, 283 284 /* Stop remembering the text that is matched and store it in a 285 memory register. Followed by one byte with the register 286 number, in the range 0 to one less than `re_nsub' in the 287 pattern buffer, and one byte with the number of inner groups, 288 just like `start_memory'. (We need the number of inner 289 groups here because we don't have any easy way of finding the 290 corresponding start_memory when we're at a stop_memory.) */ 291 stop_memory, 292 293 /* Match a duplicate of something remembered. Followed by one 294 byte containing the register number. */ 295 duplicate, 296 297 /* Fail unless at beginning of line. */ 298 begline, 299 300 /* Fail unless at end of line. */ 301 endline, 302 303 /* Succeeds if at beginning of buffer (if emacs) or at beginning 304 of string to be matched (if not). */ 305 begbuf, 306 307 /* Analogously, for end of buffer/string. */ 308 endbuf, 309 310 /* Followed by two byte relative address to which to jump. */ 311 jump, 312 313 /* Same as jump, but marks the end of an alternative. */ 314 jump_past_alt, 315 316 /* Followed by two-byte relative address of place to resume at 317 in case of failure. */ 318 on_failure_jump, 319 320 /* Like on_failure_jump, but pushes a placeholder instead of the 321 current string position when executed. */ 322 on_failure_keep_string_jump, 323 324 /* Throw away latest failure point and then jump to following 325 two-byte relative address. */ 326 pop_failure_jump, 327 328 /* Change to pop_failure_jump if know won't have to backtrack to 329 match; otherwise change to jump. This is used to jump 330 back to the beginning of a repeat. If what follows this jump 331 clearly won't match what the repeat does, such that we can be 332 sure that there is no use backtracking out of repetitions 333 already matched, then we change it to a pop_failure_jump. 334 Followed by two-byte address. */ 335 maybe_pop_jump, 336 337 /* Jump to following two-byte address, and push a dummy failure 338 point. This failure point will be thrown away if an attempt 339 is made to use it for a failure. A `+' construct makes this 340 before the first repeat. Also used as an intermediary kind 341 of jump when compiling an alternative. */ 342 dummy_failure_jump, 343 344 /* Push a dummy failure point and continue. Used at the end of 345 alternatives. */ 346 push_dummy_failure, 347 348 /* Followed by two-byte relative address and two-byte number n. 349 After matching N times, jump to the address upon failure. */ 350 succeed_n, 351 352 /* Followed by two-byte relative address, and two-byte number n. 353 Jump to the address N times, then fail. */ 354 jump_n, 355 356 /* Set the following two-byte relative address to the 357 subsequent two-byte number. The address *includes* the two 358 bytes of number. */ 359 set_number_at, 360 361 wordchar, /* Matches any word-constituent character. */ 362 notwordchar, /* Matches any char that is not a word-constituent. */ 363 364 wordbeg, /* Succeeds if at word beginning. */ 365 wordend, /* Succeeds if at word end. */ 366 367 wordbound, /* Succeeds if at a word boundary. */ 368 notwordbound /* Succeeds if not at a word boundary. */ 369 370 #ifdef emacs 371 ,before_dot, /* Succeeds if before point. */ 372 at_dot, /* Succeeds if at point. */ 373 after_dot, /* Succeeds if after point. */ 374 375 /* Matches any character whose syntax is specified. Followed by 376 a byte which contains a syntax code, e.g., Sword. */ 377 syntaxspec, 378 379 /* Matches any character whose syntax is not that specified. */ 380 notsyntaxspec 381 #endif /* emacs */ 382 } re_opcode_t; 383 384 /* Common operations on the compiled pattern. */ 385 386 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 387 388 #define STORE_NUMBER(destination, number) \ 389 do { \ 390 (destination)[0] = (number) & 0377; \ 391 (destination)[1] = (number) >> 8; \ 392 } while (0) 393 394 /* Same as STORE_NUMBER, except increment DESTINATION to 395 the byte after where the number is stored. Therefore, DESTINATION 396 must be an lvalue. */ 397 398 #define STORE_NUMBER_AND_INCR(destination, number) \ 399 do { \ 400 STORE_NUMBER (destination, number); \ 401 (destination) += 2; \ 402 } while (0) 403 404 /* Put into DESTINATION a number stored in two contiguous bytes starting 405 at SOURCE. */ 406 407 #define EXTRACT_NUMBER(destination, source) \ 408 do { \ 409 (destination) = *(source) & 0377; \ 410 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 411 } while (0) 412 413 #ifdef DEBUG 414 static void 415 extract_number (dest, source) 416 int *dest; 417 unsigned char *source; 418 { 419 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 420 *dest = *source & 0377; 421 *dest += temp << 8; 422 } 423 424 #ifndef EXTRACT_MACROS /* To debug the macros. */ 425 #undef EXTRACT_NUMBER 426 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 427 #endif /* not EXTRACT_MACROS */ 428 429 #endif /* DEBUG */ 430 431 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 432 SOURCE must be an lvalue. */ 433 434 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 435 do { \ 436 EXTRACT_NUMBER (destination, source); \ 437 (source) += 2; \ 438 } while (0) 439 440 #ifdef DEBUG 441 static void 442 extract_number_and_incr (destination, source) 443 int *destination; 444 unsigned char **source; 445 { 446 extract_number (destination, *source); 447 *source += 2; 448 } 449 450 #ifndef EXTRACT_MACROS 451 #undef EXTRACT_NUMBER_AND_INCR 452 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ 453 extract_number_and_incr (&dest, &src) 454 #endif /* not EXTRACT_MACROS */ 455 456 #endif /* DEBUG */ 457 458 /* If DEBUG is defined, Regex prints many voluminous messages about what 459 it is doing (if the variable `debug' is nonzero). If linked with the 460 main program in `iregex.c', you can enter patterns and strings 461 interactively. And if linked with the main program in `main.c' and 462 the other test files, you can run the already-written tests. */ 463 464 #ifdef DEBUG 465 466 /* We use standard I/O for debugging. */ 467 #include <stdio.h> 468 469 /* It is useful to test things that ``must'' be true when debugging. */ 470 #include <assert.h> 471 472 static int debug = 0; 473 474 #define DEBUG_STATEMENT(e) e 475 #define DEBUG_PRINT1(x) if (debug) printf (x) 476 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 477 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 478 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 479 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 480 if (debug) print_partial_compiled_pattern (s, e) 481 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 482 if (debug) print_double_string (w, s1, sz1, s2, sz2) 483 484 485 extern void printchar (); 486 487 /* Print the fastmap in human-readable form. */ 488 489 void 490 print_fastmap (fastmap) 491 char *fastmap; 492 { 493 unsigned was_a_range = 0; 494 unsigned i = 0; 495 496 while (i < (1 << BYTEWIDTH)) 497 { 498 if (fastmap[i++]) 499 { 500 was_a_range = 0; 501 printchar (i - 1); 502 while (i < (1 << BYTEWIDTH) && fastmap[i]) 503 { 504 was_a_range = 1; 505 i++; 506 } 507 if (was_a_range) 508 { 509 printf ("-"); 510 printchar (i - 1); 511 } 512 } 513 } 514 putchar ('\n'); 515 } 516 517 518 /* Print a compiled pattern string in human-readable form, starting at 519 the START pointer into it and ending just before the pointer END. */ 520 521 void 522 print_partial_compiled_pattern (start, end) 523 unsigned char *start; 524 unsigned char *end; 525 { 526 int mcnt, mcnt2; 527 unsigned char *p = start; 528 unsigned char *pend = end; 529 530 if (start == NULL) 531 { 532 printf ("(null)\n"); 533 return; 534 } 535 536 /* Loop over pattern commands. */ 537 while (p < pend) 538 { 539 switch ((re_opcode_t) *p++) 540 { 541 case no_op: 542 printf ("/no_op"); 543 break; 544 545 case exactn: 546 mcnt = *p++; 547 printf ("/exactn/%d", mcnt); 548 do 549 { 550 putchar ('/'); 551 printchar (*p++); 552 } 553 while (--mcnt); 554 break; 555 556 case start_memory: 557 mcnt = *p++; 558 printf ("/start_memory/%d/%d", mcnt, *p++); 559 break; 560 561 case stop_memory: 562 mcnt = *p++; 563 printf ("/stop_memory/%d/%d", mcnt, *p++); 564 break; 565 566 case duplicate: 567 printf ("/duplicate/%d", *p++); 568 break; 569 570 case anychar: 571 printf ("/anychar"); 572 break; 573 574 case charset: 575 case charset_not: 576 { 577 register int c; 578 579 printf ("/charset%s", 580 (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); 581 582 assert (p + *p < pend); 583 584 for (c = 0; c < *p; c++) 585 { 586 unsigned bit; 587 unsigned char map_byte = p[1 + c]; 588 589 putchar ('/'); 590 591 for (bit = 0; bit < BYTEWIDTH; bit++) 592 if (map_byte & (1 << bit)) 593 printchar (c * BYTEWIDTH + bit); 594 } 595 p += 1 + *p; 596 break; 597 } 598 599 case begline: 600 printf ("/begline"); 601 break; 602 603 case endline: 604 printf ("/endline"); 605 break; 606 607 case on_failure_jump: 608 extract_number_and_incr (&mcnt, &p); 609 printf ("/on_failure_jump/0/%d", mcnt); 610 break; 611 612 case on_failure_keep_string_jump: 613 extract_number_and_incr (&mcnt, &p); 614 printf ("/on_failure_keep_string_jump/0/%d", mcnt); 615 break; 616 617 case dummy_failure_jump: 618 extract_number_and_incr (&mcnt, &p); 619 printf ("/dummy_failure_jump/0/%d", mcnt); 620 break; 621 622 case push_dummy_failure: 623 printf ("/push_dummy_failure"); 624 break; 625 626 case maybe_pop_jump: 627 extract_number_and_incr (&mcnt, &p); 628 printf ("/maybe_pop_jump/0/%d", mcnt); 629 break; 630 631 case pop_failure_jump: 632 extract_number_and_incr (&mcnt, &p); 633 printf ("/pop_failure_jump/0/%d", mcnt); 634 break; 635 636 case jump_past_alt: 637 extract_number_and_incr (&mcnt, &p); 638 printf ("/jump_past_alt/0/%d", mcnt); 639 break; 640 641 case jump: 642 extract_number_and_incr (&mcnt, &p); 643 printf ("/jump/0/%d", mcnt); 644 break; 645 646 case succeed_n: 647 extract_number_and_incr (&mcnt, &p); 648 extract_number_and_incr (&mcnt2, &p); 649 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); 650 break; 651 652 case jump_n: 653 extract_number_and_incr (&mcnt, &p); 654 extract_number_and_incr (&mcnt2, &p); 655 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); 656 break; 657 658 case set_number_at: 659 extract_number_and_incr (&mcnt, &p); 660 extract_number_and_incr (&mcnt2, &p); 661 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); 662 break; 663 664 case wordbound: 665 printf ("/wordbound"); 666 break; 667 668 case notwordbound: 669 printf ("/notwordbound"); 670 break; 671 672 case wordbeg: 673 printf ("/wordbeg"); 674 break; 675 676 case wordend: 677 printf ("/wordend"); 678 679 #ifdef emacs 680 case before_dot: 681 printf ("/before_dot"); 682 break; 683 684 case at_dot: 685 printf ("/at_dot"); 686 break; 687 688 case after_dot: 689 printf ("/after_dot"); 690 break; 691 692 case syntaxspec: 693 printf ("/syntaxspec"); 694 mcnt = *p++; 695 printf ("/%d", mcnt); 696 break; 697 698 case notsyntaxspec: 699 printf ("/notsyntaxspec"); 700 mcnt = *p++; 701 printf ("/%d", mcnt); 702 break; 703 #endif /* emacs */ 704 705 case wordchar: 706 printf ("/wordchar"); 707 break; 708 709 case notwordchar: 710 printf ("/notwordchar"); 711 break; 712 713 case begbuf: 714 printf ("/begbuf"); 715 break; 716 717 case endbuf: 718 printf ("/endbuf"); 719 break; 720 721 default: 722 printf ("?%d", *(p-1)); 723 } 724 } 725 printf ("/\n"); 726 } 727 728 729 void 730 print_compiled_pattern (bufp) 731 struct re_pattern_buffer *bufp; 732 { 733 unsigned char *buffer = bufp->buffer; 734 735 print_partial_compiled_pattern (buffer, buffer + bufp->used); 736 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); 737 738 if (bufp->fastmap_accurate && bufp->fastmap) 739 { 740 printf ("fastmap: "); 741 print_fastmap (bufp->fastmap); 742 } 743 744 printf ("re_nsub: %d\t", bufp->re_nsub); 745 printf ("regs_alloc: %d\t", bufp->regs_allocated); 746 printf ("can_be_null: %d\t", bufp->can_be_null); 747 printf ("newline_anchor: %d\n", bufp->newline_anchor); 748 printf ("no_sub: %d\t", bufp->no_sub); 749 printf ("not_bol: %d\t", bufp->not_bol); 750 printf ("not_eol: %d\t", bufp->not_eol); 751 printf ("syntax: %d\n", bufp->syntax); 752 /* Perhaps we should print the translate table? */ 753 } 754 755 756 void 757 print_double_string (where, string1, size1, string2, size2) 758 const char *where; 759 const char *string1; 760 const char *string2; 761 int size1; 762 int size2; 763 { 764 unsigned this_char; 765 766 if (where == NULL) 767 printf ("(null)"); 768 else 769 { 770 if (FIRST_STRING_P (where)) 771 { 772 for (this_char = where - string1; this_char < size1; this_char++) 773 printchar (string1[this_char]); 774 775 where = string2; 776 } 777 778 for (this_char = where - string2; this_char < size2; this_char++) 779 printchar (string2[this_char]); 780 } 781 } 782 783 #else /* not DEBUG */ 784 785 #undef assert 786 #define assert(e) 787 788 #define DEBUG_STATEMENT(e) 789 #define DEBUG_PRINT1(x) 790 #define DEBUG_PRINT2(x1, x2) 791 #define DEBUG_PRINT3(x1, x2, x3) 792 #define DEBUG_PRINT4(x1, x2, x3, x4) 793 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 794 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 795 796 #endif /* not DEBUG */ 797 798 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 799 also be assigned to arbitrarily: each pattern buffer stores its own 800 syntax, so it can be changed between regex compilations. */ 801 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; 802 803 804 /* Specify the precise syntax of regexps for compilation. This provides 805 for compatibility for various utilities which historically have 806 different, incompatible syntaxes. 807 808 The argument SYNTAX is a bit mask comprised of the various bits 809 defined in regex.h. We return the old syntax. */ 810 811 reg_syntax_t 812 re_set_syntax (syntax) 813 reg_syntax_t syntax; 814 { 815 reg_syntax_t ret = re_syntax_options; 816 817 re_syntax_options = syntax; 818 return ret; 819 } 820 821 /* This table gives an error message for each of the error codes listed 822 in regex.h. Obviously the order here has to be same as there. */ 823 824 static const char *re_error_msg[] = 825 { NULL, /* REG_NOERROR */ 826 "No match", /* REG_NOMATCH */ 827 "Invalid regular expression", /* REG_BADPAT */ 828 "Invalid collation character", /* REG_ECOLLATE */ 829 "Invalid character class name", /* REG_ECTYPE */ 830 "Trailing backslash", /* REG_EESCAPE */ 831 "Invalid back reference", /* REG_ESUBREG */ 832 "Unmatched [ or [^", /* REG_EBRACK */ 833 "Unmatched ( or \\(", /* REG_EPAREN */ 834 "Unmatched \\{", /* REG_EBRACE */ 835 "Invalid content of \\{\\}", /* REG_BADBR */ 836 "Invalid range end", /* REG_ERANGE */ 837 "Memory exhausted", /* REG_ESPACE */ 838 "Invalid preceding regular expression", /* REG_BADRPT */ 839 "Premature end of regular expression", /* REG_EEND */ 840 "Regular expression too big", /* REG_ESIZE */ 841 "Unmatched ) or \\)", /* REG_ERPAREN */ 842 }; 843 844 /* Subroutine declarations and macros for regex_compile. */ 845 846 static void store_op1 (), store_op2 (); 847 static void insert_op1 (), insert_op2 (); 848 static boolean at_begline_loc_p (), at_endline_loc_p (); 849 static boolean group_in_compile_stack (); 850 static reg_errcode_t compile_range (); 851 852 /* Fetch the next character in the uncompiled pattern---translating it 853 if necessary. Also cast from a signed character in the constant 854 string passed to us by the user to an unsigned char that we can use 855 as an array index (in, e.g., `translate'). */ 856 #define PATFETCH(c) \ 857 do {if (p == pend) return REG_EEND; \ 858 c = (unsigned char) *p++; \ 859 if (translate) c = translate[c]; \ 860 } while (0) 861 862 /* Fetch the next character in the uncompiled pattern, with no 863 translation. */ 864 #define PATFETCH_RAW(c) \ 865 do {if (p == pend) return REG_EEND; \ 866 c = (unsigned char) *p++; \ 867 } while (0) 868 869 /* Go backwards one character in the pattern. */ 870 #define PATUNFETCH p-- 871 872 873 /* If `translate' is non-null, return translate[D], else just D. We 874 cast the subscript to translate because some data is declared as 875 `char *', to avoid warnings when a string constant is passed. But 876 when we use a character as a subscript we must make it unsigned. */ 877 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) 878 879 880 /* Macros for outputting the compiled pattern into `buffer'. */ 881 882 /* If the buffer isn't allocated when it comes in, use this. */ 883 #define INIT_BUF_SIZE 32 884 885 /* Make sure we have at least N more bytes of space in buffer. */ 886 #define GET_BUFFER_SPACE(n) \ 887 while (b - bufp->buffer + (n) > bufp->allocated) \ 888 EXTEND_BUFFER () 889 890 /* Make sure we have one more byte of buffer space and then add C to it. */ 891 #define BUF_PUSH(c) \ 892 do { \ 893 GET_BUFFER_SPACE (1); \ 894 *b++ = (unsigned char) (c); \ 895 } while (0) 896 897 898 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 899 #define BUF_PUSH_2(c1, c2) \ 900 do { \ 901 GET_BUFFER_SPACE (2); \ 902 *b++ = (unsigned char) (c1); \ 903 *b++ = (unsigned char) (c2); \ 904 } while (0) 905 906 907 /* As with BUF_PUSH_2, except for three bytes. */ 908 #define BUF_PUSH_3(c1, c2, c3) \ 909 do { \ 910 GET_BUFFER_SPACE (3); \ 911 *b++ = (unsigned char) (c1); \ 912 *b++ = (unsigned char) (c2); \ 913 *b++ = (unsigned char) (c3); \ 914 } while (0) 915 916 917 /* Store a jump with opcode OP at LOC to location TO. We store a 918 relative address offset by the three bytes the jump itself occupies. */ 919 #define STORE_JUMP(op, loc, to) \ 920 store_op1 (op, loc, (to) - (loc) - 3) 921 922 /* Likewise, for a two-argument jump. */ 923 #define STORE_JUMP2(op, loc, to, arg) \ 924 store_op2 (op, loc, (to) - (loc) - 3, arg) 925 926 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 927 #define INSERT_JUMP(op, loc, to) \ 928 insert_op1 (op, loc, (to) - (loc) - 3, b) 929 930 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 931 #define INSERT_JUMP2(op, loc, to, arg) \ 932 insert_op2 (op, loc, (to) - (loc) - 3, arg, b) 933 934 935 /* This is not an arbitrary limit: the arguments which represent offsets 936 into the pattern are two bytes long. So if 2^16 bytes turns out to 937 be too small, many things would have to change. */ 938 #define MAX_BUF_SIZE (1L << 16) 939 940 941 /* Extend the buffer by twice its current size via realloc and 942 reset the pointers that pointed into the old block to point to the 943 correct places in the new one. If extending the buffer results in it 944 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 945 #define EXTEND_BUFFER() \ 946 do { \ 947 unsigned char *old_buffer = bufp->buffer; \ 948 if (bufp->allocated == MAX_BUF_SIZE) \ 949 return REG_ESIZE; \ 950 bufp->allocated <<= 1; \ 951 if (bufp->allocated > MAX_BUF_SIZE) \ 952 bufp->allocated = MAX_BUF_SIZE; \ 953 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ 954 if (bufp->buffer == NULL) \ 955 return REG_ESPACE; \ 956 /* If the buffer moved, move all the pointers into it. */ \ 957 if (old_buffer != bufp->buffer) \ 958 { \ 959 b = (b - old_buffer) + bufp->buffer; \ 960 begalt = (begalt - old_buffer) + bufp->buffer; \ 961 if (fixup_alt_jump) \ 962 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ 963 if (laststart) \ 964 laststart = (laststart - old_buffer) + bufp->buffer; \ 965 if (pending_exact) \ 966 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 967 } \ 968 } while (0) 969 970 971 /* Since we have one byte reserved for the register number argument to 972 {start,stop}_memory, the maximum number of groups we can report 973 things about is what fits in that byte. */ 974 #define MAX_REGNUM 255 975 976 /* But patterns can have more than `MAX_REGNUM' registers. We just 977 ignore the excess. */ 978 typedef unsigned regnum_t; 979 980 981 /* Macros for the compile stack. */ 982 983 /* Since offsets can go either forwards or backwards, this type needs to 984 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 985 typedef int pattern_offset_t; 986 987 typedef struct 988 { 989 pattern_offset_t begalt_offset; 990 pattern_offset_t fixup_alt_jump; 991 pattern_offset_t inner_group_offset; 992 pattern_offset_t laststart_offset; 993 regnum_t regnum; 994 } compile_stack_elt_t; 995 996 997 typedef struct 998 { 999 compile_stack_elt_t *stack; 1000 unsigned size; 1001 unsigned avail; /* Offset of next open position. */ 1002 } compile_stack_type; 1003 1004 1005 #define INIT_COMPILE_STACK_SIZE 32 1006 1007 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 1008 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 1009 1010 /* The next available element. */ 1011 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 1012 1013 1014 /* Set the bit for character C in a list. */ 1015 #define SET_LIST_BIT(c) \ 1016 (b[((unsigned char) (c)) / BYTEWIDTH] \ 1017 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 1018 1019 1020 /* Get the next unsigned number in the uncompiled pattern. */ 1021 #define GET_UNSIGNED_NUMBER(num) \ 1022 { if (p != pend) \ 1023 { \ 1024 PATFETCH (c); \ 1025 while (ISDIGIT (c)) \ 1026 { \ 1027 if (num < 0) \ 1028 num = 0; \ 1029 num = num * 10 + c - '0'; \ 1030 if (p == pend) \ 1031 break; \ 1032 PATFETCH (c); \ 1033 } \ 1034 } \ 1035 } 1036 1037 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 1038 1039 #define IS_CHAR_CLASS(string) \ 1040 (STREQ (string, "alpha") || STREQ (string, "upper") \ 1041 || STREQ (string, "lower") || STREQ (string, "digit") \ 1042 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 1043 || STREQ (string, "space") || STREQ (string, "print") \ 1044 || STREQ (string, "punct") || STREQ (string, "graph") \ 1045 || STREQ (string, "cntrl") || STREQ (string, "blank")) 1046 1047 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 1048 Returns one of error codes defined in `regex.h', or zero for success. 1049 1050 Assumes the `allocated' (and perhaps `buffer') and `translate' 1051 fields are set in BUFP on entry. 1052 1053 If it succeeds, results are put in BUFP (if it returns an error, the 1054 contents of BUFP are undefined): 1055 `buffer' is the compiled pattern; 1056 `syntax' is set to SYNTAX; 1057 `used' is set to the length of the compiled pattern; 1058 `fastmap_accurate' is zero; 1059 `re_nsub' is the number of subexpressions in PATTERN; 1060 `not_bol' and `not_eol' are zero; 1061 1062 The `fastmap' and `newline_anchor' fields are neither 1063 examined nor set. */ 1064 1065 static reg_errcode_t 1066 regex_compile (pattern, size, syntax, bufp) 1067 const char *pattern; 1068 int size; 1069 reg_syntax_t syntax; 1070 struct re_pattern_buffer *bufp; 1071 { 1072 /* We fetch characters from PATTERN here. Even though PATTERN is 1073 `char *' (i.e., signed), we declare these variables as unsigned, so 1074 they can be reliably used as array indices. */ 1075 register unsigned char c, c1; 1076 1077 /* A random tempory spot in PATTERN. */ 1078 const char *p1; 1079 1080 /* Points to the end of the buffer, where we should append. */ 1081 register unsigned char *b; 1082 1083 /* Keeps track of unclosed groups. */ 1084 compile_stack_type compile_stack; 1085 1086 /* Points to the current (ending) position in the pattern. */ 1087 const char *p = pattern; 1088 const char *pend = pattern + size; 1089 1090 /* How to translate the characters in the pattern. */ 1091 char *translate = bufp->translate; 1092 1093 /* Address of the count-byte of the most recently inserted `exactn' 1094 command. This makes it possible to tell if a new exact-match 1095 character can be added to that command or if the character requires 1096 a new `exactn' command. */ 1097 unsigned char *pending_exact = 0; 1098 1099 /* Address of start of the most recently finished expression. 1100 This tells, e.g., postfix * where to find the start of its 1101 operand. Reset at the beginning of groups and alternatives. */ 1102 unsigned char *laststart = 0; 1103 1104 /* Address of beginning of regexp, or inside of last group. */ 1105 unsigned char *begalt; 1106 1107 /* Place in the uncompiled pattern (i.e., the {) to 1108 which to go back if the interval is invalid. */ 1109 const char *beg_interval; 1110 1111 /* Address of the place where a forward jump should go to the end of 1112 the containing expression. Each alternative of an `or' -- except the 1113 last -- ends with a forward jump of this sort. */ 1114 unsigned char *fixup_alt_jump = 0; 1115 1116 /* Counts open-groups as they are encountered. Remembered for the 1117 matching close-group on the compile stack, so the same register 1118 number is put in the stop_memory as the start_memory. */ 1119 regnum_t regnum = 0; 1120 1121 #ifdef DEBUG 1122 DEBUG_PRINT1 ("\nCompiling pattern: "); 1123 if (debug) 1124 { 1125 unsigned debug_count; 1126 1127 for (debug_count = 0; debug_count < size; debug_count++) 1128 printchar (pattern[debug_count]); 1129 putchar ('\n'); 1130 } 1131 #endif /* DEBUG */ 1132 1133 /* Initialize the compile stack. */ 1134 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 1135 if (compile_stack.stack == NULL) 1136 return REG_ESPACE; 1137 1138 compile_stack.size = INIT_COMPILE_STACK_SIZE; 1139 compile_stack.avail = 0; 1140 1141 /* Initialize the pattern buffer. */ 1142 bufp->syntax = syntax; 1143 bufp->fastmap_accurate = 0; 1144 bufp->not_bol = bufp->not_eol = 0; 1145 1146 /* Set `used' to zero, so that if we return an error, the pattern 1147 printer (for debugging) will think there's no pattern. We reset it 1148 at the end. */ 1149 bufp->used = 0; 1150 1151 /* Always count groups, whether or not bufp->no_sub is set. */ 1152 bufp->re_nsub = 0; 1153 1154 #if !defined (emacs) && !defined (SYNTAX_TABLE) 1155 /* Initialize the syntax table. */ 1156 init_syntax_once (); 1157 #endif 1158 1159 if (bufp->allocated == 0) 1160 { 1161 if (bufp->buffer) 1162 { /* If zero allocated, but buffer is non-null, try to realloc 1163 enough space. This loses if buffer's address is bogus, but 1164 that is the user's responsibility. */ 1165 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); 1166 } 1167 else 1168 { /* Caller did not allocate a buffer. Do it for them. */ 1169 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); 1170 } 1171 if (!bufp->buffer) return REG_ESPACE; 1172 1173 bufp->allocated = INIT_BUF_SIZE; 1174 } 1175 1176 begalt = b = bufp->buffer; 1177 1178 /* Loop through the uncompiled pattern until we're at the end. */ 1179 while (p != pend) 1180 { 1181 PATFETCH (c); 1182 1183 switch (c) 1184 { 1185 case '^': 1186 { 1187 if ( /* If at start of pattern, it's an operator. */ 1188 p == pattern + 1 1189 /* If context independent, it's an operator. */ 1190 || syntax & RE_CONTEXT_INDEP_ANCHORS 1191 /* Otherwise, depends on what's come before. */ 1192 || at_begline_loc_p (pattern, p, syntax)) 1193 BUF_PUSH (begline); 1194 else 1195 goto normal_char; 1196 } 1197 break; 1198 1199 1200 case '$': 1201 { 1202 if ( /* If at end of pattern, it's an operator. */ 1203 p == pend 1204 /* If context independent, it's an operator. */ 1205 || syntax & RE_CONTEXT_INDEP_ANCHORS 1206 /* Otherwise, depends on what's next. */ 1207 || at_endline_loc_p (p, pend, syntax)) 1208 BUF_PUSH (endline); 1209 else 1210 goto normal_char; 1211 } 1212 break; 1213 1214 1215 case '+': 1216 case '?': 1217 if ((syntax & RE_BK_PLUS_QM) 1218 || (syntax & RE_LIMITED_OPS)) 1219 goto normal_char; 1220 handle_plus: 1221 case '*': 1222 /* If there is no previous pattern... */ 1223 if (!laststart) 1224 { 1225 if (syntax & RE_CONTEXT_INVALID_OPS) 1226 return REG_BADRPT; 1227 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 1228 goto normal_char; 1229 } 1230 1231 { 1232 /* Are we optimizing this jump? */ 1233 boolean keep_string_p = false; 1234 1235 /* 1 means zero (many) matches is allowed. */ 1236 char zero_times_ok = 0, many_times_ok = 0; 1237 1238 /* If there is a sequence of repetition chars, collapse it 1239 down to just one (the right one). We can't combine 1240 interval operators with these because of, e.g., `a{2}*', 1241 which should only match an even number of `a's. */ 1242 1243 for (;;) 1244 { 1245 zero_times_ok |= c != '+'; 1246 many_times_ok |= c != '?'; 1247 1248 if (p == pend) 1249 break; 1250 1251 PATFETCH (c); 1252 1253 if (c == '*' 1254 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 1255 ; 1256 1257 else if (syntax & RE_BK_PLUS_QM && c == '\\') 1258 { 1259 if (p == pend) return REG_EESCAPE; 1260 1261 PATFETCH (c1); 1262 if (!(c1 == '+' || c1 == '?')) 1263 { 1264 PATUNFETCH; 1265 PATUNFETCH; 1266 break; 1267 } 1268 1269 c = c1; 1270 } 1271 else 1272 { 1273 PATUNFETCH; 1274 break; 1275 } 1276 1277 /* If we get here, we found another repeat character. */ 1278 } 1279 1280 /* Star, etc. applied to an empty pattern is equivalent 1281 to an empty pattern. */ 1282 if (!laststart) 1283 break; 1284 1285 /* Now we know whether or not zero matches is allowed 1286 and also whether or not two or more matches is allowed. */ 1287 if (many_times_ok) 1288 { /* More than one repetition is allowed, so put in at the 1289 end a backward relative jump from `b' to before the next 1290 jump we're going to put in below (which jumps from 1291 laststart to after this jump). 1292 1293 But if we are at the `*' in the exact sequence `.*\n', 1294 insert an unconditional jump backwards to the ., 1295 instead of the beginning of the loop. This way we only 1296 push a failure point once, instead of every time 1297 through the loop. */ 1298 assert (p - 1 > pattern); 1299 1300 /* Allocate the space for the jump. */ 1301 GET_BUFFER_SPACE (3); 1302 1303 /* We know we are not at the first character of the pattern, 1304 because laststart was nonzero. And we've already 1305 incremented `p', by the way, to be the character after 1306 the `*'. Do we have to do something analogous here 1307 for null bytes, because of RE_DOT_NOT_NULL? */ 1308 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 1309 && zero_times_ok 1310 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 1311 && !(syntax & RE_DOT_NEWLINE)) 1312 { /* We have .*\n. */ 1313 STORE_JUMP (jump, b, laststart); 1314 keep_string_p = true; 1315 } 1316 else 1317 /* Anything else. */ 1318 STORE_JUMP (maybe_pop_jump, b, laststart - 3); 1319 1320 /* We've added more stuff to the buffer. */ 1321 b += 3; 1322 } 1323 1324 /* On failure, jump from laststart to b + 3, which will be the 1325 end of the buffer after this jump is inserted. */ 1326 GET_BUFFER_SPACE (3); 1327 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 1328 : on_failure_jump, 1329 laststart, b + 3); 1330 pending_exact = 0; 1331 b += 3; 1332 1333 if (!zero_times_ok) 1334 { 1335 /* At least one repetition is required, so insert a 1336 `dummy_failure_jump' before the initial 1337 `on_failure_jump' instruction of the loop. This 1338 effects a skip over that instruction the first time 1339 we hit that loop. */ 1340 GET_BUFFER_SPACE (3); 1341 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); 1342 b += 3; 1343 } 1344 } 1345 break; 1346 1347 1348 case '.': 1349 laststart = b; 1350 BUF_PUSH (anychar); 1351 break; 1352 1353 1354 case '[': 1355 { 1356 boolean had_char_class = false; 1357 1358 if (p == pend) return REG_EBRACK; 1359 1360 /* Ensure that we have enough space to push a charset: the 1361 opcode, the length count, and the bitset; 34 bytes in all. */ 1362 GET_BUFFER_SPACE (34); 1363 1364 laststart = b; 1365 1366 /* We test `*p == '^' twice, instead of using an if 1367 statement, so we only need one BUF_PUSH. */ 1368 BUF_PUSH (*p == '^' ? charset_not : charset); 1369 if (*p == '^') 1370 p++; 1371 1372 /* Remember the first position in the bracket expression. */ 1373 p1 = p; 1374 1375 /* Push the number of bytes in the bitmap. */ 1376 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 1377 1378 /* Clear the whole map. */ 1379 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 1380 1381 /* charset_not matches newline according to a syntax bit. */ 1382 if ((re_opcode_t) b[-2] == charset_not 1383 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 1384 SET_LIST_BIT ('\n'); 1385 1386 /* Read in characters and ranges, setting map bits. */ 1387 for (;;) 1388 { 1389 if (p == pend) return REG_EBRACK; 1390 1391 PATFETCH (c); 1392 1393 /* \ might escape characters inside [...] and [^...]. */ 1394 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 1395 { 1396 if (p == pend) return REG_EESCAPE; 1397 1398 PATFETCH (c1); 1399 SET_LIST_BIT (c1); 1400 continue; 1401 } 1402 1403 /* Could be the end of the bracket expression. If it's 1404 not (i.e., when the bracket expression is `[]' so 1405 far), the ']' character bit gets set way below. */ 1406 if (c == ']' && p != p1 + 1) 1407 break; 1408 1409 /* Look ahead to see if it's a range when the last thing 1410 was a character class. */ 1411 if (had_char_class && c == '-' && *p != ']') 1412 return REG_ERANGE; 1413 1414 /* Look ahead to see if it's a range when the last thing 1415 was a character: if this is a hyphen not at the 1416 beginning or the end of a list, then it's the range 1417 operator. */ 1418 if (c == '-' 1419 && !(p - 2 >= pattern && p[-2] == '[') 1420 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 1421 && *p != ']') 1422 { 1423 reg_errcode_t ret 1424 = compile_range (&p, pend, translate, syntax, b); 1425 if (ret != REG_NOERROR) return ret; 1426 } 1427 1428 else if (p[0] == '-' && p[1] != ']') 1429 { /* This handles ranges made up of characters only. */ 1430 reg_errcode_t ret; 1431 1432 /* Move past the `-'. */ 1433 PATFETCH (c1); 1434 1435 ret = compile_range (&p, pend, translate, syntax, b); 1436 if (ret != REG_NOERROR) return ret; 1437 } 1438 1439 /* See if we're at the beginning of a possible character 1440 class. */ 1441 1442 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 1443 { /* Leave room for the null. */ 1444 char str[CHAR_CLASS_MAX_LENGTH + 1]; 1445 1446 PATFETCH (c); 1447 c1 = 0; 1448 1449 /* If pattern is `[[:'. */ 1450 if (p == pend) return REG_EBRACK; 1451 1452 for (;;) 1453 { 1454 PATFETCH (c); 1455 if (c == ':' || c == ']' || p == pend 1456 || c1 == CHAR_CLASS_MAX_LENGTH) 1457 break; 1458 str[c1++] = c; 1459 } 1460 str[c1] = '\0'; 1461 1462 /* If isn't a word bracketed by `[:' and:`]': 1463 undo the ending character, the letters, and leave 1464 the leading `:' and `[' (but set bits for them). */ 1465 if (c == ':' && *p == ']') 1466 { 1467 int ch; 1468 boolean is_alnum = STREQ (str, "alnum"); 1469 boolean is_alpha = STREQ (str, "alpha"); 1470 boolean is_blank = STREQ (str, "blank"); 1471 boolean is_cntrl = STREQ (str, "cntrl"); 1472 boolean is_digit = STREQ (str, "digit"); 1473 boolean is_graph = STREQ (str, "graph"); 1474 boolean is_lower = STREQ (str, "lower"); 1475 boolean is_print = STREQ (str, "print"); 1476 boolean is_punct = STREQ (str, "punct"); 1477 boolean is_space = STREQ (str, "space"); 1478 boolean is_upper = STREQ (str, "upper"); 1479 boolean is_xdigit = STREQ (str, "xdigit"); 1480 1481 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; 1482 1483 /* Throw away the ] at the end of the character 1484 class. */ 1485 PATFETCH (c); 1486 1487 if (p == pend) return REG_EBRACK; 1488 1489 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 1490 { 1491 if ( (is_alnum && ISALNUM (ch)) 1492 || (is_alpha && ISALPHA (ch)) 1493 || (is_blank && ISBLANK (ch)) 1494 || (is_cntrl && ISCNTRL (ch)) 1495 || (is_digit && ISDIGIT (ch)) 1496 || (is_graph && ISGRAPH (ch)) 1497 || (is_lower && ISLOWER (ch)) 1498 || (is_print && ISPRINT (ch)) 1499 || (is_punct && ISPUNCT (ch)) 1500 || (is_space && ISSPACE (ch)) 1501 || (is_upper && ISUPPER (ch)) 1502 || (is_xdigit && ISXDIGIT (ch))) 1503 SET_LIST_BIT (ch); 1504 } 1505 had_char_class = true; 1506 } 1507 else 1508 { 1509 c1++; 1510 while (c1--) 1511 PATUNFETCH; 1512 SET_LIST_BIT ('['); 1513 SET_LIST_BIT (':'); 1514 had_char_class = false; 1515 } 1516 } 1517 else 1518 { 1519 had_char_class = false; 1520 SET_LIST_BIT (c); 1521 } 1522 } 1523 1524 /* Discard any (non)matching list bytes that are all 0 at the 1525 end of the map. Decrease the map-length byte too. */ 1526 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 1527 b[-1]--; 1528 b += b[-1]; 1529 } 1530 break; 1531 1532 1533 case '(': 1534 if (syntax & RE_NO_BK_PARENS) 1535 goto handle_open; 1536 else 1537 goto normal_char; 1538 1539 1540 case ')': 1541 if (syntax & RE_NO_BK_PARENS) 1542 goto handle_close; 1543 else 1544 goto normal_char; 1545 1546 1547 case '\n': 1548 if (syntax & RE_NEWLINE_ALT) 1549 goto handle_alt; 1550 else 1551 goto normal_char; 1552 1553 1554 case '|': 1555 if (syntax & RE_NO_BK_VBAR) 1556 goto handle_alt; 1557 else 1558 goto normal_char; 1559 1560 1561 case '{': 1562 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 1563 goto handle_interval; 1564 else 1565 goto normal_char; 1566 1567 1568 case '\\': 1569 if (p == pend) return REG_EESCAPE; 1570 1571 /* Do not translate the character after the \, so that we can 1572 distinguish, e.g., \B from \b, even if we normally would 1573 translate, e.g., B to b. */ 1574 PATFETCH_RAW (c); 1575 1576 switch (c) 1577 { 1578 case '(': 1579 if (syntax & RE_NO_BK_PARENS) 1580 goto normal_backslash; 1581 1582 handle_open: 1583 bufp->re_nsub++; 1584 regnum++; 1585 1586 if (COMPILE_STACK_FULL) 1587 { 1588 RETALLOC (compile_stack.stack, compile_stack.size << 1, 1589 compile_stack_elt_t); 1590 if (compile_stack.stack == NULL) return REG_ESPACE; 1591 1592 compile_stack.size <<= 1; 1593 } 1594 1595 /* These are the values to restore when we hit end of this 1596 group. They are all relative offsets, so that if the 1597 whole pattern moves because of realloc, they will still 1598 be valid. */ 1599 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; 1600 COMPILE_STACK_TOP.fixup_alt_jump 1601 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 1602 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 1603 COMPILE_STACK_TOP.regnum = regnum; 1604 1605 /* We will eventually replace the 0 with the number of 1606 groups inner to this one. But do not push a 1607 start_memory for groups beyond the last one we can 1608 represent in the compiled pattern. */ 1609 if (regnum <= MAX_REGNUM) 1610 { 1611 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; 1612 BUF_PUSH_3 (start_memory, regnum, 0); 1613 } 1614 1615 compile_stack.avail++; 1616 1617 fixup_alt_jump = 0; 1618 laststart = 0; 1619 begalt = b; 1620 /* If we've reached MAX_REGNUM groups, then this open 1621 won't actually generate any code, so we'll have to 1622 clear pending_exact explicitly. */ 1623 pending_exact = 0; 1624 break; 1625 1626 1627 case ')': 1628 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 1629 1630 if (COMPILE_STACK_EMPTY) 1631 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1632 goto normal_backslash; 1633 else 1634 return REG_ERPAREN; 1635 1636 handle_close: 1637 if (fixup_alt_jump) 1638 { /* Push a dummy failure point at the end of the 1639 alternative for a possible future 1640 `pop_failure_jump' to pop. See comments at 1641 `push_dummy_failure' in `re_match_2'. */ 1642 BUF_PUSH (push_dummy_failure); 1643 1644 /* We allocated space for this jump when we assigned 1645 to `fixup_alt_jump', in the `handle_alt' case below. */ 1646 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 1647 } 1648 1649 /* See similar code for backslashed left paren above. */ 1650 if (COMPILE_STACK_EMPTY) 1651 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1652 goto normal_char; 1653 else 1654 return REG_ERPAREN; 1655 1656 /* Since we just checked for an empty stack above, this 1657 ``can't happen''. */ 1658 assert (compile_stack.avail != 0); 1659 { 1660 /* We don't just want to restore into `regnum', because 1661 later groups should continue to be numbered higher, 1662 as in `(ab)c(de)' -- the second group is #2. */ 1663 regnum_t this_group_regnum; 1664 1665 compile_stack.avail--; 1666 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 1667 fixup_alt_jump 1668 = COMPILE_STACK_TOP.fixup_alt_jump 1669 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 1670 : 0; 1671 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; 1672 this_group_regnum = COMPILE_STACK_TOP.regnum; 1673 /* If we've reached MAX_REGNUM groups, then this open 1674 won't actually generate any code, so we'll have to 1675 clear pending_exact explicitly. */ 1676 pending_exact = 0; 1677 1678 /* We're at the end of the group, so now we know how many 1679 groups were inside this one. */ 1680 if (this_group_regnum <= MAX_REGNUM) 1681 { 1682 unsigned char *inner_group_loc 1683 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; 1684 1685 *inner_group_loc = regnum - this_group_regnum; 1686 BUF_PUSH_3 (stop_memory, this_group_regnum, 1687 regnum - this_group_regnum); 1688 } 1689 } 1690 break; 1691 1692 1693 case '|': /* `\|'. */ 1694 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 1695 goto normal_backslash; 1696 handle_alt: 1697 if (syntax & RE_LIMITED_OPS) 1698 goto normal_char; 1699 1700 /* Insert before the previous alternative a jump which 1701 jumps to this alternative if the former fails. */ 1702 GET_BUFFER_SPACE (3); 1703 INSERT_JUMP (on_failure_jump, begalt, b + 6); 1704 pending_exact = 0; 1705 b += 3; 1706 1707 /* The alternative before this one has a jump after it 1708 which gets executed if it gets matched. Adjust that 1709 jump so it will jump to this alternative's analogous 1710 jump (put in below, which in turn will jump to the next 1711 (if any) alternative's such jump, etc.). The last such 1712 jump jumps to the correct final destination. A picture: 1713 _____ _____ 1714 | | | | 1715 | v | v 1716 a | b | c 1717 1718 If we are at `b', then fixup_alt_jump right now points to a 1719 three-byte space after `a'. We'll put in the jump, set 1720 fixup_alt_jump to right after `b', and leave behind three 1721 bytes which we'll fill in when we get to after `c'. */ 1722 1723 if (fixup_alt_jump) 1724 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 1725 1726 /* Mark and leave space for a jump after this alternative, 1727 to be filled in later either by next alternative or 1728 when know we're at the end of a series of alternatives. */ 1729 fixup_alt_jump = b; 1730 GET_BUFFER_SPACE (3); 1731 b += 3; 1732 1733 laststart = 0; 1734 begalt = b; 1735 break; 1736 1737 1738 case '{': 1739 /* If \{ is a literal. */ 1740 if (!(syntax & RE_INTERVALS) 1741 /* If we're at `\{' and it's not the open-interval 1742 operator. */ 1743 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1744 || (p - 2 == pattern && p == pend)) 1745 goto normal_backslash; 1746 1747 handle_interval: 1748 { 1749 /* If got here, then the syntax allows intervals. */ 1750 1751 /* At least (most) this many matches must be made. */ 1752 int lower_bound = -1, upper_bound = -1; 1753 1754 beg_interval = p - 1; 1755 1756 if (p == pend) 1757 { 1758 if (syntax & RE_NO_BK_BRACES) 1759 goto unfetch_interval; 1760 else 1761 return REG_EBRACE; 1762 } 1763 1764 GET_UNSIGNED_NUMBER (lower_bound); 1765 1766 if (c == ',') 1767 { 1768 GET_UNSIGNED_NUMBER (upper_bound); 1769 if (upper_bound < 0) upper_bound = RE_DUP_MAX; 1770 } 1771 else 1772 /* Interval such as `{1}' => match exactly once. */ 1773 upper_bound = lower_bound; 1774 1775 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 1776 || lower_bound > upper_bound) 1777 { 1778 if (syntax & RE_NO_BK_BRACES) 1779 goto unfetch_interval; 1780 else 1781 return REG_BADBR; 1782 } 1783 1784 if (!(syntax & RE_NO_BK_BRACES)) 1785 { 1786 if (c != '\\') return REG_EBRACE; 1787 1788 PATFETCH (c); 1789 } 1790 1791 if (c != '}') 1792 { 1793 if (syntax & RE_NO_BK_BRACES) 1794 goto unfetch_interval; 1795 else 1796 return REG_BADBR; 1797 } 1798 1799 /* We just parsed a valid interval. */ 1800 1801 /* If it's invalid to have no preceding re. */ 1802 if (!laststart) 1803 { 1804 if (syntax & RE_CONTEXT_INVALID_OPS) 1805 return REG_BADRPT; 1806 else if (syntax & RE_CONTEXT_INDEP_OPS) 1807 laststart = b; 1808 else 1809 goto unfetch_interval; 1810 } 1811 1812 /* If the upper bound is zero, don't want to succeed at 1813 all; jump from `laststart' to `b + 3', which will be 1814 the end of the buffer after we insert the jump. */ 1815 if (upper_bound == 0) 1816 { 1817 GET_BUFFER_SPACE (3); 1818 INSERT_JUMP (jump, laststart, b + 3); 1819 b += 3; 1820 } 1821 1822 /* Otherwise, we have a nontrivial interval. When 1823 we're all done, the pattern will look like: 1824 set_number_at <jump count> <upper bound> 1825 set_number_at <succeed_n count> <lower bound> 1826 succeed_n <after jump addr> <succed_n count> 1827 <body of loop> 1828 jump_n <succeed_n addr> <jump count> 1829 (The upper bound and `jump_n' are omitted if 1830 `upper_bound' is 1, though.) */ 1831 else 1832 { /* If the upper bound is > 1, we need to insert 1833 more at the end of the loop. */ 1834 unsigned nbytes = 10 + (upper_bound > 1) * 10; 1835 1836 GET_BUFFER_SPACE (nbytes); 1837 1838 /* Initialize lower bound of the `succeed_n', even 1839 though it will be set during matching by its 1840 attendant `set_number_at' (inserted next), 1841 because `re_compile_fastmap' needs to know. 1842 Jump to the `jump_n' we might insert below. */ 1843 INSERT_JUMP2 (succeed_n, laststart, 1844 b + 5 + (upper_bound > 1) * 5, 1845 lower_bound); 1846 b += 5; 1847 1848 /* Code to initialize the lower bound. Insert 1849 before the `succeed_n'. The `5' is the last two 1850 bytes of this `set_number_at', plus 3 bytes of 1851 the following `succeed_n'. */ 1852 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 1853 b += 5; 1854 1855 if (upper_bound > 1) 1856 { /* More than one repetition is allowed, so 1857 append a backward jump to the `succeed_n' 1858 that starts this interval. 1859 1860 When we've reached this during matching, 1861 we'll have matched the interval once, so 1862 jump back only `upper_bound - 1' times. */ 1863 STORE_JUMP2 (jump_n, b, laststart + 5, 1864 upper_bound - 1); 1865 b += 5; 1866 1867 /* The location we want to set is the second 1868 parameter of the `jump_n'; that is `b-2' as 1869 an absolute address. `laststart' will be 1870 the `set_number_at' we're about to insert; 1871 `laststart+3' the number to set, the source 1872 for the relative address. But we are 1873 inserting into the middle of the pattern -- 1874 so everything is getting moved up by 5. 1875 Conclusion: (b - 2) - (laststart + 3) + 5, 1876 i.e., b - laststart. 1877 1878 We insert this at the beginning of the loop 1879 so that if we fail during matching, we'll 1880 reinitialize the bounds. */ 1881 insert_op2 (set_number_at, laststart, b - laststart, 1882 upper_bound - 1, b); 1883 b += 5; 1884 } 1885 } 1886 pending_exact = 0; 1887 beg_interval = NULL; 1888 } 1889 break; 1890 1891 unfetch_interval: 1892 /* If an invalid interval, match the characters as literals. */ 1893 assert (beg_interval); 1894 p = beg_interval; 1895 beg_interval = NULL; 1896 1897 /* normal_char and normal_backslash need `c'. */ 1898 PATFETCH (c); 1899 1900 if (!(syntax & RE_NO_BK_BRACES)) 1901 { 1902 if (p > pattern && p[-1] == '\\') 1903 goto normal_backslash; 1904 } 1905 goto normal_char; 1906 1907 #ifdef emacs 1908 /* There is no way to specify the before_dot and after_dot 1909 operators. rms says this is ok. --karl */ 1910 case '=': 1911 BUF_PUSH (at_dot); 1912 break; 1913 1914 case 's': 1915 laststart = b; 1916 PATFETCH (c); 1917 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 1918 break; 1919 1920 case 'S': 1921 laststart = b; 1922 PATFETCH (c); 1923 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 1924 break; 1925 #endif /* emacs */ 1926 1927 1928 case 'w': 1929 laststart = b; 1930 BUF_PUSH (wordchar); 1931 break; 1932 1933 1934 case 'W': 1935 laststart = b; 1936 BUF_PUSH (notwordchar); 1937 break; 1938 1939 1940 case '<': 1941 BUF_PUSH (wordbeg); 1942 break; 1943 1944 case '>': 1945 BUF_PUSH (wordend); 1946 break; 1947 1948 case 'b': 1949 BUF_PUSH (wordbound); 1950 break; 1951 1952 case 'B': 1953 BUF_PUSH (notwordbound); 1954 break; 1955 1956 case '`': 1957 BUF_PUSH (begbuf); 1958 break; 1959 1960 case '\'': 1961 BUF_PUSH (endbuf); 1962 break; 1963 1964 case '1': case '2': case '3': case '4': case '5': 1965 case '6': case '7': case '8': case '9': 1966 if (syntax & RE_NO_BK_REFS) 1967 goto normal_char; 1968 1969 c1 = c - '0'; 1970 1971 if (c1 > regnum) 1972 return REG_ESUBREG; 1973 1974 /* Can't back reference to a subexpression if inside of it. */ 1975 if (group_in_compile_stack (compile_stack, c1)) 1976 goto normal_char; 1977 1978 laststart = b; 1979 BUF_PUSH_2 (duplicate, c1); 1980 break; 1981 1982 1983 case '+': 1984 case '?': 1985 if (syntax & RE_BK_PLUS_QM) 1986 goto handle_plus; 1987 else 1988 goto normal_backslash; 1989 1990 default: 1991 normal_backslash: 1992 /* You might think it would be useful for \ to mean 1993 not to translate; but if we don't translate it 1994 it will never match anything. */ 1995 c = TRANSLATE (c); 1996 goto normal_char; 1997 } 1998 break; 1999 2000 2001 default: 2002 /* Expects the character in `c'. */ 2003 normal_char: 2004 /* If no exactn currently being built. */ 2005 if (!pending_exact 2006 2007 /* If last exactn not at current position. */ 2008 || pending_exact + *pending_exact + 1 != b 2009 2010 /* We have only one byte following the exactn for the count. */ 2011 || *pending_exact == (1 << BYTEWIDTH) - 1 2012 2013 /* If followed by a repetition operator. */ 2014 || *p == '*' || *p == '^' 2015 || ((syntax & RE_BK_PLUS_QM) 2016 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 2017 : (*p == '+' || *p == '?')) 2018 || ((syntax & RE_INTERVALS) 2019 && ((syntax & RE_NO_BK_BRACES) 2020 ? *p == '{' 2021 : (p[0] == '\\' && p[1] == '{')))) 2022 { 2023 /* Start building a new exactn. */ 2024 2025 laststart = b; 2026 2027 BUF_PUSH_2 (exactn, 0); 2028 pending_exact = b - 1; 2029 } 2030 2031 BUF_PUSH (c); 2032 (*pending_exact)++; 2033 break; 2034 } /* switch (c) */ 2035 } /* while p != pend */ 2036 2037 2038 /* Through the pattern now. */ 2039 2040 if (fixup_alt_jump) 2041 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2042 2043 if (!COMPILE_STACK_EMPTY) 2044 return REG_EPAREN; 2045 2046 free (compile_stack.stack); 2047 2048 /* We have succeeded; set the length of the buffer. */ 2049 bufp->used = b - bufp->buffer; 2050 2051 #ifdef DEBUG 2052 if (debug) 2053 { 2054 DEBUG_PRINT1 ("\nCompiled pattern: "); 2055 print_compiled_pattern (bufp); 2056 } 2057 #endif /* DEBUG */ 2058 2059 return REG_NOERROR; 2060 } /* regex_compile */ 2061 2062 /* Subroutines for `regex_compile'. */ 2063 2064 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 2065 2066 static void 2067 store_op1 (op, loc, arg) 2068 re_opcode_t op; 2069 unsigned char *loc; 2070 int arg; 2071 { 2072 *loc = (unsigned char) op; 2073 STORE_NUMBER (loc + 1, arg); 2074 } 2075 2076 2077 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 2078 2079 static void 2080 store_op2 (op, loc, arg1, arg2) 2081 re_opcode_t op; 2082 unsigned char *loc; 2083 int arg1, arg2; 2084 { 2085 *loc = (unsigned char) op; 2086 STORE_NUMBER (loc + 1, arg1); 2087 STORE_NUMBER (loc + 3, arg2); 2088 } 2089 2090 2091 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 2092 for OP followed by two-byte integer parameter ARG. */ 2093 2094 static void 2095 insert_op1 (op, loc, arg, end) 2096 re_opcode_t op; 2097 unsigned char *loc; 2098 int arg; 2099 unsigned char *end; 2100 { 2101 register unsigned char *pfrom = end; 2102 register unsigned char *pto = end + 3; 2103 2104 while (pfrom != loc) 2105 *--pto = *--pfrom; 2106 2107 store_op1 (op, loc, arg); 2108 } 2109 2110 2111 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 2112 2113 static void 2114 insert_op2 (op, loc, arg1, arg2, end) 2115 re_opcode_t op; 2116 unsigned char *loc; 2117 int arg1, arg2; 2118 unsigned char *end; 2119 { 2120 register unsigned char *pfrom = end; 2121 register unsigned char *pto = end + 5; 2122 2123 while (pfrom != loc) 2124 *--pto = *--pfrom; 2125 2126 store_op2 (op, loc, arg1, arg2); 2127 } 2128 2129 2130 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 2131 after an alternative or a begin-subexpression. We assume there is at 2132 least one character before the ^. */ 2133 2134 static boolean 2135 at_begline_loc_p (pattern, p, syntax) 2136 const char *pattern, *p; 2137 reg_syntax_t syntax; 2138 { 2139 const char *prev = p - 2; 2140 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 2141 2142 return 2143 /* After a subexpression? */ 2144 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 2145 /* After an alternative? */ 2146 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 2147 } 2148 2149 2150 /* The dual of at_begline_loc_p. This one is for $. We assume there is 2151 at least one character after the $, i.e., `P < PEND'. */ 2152 2153 static boolean 2154 at_endline_loc_p (p, pend, syntax) 2155 const char *p, *pend; 2156 int syntax; 2157 { 2158 const char *next = p; 2159 boolean next_backslash = *next == '\\'; 2160 const char *next_next = p + 1 < pend ? p + 1 : NULL; 2161 2162 return 2163 /* Before a subexpression? */ 2164 (syntax & RE_NO_BK_PARENS ? *next == ')' 2165 : next_backslash && next_next && *next_next == ')') 2166 /* Before an alternative? */ 2167 || (syntax & RE_NO_BK_VBAR ? *next == '|' 2168 : next_backslash && next_next && *next_next == '|'); 2169 } 2170 2171 2172 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 2173 false if it's not. */ 2174 2175 static boolean 2176 group_in_compile_stack (compile_stack, regnum) 2177 compile_stack_type compile_stack; 2178 regnum_t regnum; 2179 { 2180 int this_element; 2181 2182 for (this_element = compile_stack.avail - 1; 2183 this_element >= 0; 2184 this_element--) 2185 if (compile_stack.stack[this_element].regnum == regnum) 2186 return true; 2187 2188 return false; 2189 } 2190 2191 2192 /* Read the ending character of a range (in a bracket expression) from the 2193 uncompiled pattern *P_PTR (which ends at PEND). We assume the 2194 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 2195 Then we set the translation of all bits between the starting and 2196 ending characters (inclusive) in the compiled pattern B. 2197 2198 Return an error code. 2199 2200 We use these short variable names so we can use the same macros as 2201 `regex_compile' itself. */ 2202 2203 static reg_errcode_t 2204 compile_range (p_ptr, pend, translate, syntax, b) 2205 const char **p_ptr, *pend; 2206 char *translate; 2207 reg_syntax_t syntax; 2208 unsigned char *b; 2209 { 2210 unsigned this_char; 2211 2212 const char *p = *p_ptr; 2213 int range_start, range_end; 2214 2215 if (p == pend) 2216 return REG_ERANGE; 2217 2218 /* Even though the pattern is a signed `char *', we need to fetch 2219 with unsigned char *'s; if the high bit of the pattern character 2220 is set, the range endpoints will be negative if we fetch using a 2221 signed char *. 2222 2223 We also want to fetch the endpoints without translating them; the 2224 appropriate translation is done in the bit-setting loop below. */ 2225 range_start = ((unsigned char *) p)[-2]; 2226 range_end = ((unsigned char *) p)[0]; 2227 2228 /* Have to increment the pointer into the pattern string, so the 2229 caller isn't still at the ending character. */ 2230 (*p_ptr)++; 2231 2232 /* If the start is after the end, the range is empty. */ 2233 if (range_start > range_end) 2234 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 2235 2236 /* Here we see why `this_char' has to be larger than an `unsigned 2237 char' -- the range is inclusive, so if `range_end' == 0xff 2238 (assuming 8-bit characters), we would otherwise go into an infinite 2239 loop, since all characters <= 0xff. */ 2240 for (this_char = range_start; this_char <= range_end; this_char++) 2241 { 2242 SET_LIST_BIT (TRANSLATE (this_char)); 2243 } 2244 2245 return REG_NOERROR; 2246 } 2247 2248 /* Failure stack declarations and macros; both re_compile_fastmap and 2249 re_match_2 use a failure stack. These have to be macros because of 2250 REGEX_ALLOCATE. */ 2251 2252 2253 /* Number of failure points for which to initially allocate space 2254 when matching. If this number is exceeded, we allocate more 2255 space, so it is not a hard limit. */ 2256 #ifndef INIT_FAILURE_ALLOC 2257 #define INIT_FAILURE_ALLOC 5 2258 #endif 2259 2260 /* Roughly the maximum number of failure points on the stack. Would be 2261 exactly that if always used MAX_FAILURE_SPACE each time we failed. 2262 This is a variable only so users of regex can assign to it; we never 2263 change it ourselves. */ 2264 int re_max_failures = 2000; 2265 2266 typedef const unsigned char *fail_stack_elt_t; 2267 2268 typedef struct 2269 { 2270 fail_stack_elt_t *stack; 2271 unsigned size; 2272 unsigned avail; /* Offset of next open position. */ 2273 } fail_stack_type; 2274 2275 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 2276 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 2277 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 2278 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) 2279 2280 2281 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */ 2282 2283 #define INIT_FAIL_STACK() \ 2284 do { \ 2285 fail_stack.stack = (fail_stack_elt_t *) \ 2286 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 2287 \ 2288 if (fail_stack.stack == NULL) \ 2289 return -2; \ 2290 \ 2291 fail_stack.size = INIT_FAILURE_ALLOC; \ 2292 fail_stack.avail = 0; \ 2293 } while (0) 2294 2295 2296 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 2297 2298 Return 1 if succeeds, and 0 if either ran out of memory 2299 allocating space for it or it was already too large. 2300 2301 REGEX_REALLOCATE requires `destination' be declared. */ 2302 2303 #define DOUBLE_FAIL_STACK(fail_stack) \ 2304 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ 2305 ? 0 \ 2306 : ((fail_stack).stack = (fail_stack_elt_t *) \ 2307 REGEX_REALLOCATE ((fail_stack).stack, \ 2308 (fail_stack).size * sizeof (fail_stack_elt_t), \ 2309 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 2310 \ 2311 (fail_stack).stack == NULL \ 2312 ? 0 \ 2313 : ((fail_stack).size <<= 1, \ 2314 1))) 2315 2316 2317 /* Push PATTERN_OP on FAIL_STACK. 2318 2319 Return 1 if was able to do so and 0 if ran out of memory allocating 2320 space to do so. */ 2321 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \ 2322 ((FAIL_STACK_FULL () \ 2323 && !DOUBLE_FAIL_STACK (fail_stack)) \ 2324 ? 0 \ 2325 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \ 2326 1)) 2327 2328 /* This pushes an item onto the failure stack. Must be a four-byte 2329 value. Assumes the variable `fail_stack'. Probably should only 2330 be called from within `PUSH_FAILURE_POINT'. */ 2331 #define PUSH_FAILURE_ITEM(item) \ 2332 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item 2333 2334 /* The complement operation. Assumes `fail_stack' is nonempty. */ 2335 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail] 2336 2337 /* Used to omit pushing failure point id's when we're not debugging. */ 2338 #ifdef DEBUG 2339 #define DEBUG_PUSH PUSH_FAILURE_ITEM 2340 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM () 2341 #else 2342 #define DEBUG_PUSH(item) 2343 #define DEBUG_POP(item_addr) 2344 #endif 2345 2346 2347 /* Push the information about the state we will need 2348 if we ever fail back to it. 2349 2350 Requires variables fail_stack, regstart, regend, reg_info, and 2351 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be 2352 declared. 2353 2354 Does `return FAILURE_CODE' if runs out of memory. */ 2355 2356 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 2357 do { \ 2358 char *destination; \ 2359 /* Must be int, so when we don't save any registers, the arithmetic \ 2360 of 0 + -1 isn't done as unsigned. */ \ 2361 int this_reg; \ 2362 \ 2363 DEBUG_STATEMENT (failure_id++); \ 2364 DEBUG_STATEMENT (nfailure_points_pushed++); \ 2365 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 2366 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 2367 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 2368 \ 2369 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ 2370 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 2371 \ 2372 /* Ensure we have enough space allocated for what we will push. */ \ 2373 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 2374 { \ 2375 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 2376 return failure_code; \ 2377 \ 2378 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 2379 (fail_stack).size); \ 2380 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 2381 } \ 2382 \ 2383 /* Push the info, starting with the registers. */ \ 2384 DEBUG_PRINT1 ("\n"); \ 2385 \ 2386 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 2387 this_reg++) \ 2388 { \ 2389 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ 2390 DEBUG_STATEMENT (num_regs_pushed++); \ 2391 \ 2392 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2393 PUSH_FAILURE_ITEM (regstart[this_reg]); \ 2394 \ 2395 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2396 PUSH_FAILURE_ITEM (regend[this_reg]); \ 2397 \ 2398 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ 2399 DEBUG_PRINT2 (" match_null=%d", \ 2400 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 2401 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 2402 DEBUG_PRINT2 (" matched_something=%d", \ 2403 MATCHED_SOMETHING (reg_info[this_reg])); \ 2404 DEBUG_PRINT2 (" ever_matched=%d", \ 2405 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 2406 DEBUG_PRINT1 ("\n"); \ 2407 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \ 2408 } \ 2409 \ 2410 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ 2411 PUSH_FAILURE_ITEM (lowest_active_reg); \ 2412 \ 2413 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ 2414 PUSH_FAILURE_ITEM (highest_active_reg); \ 2415 \ 2416 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ 2417 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 2418 PUSH_FAILURE_ITEM (pattern_place); \ 2419 \ 2420 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ 2421 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 2422 size2); \ 2423 DEBUG_PRINT1 ("'\n"); \ 2424 PUSH_FAILURE_ITEM (string_place); \ 2425 \ 2426 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 2427 DEBUG_PUSH (failure_id); \ 2428 } while (0) 2429 2430 /* This is the number of items that are pushed and popped on the stack 2431 for each register. */ 2432 #define NUM_REG_ITEMS 3 2433 2434 /* Individual items aside from the registers. */ 2435 #ifdef DEBUG 2436 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 2437 #else 2438 #define NUM_NONREG_ITEMS 4 2439 #endif 2440 2441 /* We push at most this many items on the stack. */ 2442 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 2443 2444 /* We actually push this many items. */ 2445 #define NUM_FAILURE_ITEMS \ 2446 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ 2447 + NUM_NONREG_ITEMS) 2448 2449 /* How many items can still be added to the stack without overflowing it. */ 2450 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 2451 2452 2453 /* Pops what PUSH_FAIL_STACK pushes. 2454 2455 We restore into the parameters, all of which should be lvalues: 2456 STR -- the saved data position. 2457 PAT -- the saved pattern position. 2458 LOW_REG, HIGH_REG -- the highest and lowest active registers. 2459 REGSTART, REGEND -- arrays of string positions. 2460 REG_INFO -- array of information about each subexpression. 2461 2462 Also assumes the variables `fail_stack' and (if debugging), `bufp', 2463 `pend', `string1', `size1', `string2', and `size2'. */ 2464 2465 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 2466 { \ 2467 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ 2468 int this_reg; \ 2469 const unsigned char *string_temp; \ 2470 \ 2471 assert (!FAIL_STACK_EMPTY ()); \ 2472 \ 2473 /* Remove failure points and point to how many regs pushed. */ \ 2474 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 2475 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 2476 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 2477 \ 2478 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 2479 \ 2480 DEBUG_POP (&failure_id); \ 2481 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 2482 \ 2483 /* If the saved string location is NULL, it came from an \ 2484 on_failure_keep_string_jump opcode, and we want to throw away the \ 2485 saved NULL, thus retaining our current position in the string. */ \ 2486 string_temp = POP_FAILURE_ITEM (); \ 2487 if (string_temp != NULL) \ 2488 str = (const char *) string_temp; \ 2489 \ 2490 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ 2491 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 2492 DEBUG_PRINT1 ("'\n"); \ 2493 \ 2494 pat = (unsigned char *) POP_FAILURE_ITEM (); \ 2495 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ 2496 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 2497 \ 2498 /* Restore register info. */ \ 2499 high_reg = (unsigned) POP_FAILURE_ITEM (); \ 2500 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ 2501 \ 2502 low_reg = (unsigned) POP_FAILURE_ITEM (); \ 2503 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ 2504 \ 2505 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 2506 { \ 2507 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ 2508 \ 2509 reg_info[this_reg].word = POP_FAILURE_ITEM (); \ 2510 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ 2511 \ 2512 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \ 2513 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2514 \ 2515 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \ 2516 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2517 } \ 2518 \ 2519 DEBUG_STATEMENT (nfailure_points_popped++); \ 2520 } /* POP_FAILURE_POINT */ 2521 2522 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 2523 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 2524 characters can start a string that matches the pattern. This fastmap 2525 is used by re_search to skip quickly over impossible starting points. 2526 2527 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 2528 area as BUFP->fastmap. 2529 2530 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 2531 the pattern buffer. 2532 2533 Returns 0 if we succeed, -2 if an internal error. */ 2534 2535 int 2536 re_compile_fastmap (bufp) 2537 struct re_pattern_buffer *bufp; 2538 { 2539 int j, k; 2540 fail_stack_type fail_stack; 2541 #ifndef REGEX_MALLOC 2542 char *destination; 2543 #endif 2544 /* We don't push any register information onto the failure stack. */ 2545 unsigned num_regs = 0; 2546 2547 register char *fastmap = bufp->fastmap; 2548 unsigned char *pattern = bufp->buffer; 2549 unsigned long size = bufp->used; 2550 const unsigned char *p = pattern; 2551 register unsigned char *pend = pattern + size; 2552 2553 /* Assume that each path through the pattern can be null until 2554 proven otherwise. We set this false at the bottom of switch 2555 statement, to which we get only if a particular path doesn't 2556 match the empty string. */ 2557 boolean path_can_be_null = true; 2558 2559 /* We aren't doing a `succeed_n' to begin with. */ 2560 boolean succeed_n_p = false; 2561 2562 assert (fastmap != NULL && p != NULL); 2563 2564 INIT_FAIL_STACK (); 2565 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 2566 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 2567 bufp->can_be_null = 0; 2568 2569 while (p != pend || !FAIL_STACK_EMPTY ()) 2570 { 2571 if (p == pend) 2572 { 2573 bufp->can_be_null |= path_can_be_null; 2574 2575 /* Reset for next path. */ 2576 path_can_be_null = true; 2577 2578 p = fail_stack.stack[--fail_stack.avail]; 2579 } 2580 2581 /* We should never be about to go beyond the end of the pattern. */ 2582 assert (p < pend); 2583 2584 #ifdef SWITCH_ENUM_BUG 2585 switch ((int) ((re_opcode_t) *p++)) 2586 #else 2587 switch ((re_opcode_t) *p++) 2588 #endif 2589 { 2590 2591 /* I guess the idea here is to simply not bother with a fastmap 2592 if a backreference is used, since it's too hard to figure out 2593 the fastmap for the corresponding group. Setting 2594 `can_be_null' stops `re_search_2' from using the fastmap, so 2595 that is all we do. */ 2596 case duplicate: 2597 bufp->can_be_null = 1; 2598 return 0; 2599 2600 2601 /* Following are the cases which match a character. These end 2602 with `break'. */ 2603 2604 case exactn: 2605 fastmap[p[1]] = 1; 2606 break; 2607 2608 2609 case charset: 2610 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2611 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 2612 fastmap[j] = 1; 2613 break; 2614 2615 2616 case charset_not: 2617 /* Chars beyond end of map must be allowed. */ 2618 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 2619 fastmap[j] = 1; 2620 2621 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2622 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 2623 fastmap[j] = 1; 2624 break; 2625 2626 2627 case wordchar: 2628 for (j = 0; j < (1 << BYTEWIDTH); j++) 2629 if (SYNTAX (j) == Sword) 2630 fastmap[j] = 1; 2631 break; 2632 2633 2634 case notwordchar: 2635 for (j = 0; j < (1 << BYTEWIDTH); j++) 2636 if (SYNTAX (j) != Sword) 2637 fastmap[j] = 1; 2638 break; 2639 2640 2641 case anychar: 2642 /* `.' matches anything ... */ 2643 for (j = 0; j < (1 << BYTEWIDTH); j++) 2644 fastmap[j] = 1; 2645 2646 /* ... except perhaps newline. */ 2647 if (!(bufp->syntax & RE_DOT_NEWLINE)) 2648 fastmap['\n'] = 0; 2649 2650 /* Return if we have already set `can_be_null'; if we have, 2651 then the fastmap is irrelevant. Something's wrong here. */ 2652 else if (bufp->can_be_null) 2653 return 0; 2654 2655 /* Otherwise, have to check alternative paths. */ 2656 break; 2657 2658 2659 #ifdef emacs 2660 case syntaxspec: 2661 k = *p++; 2662 for (j = 0; j < (1 << BYTEWIDTH); j++) 2663 if (SYNTAX (j) == (enum syntaxcode) k) 2664 fastmap[j] = 1; 2665 break; 2666 2667 2668 case notsyntaxspec: 2669 k = *p++; 2670 for (j = 0; j < (1 << BYTEWIDTH); j++) 2671 if (SYNTAX (j) != (enum syntaxcode) k) 2672 fastmap[j] = 1; 2673 break; 2674 2675 2676 /* All cases after this match the empty string. These end with 2677 `continue'. */ 2678 2679 2680 case before_dot: 2681 case at_dot: 2682 case after_dot: 2683 continue; 2684 #endif /* not emacs */ 2685 2686 2687 case no_op: 2688 case begline: 2689 case endline: 2690 case begbuf: 2691 case endbuf: 2692 case wordbound: 2693 case notwordbound: 2694 case wordbeg: 2695 case wordend: 2696 case push_dummy_failure: 2697 continue; 2698 2699 2700 case jump_n: 2701 case pop_failure_jump: 2702 case maybe_pop_jump: 2703 case jump: 2704 case jump_past_alt: 2705 case dummy_failure_jump: 2706 EXTRACT_NUMBER_AND_INCR (j, p); 2707 p += j; 2708 if (j > 0) 2709 continue; 2710 2711 /* Jump backward implies we just went through the body of a 2712 loop and matched nothing. Opcode jumped to should be 2713 `on_failure_jump' or `succeed_n'. Just treat it like an 2714 ordinary jump. For a * loop, it has pushed its failure 2715 point already; if so, discard that as redundant. */ 2716 if ((re_opcode_t) *p != on_failure_jump 2717 && (re_opcode_t) *p != succeed_n) 2718 continue; 2719 2720 p++; 2721 EXTRACT_NUMBER_AND_INCR (j, p); 2722 p += j; 2723 2724 /* If what's on the stack is where we are now, pop it. */ 2725 if (!FAIL_STACK_EMPTY () 2726 && fail_stack.stack[fail_stack.avail - 1] == p) 2727 fail_stack.avail--; 2728 2729 continue; 2730 2731 2732 case on_failure_jump: 2733 case on_failure_keep_string_jump: 2734 handle_on_failure_jump: 2735 EXTRACT_NUMBER_AND_INCR (j, p); 2736 2737 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 2738 end of the pattern. We don't want to push such a point, 2739 since when we restore it above, entering the switch will 2740 increment `p' past the end of the pattern. We don't need 2741 to push such a point since we obviously won't find any more 2742 fastmap entries beyond `pend'. Such a pattern can match 2743 the null string, though. */ 2744 if (p + j < pend) 2745 { 2746 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 2747 return -2; 2748 } 2749 else 2750 bufp->can_be_null = 1; 2751 2752 if (succeed_n_p) 2753 { 2754 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 2755 succeed_n_p = false; 2756 } 2757 2758 continue; 2759 2760 2761 case succeed_n: 2762 /* Get to the number of times to succeed. */ 2763 p += 2; 2764 2765 /* Increment p past the n for when k != 0. */ 2766 EXTRACT_NUMBER_AND_INCR (k, p); 2767 if (k == 0) 2768 { 2769 p -= 4; 2770 succeed_n_p = true; /* Spaghetti code alert. */ 2771 goto handle_on_failure_jump; 2772 } 2773 continue; 2774 2775 2776 case set_number_at: 2777 p += 4; 2778 continue; 2779 2780 2781 case start_memory: 2782 case stop_memory: 2783 p += 2; 2784 continue; 2785 2786 2787 default: 2788 abort (); /* We have listed all the cases. */ 2789 } /* switch *p++ */ 2790 2791 /* Getting here means we have found the possible starting 2792 characters for one path of the pattern -- and that the empty 2793 string does not match. We need not follow this path further. 2794 Instead, look at the next alternative (remembered on the 2795 stack), or quit if no more. The test at the top of the loop 2796 does these things. */ 2797 path_can_be_null = false; 2798 p = pend; 2799 } /* while p */ 2800 2801 /* Set `can_be_null' for the last path (also the first path, if the 2802 pattern is empty). */ 2803 bufp->can_be_null |= path_can_be_null; 2804 return 0; 2805 } /* re_compile_fastmap */ 2806 2807 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 2808 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 2809 this memory for recording register information. STARTS and ENDS 2810 must be allocated using the malloc library routine, and must each 2811 be at least NUM_REGS * sizeof (regoff_t) bytes long. 2812 2813 If NUM_REGS == 0, then subsequent matches should allocate their own 2814 register data. 2815 2816 Unless this function is called, the first search or match using 2817 PATTERN_BUFFER will allocate its own register data, without 2818 freeing the old data. */ 2819 2820 void 2821 re_set_registers (bufp, regs, num_regs, starts, ends) 2822 struct re_pattern_buffer *bufp; 2823 struct re_registers *regs; 2824 unsigned num_regs; 2825 regoff_t *starts, *ends; 2826 { 2827 if (num_regs) 2828 { 2829 bufp->regs_allocated = REGS_REALLOCATE; 2830 regs->num_regs = num_regs; 2831 regs->start = starts; 2832 regs->end = ends; 2833 } 2834 else 2835 { 2836 bufp->regs_allocated = REGS_UNALLOCATED; 2837 regs->num_regs = 0; 2838 regs->start = regs->end = (regoff_t *) 0; 2839 } 2840 } 2841 2842 /* Searching routines. */ 2843 2844 /* Like re_search_2, below, but only one string is specified, and 2845 doesn't let you say where to stop matching. */ 2846 2847 int 2848 re_search (bufp, string, size, startpos, range, regs) 2849 struct re_pattern_buffer *bufp; 2850 const char *string; 2851 int size, startpos, range; 2852 struct re_registers *regs; 2853 { 2854 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 2855 regs, size); 2856 } 2857 2858 2859 /* Using the compiled pattern in BUFP->buffer, first tries to match the 2860 virtual concatenation of STRING1 and STRING2, starting first at index 2861 STARTPOS, then at STARTPOS + 1, and so on. 2862 2863 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 2864 2865 RANGE is how far to scan while trying to match. RANGE = 0 means try 2866 only at STARTPOS; in general, the last start tried is STARTPOS + 2867 RANGE. 2868 2869 In REGS, return the indices of the virtual concatenation of STRING1 2870 and STRING2 that matched the entire BUFP->buffer and its contained 2871 subexpressions. 2872 2873 Do not consider matching one past the index STOP in the virtual 2874 concatenation of STRING1 and STRING2. 2875 2876 We return either the position in the strings at which the match was 2877 found, -1 if no match, or -2 if error (such as failure 2878 stack overflow). */ 2879 2880 int 2881 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) 2882 struct re_pattern_buffer *bufp; 2883 const char *string1, *string2; 2884 int size1, size2; 2885 int startpos; 2886 int range; 2887 struct re_registers *regs; 2888 int stop; 2889 { 2890 int val; 2891 register char *fastmap = bufp->fastmap; 2892 register char *translate = bufp->translate; 2893 int total_size = size1 + size2; 2894 int endpos = startpos + range; 2895 2896 /* Check for out-of-range STARTPOS. */ 2897 if (startpos < 0 || startpos > total_size) 2898 return -1; 2899 2900 /* Fix up RANGE if it might eventually take us outside 2901 the virtual concatenation of STRING1 and STRING2. */ 2902 if (endpos < -1) 2903 range = -1 - startpos; 2904 else if (endpos > total_size) 2905 range = total_size - startpos; 2906 2907 /* If the search isn't to be a backwards one, don't waste time in a 2908 search for a pattern that must be anchored. */ 2909 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) 2910 { 2911 if (startpos > 0) 2912 return -1; 2913 else 2914 range = 1; 2915 } 2916 2917 /* Update the fastmap now if not correct already. */ 2918 if (fastmap && !bufp->fastmap_accurate) 2919 if (re_compile_fastmap (bufp) == -2) 2920 return -2; 2921 2922 /* Loop through the string, looking for a place to start matching. */ 2923 for (;;) 2924 { 2925 /* If a fastmap is supplied, skip quickly over characters that 2926 cannot be the start of a match. If the pattern can match the 2927 null string, however, we don't need to skip characters; we want 2928 the first null string. */ 2929 if (fastmap && startpos < total_size && !bufp->can_be_null) 2930 { 2931 if (range > 0) /* Searching forwards. */ 2932 { 2933 register const char *d; 2934 register int lim = 0; 2935 int irange = range; 2936 2937 if (startpos < size1 && startpos + range >= size1) 2938 lim = range - (size1 - startpos); 2939 2940 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 2941 2942 /* Written out as an if-else to avoid testing `translate' 2943 inside the loop. */ 2944 if (translate) 2945 while (range > lim 2946 && !fastmap[(unsigned char) 2947 translate[(unsigned char) *d++]]) 2948 range--; 2949 else 2950 while (range > lim && !fastmap[(unsigned char) *d++]) 2951 range--; 2952 2953 startpos += irange - range; 2954 } 2955 else /* Searching backwards. */ 2956 { 2957 register char c = (size1 == 0 || startpos >= size1 2958 ? string2[startpos - size1] 2959 : string1[startpos]); 2960 2961 if (!fastmap[(unsigned char) TRANSLATE (c)]) 2962 goto advance; 2963 } 2964 } 2965 2966 /* If can't match the null string, and that's all we have left, fail. */ 2967 if (range >= 0 && startpos == total_size && fastmap 2968 && !bufp->can_be_null) 2969 return -1; 2970 2971 val = re_match_2 (bufp, string1, size1, string2, size2, 2972 startpos, regs, stop); 2973 if (val >= 0) 2974 return startpos; 2975 2976 if (val == -2) 2977 return -2; 2978 2979 advance: 2980 if (!range) 2981 break; 2982 else if (range > 0) 2983 { 2984 range--; 2985 startpos++; 2986 } 2987 else 2988 { 2989 range++; 2990 startpos--; 2991 } 2992 } 2993 return -1; 2994 } /* re_search_2 */ 2995 2996 /* Declarations and macros for re_match_2. */ 2997 2998 static int bcmp_translate (); 2999 static boolean alt_match_null_string_p (), 3000 common_op_match_null_string_p (), 3001 group_match_null_string_p (); 3002 3003 /* Structure for per-register (a.k.a. per-group) information. 3004 This must not be longer than one word, because we push this value 3005 onto the failure stack. Other register information, such as the 3006 starting and ending positions (which are addresses), and the list of 3007 inner groups (which is a bits list) are maintained in separate 3008 variables. 3009 3010 We are making a (strictly speaking) nonportable assumption here: that 3011 the compiler will pack our bit fields into something that fits into 3012 the type of `word', i.e., is something that fits into one item on the 3013 failure stack. */ 3014 typedef union 3015 { 3016 fail_stack_elt_t word; 3017 struct 3018 { 3019 /* This field is one if this group can match the empty string, 3020 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 3021 #define MATCH_NULL_UNSET_VALUE 3 3022 unsigned match_null_string_p : 2; 3023 unsigned is_active : 1; 3024 unsigned matched_something : 1; 3025 unsigned ever_matched_something : 1; 3026 } bits; 3027 } register_info_type; 3028 3029 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 3030 #define IS_ACTIVE(R) ((R).bits.is_active) 3031 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) 3032 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 3033 3034 3035 /* Call this when have matched a real character; it sets `matched' flags 3036 for the subexpressions which we are currently inside. Also records 3037 that those subexprs have matched. */ 3038 #define SET_REGS_MATCHED() \ 3039 do \ 3040 { \ 3041 unsigned r; \ 3042 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 3043 { \ 3044 MATCHED_SOMETHING (reg_info[r]) \ 3045 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 3046 = 1; \ 3047 } \ 3048 } \ 3049 while (0) 3050 3051 3052 /* This converts PTR, a pointer into one of the search strings `string1' 3053 and `string2' into an offset from the beginning of that string. */ 3054 #define POINTER_TO_OFFSET(ptr) \ 3055 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) 3056 3057 /* Registers are set to a sentinel when they haven't yet matched. */ 3058 #define REG_UNSET_VALUE ((char *) -1) 3059 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 3060 3061 3062 /* Macros for dealing with the split strings in re_match_2. */ 3063 3064 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 3065 3066 /* Call before fetching a character with *d. This switches over to 3067 string2 if necessary. */ 3068 #define PREFETCH() \ 3069 while (d == dend) \ 3070 { \ 3071 /* End of string2 => fail. */ \ 3072 if (dend == end_match_2) \ 3073 goto fail; \ 3074 /* End of string1 => advance to string2. */ \ 3075 d = string2; \ 3076 dend = end_match_2; \ 3077 } 3078 3079 3080 /* Test if at very beginning or at very end of the virtual concatenation 3081 of `string1' and `string2'. If only one string, it's `string2'. */ 3082 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 3083 #define AT_STRINGS_END(d) ((d) == end2) 3084 3085 3086 /* Test if D points to a character which is word-constituent. We have 3087 two special cases to check for: if past the end of string1, look at 3088 the first character in string2; and if before the beginning of 3089 string2, look at the last character in string1. */ 3090 #define WORDCHAR_P(d) \ 3091 (SYNTAX ((d) == end1 ? *string2 \ 3092 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 3093 == Sword) 3094 3095 /* Test if the character before D and the one at D differ with respect 3096 to being word-constituent. */ 3097 #define AT_WORD_BOUNDARY(d) \ 3098 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 3099 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 3100 3101 3102 /* Free everything we malloc. */ 3103 #ifdef REGEX_MALLOC 3104 #define FREE_VAR(var) if (var) free (var); var = NULL 3105 #define FREE_VARIABLES() \ 3106 do { \ 3107 FREE_VAR (fail_stack.stack); \ 3108 FREE_VAR (regstart); \ 3109 FREE_VAR (regend); \ 3110 FREE_VAR (old_regstart); \ 3111 FREE_VAR (old_regend); \ 3112 FREE_VAR (best_regstart); \ 3113 FREE_VAR (best_regend); \ 3114 FREE_VAR (reg_info); \ 3115 FREE_VAR (reg_dummy); \ 3116 FREE_VAR (reg_info_dummy); \ 3117 } while (0) 3118 #else /* not REGEX_MALLOC */ 3119 /* Some MIPS systems (at least) want this to free alloca'd storage. */ 3120 #define FREE_VARIABLES() alloca (0) 3121 #endif /* not REGEX_MALLOC */ 3122 3123 3124 /* These values must meet several constraints. They must not be valid 3125 register values; since we have a limit of 255 registers (because 3126 we use only one byte in the pattern for the register number), we can 3127 use numbers larger than 255. They must differ by 1, because of 3128 NUM_FAILURE_ITEMS above. And the value for the lowest register must 3129 be larger than the value for the highest register, so we do not try 3130 to actually save any registers when none are active. */ 3131 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 3132 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 3133 3134 /* Matching routines. */ 3135 3136 #ifndef emacs /* Emacs never uses this. */ 3137 /* re_match is like re_match_2 except it takes only a single string. */ 3138 3139 int 3140 re_match (bufp, string, size, pos, regs) 3141 struct re_pattern_buffer *bufp; 3142 const char *string; 3143 int size, pos; 3144 struct re_registers *regs; 3145 { 3146 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); 3147 } 3148 #endif /* not emacs */ 3149 3150 3151 /* re_match_2 matches the compiled pattern in BUFP against the 3152 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 3153 and SIZE2, respectively). We start matching at POS, and stop 3154 matching at STOP. 3155 3156 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 3157 store offsets for the substring each group matched in REGS. See the 3158 documentation for exactly how many groups we fill. 3159 3160 We return -1 if no match, -2 if an internal error (such as the 3161 failure stack overflowing). Otherwise, we return the length of the 3162 matched substring. */ 3163 3164 int 3165 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 3166 struct re_pattern_buffer *bufp; 3167 const char *string1, *string2; 3168 int size1, size2; 3169 int pos; 3170 struct re_registers *regs; 3171 int stop; 3172 { 3173 /* General temporaries. */ 3174 int mcnt; 3175 unsigned char *p1; 3176 3177 /* Just past the end of the corresponding string. */ 3178 const char *end1, *end2; 3179 3180 /* Pointers into string1 and string2, just past the last characters in 3181 each to consider matching. */ 3182 const char *end_match_1, *end_match_2; 3183 3184 /* Where we are in the data, and the end of the current string. */ 3185 const char *d, *dend; 3186 3187 /* Where we are in the pattern, and the end of the pattern. */ 3188 unsigned char *p = bufp->buffer; 3189 register unsigned char *pend = p + bufp->used; 3190 3191 /* We use this to map every character in the string. */ 3192 char *translate = bufp->translate; 3193 3194 /* Failure point stack. Each place that can handle a failure further 3195 down the line pushes a failure point on this stack. It consists of 3196 restart, regend, and reg_info for all registers corresponding to 3197 the subexpressions we're currently inside, plus the number of such 3198 registers, and, finally, two char *'s. The first char * is where 3199 to resume scanning the pattern; the second one is where to resume 3200 scanning the strings. If the latter is zero, the failure point is 3201 a ``dummy''; if a failure happens and the failure point is a dummy, 3202 it gets discarded and the next next one is tried. */ 3203 fail_stack_type fail_stack; 3204 #ifdef DEBUG 3205 static unsigned failure_id = 0; 3206 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 3207 #endif 3208 3209 /* We fill all the registers internally, independent of what we 3210 return, for use in backreferences. The number here includes 3211 an element for register zero. */ 3212 unsigned num_regs = bufp->re_nsub + 1; 3213 3214 /* The currently active registers. */ 3215 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3216 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3217 3218 /* Information on the contents of registers. These are pointers into 3219 the input strings; they record just what was matched (on this 3220 attempt) by a subexpression part of the pattern, that is, the 3221 regnum-th regstart pointer points to where in the pattern we began 3222 matching and the regnum-th regend points to right after where we 3223 stopped matching the regnum-th subexpression. (The zeroth register 3224 keeps track of what the whole pattern matches.) */ 3225 const char **regstart, **regend; 3226 3227 /* If a group that's operated upon by a repetition operator fails to 3228 match anything, then the register for its start will need to be 3229 restored because it will have been set to wherever in the string we 3230 are when we last see its open-group operator. Similarly for a 3231 register's end. */ 3232 const char **old_regstart, **old_regend; 3233 3234 /* The is_active field of reg_info helps us keep track of which (possibly 3235 nested) subexpressions we are currently in. The matched_something 3236 field of reg_info[reg_num] helps us tell whether or not we have 3237 matched any of the pattern so far this time through the reg_num-th 3238 subexpression. These two fields get reset each time through any 3239 loop their register is in. */ 3240 register_info_type *reg_info; 3241 3242 /* The following record the register info as found in the above 3243 variables when we find a match better than any we've seen before. 3244 This happens as we backtrack through the failure points, which in 3245 turn happens only if we have not yet matched the entire string. */ 3246 unsigned best_regs_set = false; 3247 const char **best_regstart, **best_regend; 3248 3249 /* Logically, this is `best_regend[0]'. But we don't want to have to 3250 allocate space for that if we're not allocating space for anything 3251 else (see below). Also, we never need info about register 0 for 3252 any of the other register vectors, and it seems rather a kludge to 3253 treat `best_regend' differently than the rest. So we keep track of 3254 the end of the best match so far in a separate variable. We 3255 initialize this to NULL so that when we backtrack the first time 3256 and need to test it, it's not garbage. */ 3257 const char *match_end = NULL; 3258 3259 /* Used when we pop values we don't care about. */ 3260 const char **reg_dummy; 3261 register_info_type *reg_info_dummy; 3262 3263 #ifdef DEBUG 3264 /* Counts the total number of registers pushed. */ 3265 unsigned num_regs_pushed = 0; 3266 #endif 3267 3268 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 3269 3270 INIT_FAIL_STACK (); 3271 3272 /* Do not bother to initialize all the register variables if there are 3273 no groups in the pattern, as it takes a fair amount of time. If 3274 there are groups, we include space for register 0 (the whole 3275 pattern), even though we never use it, since it simplifies the 3276 array indexing. We should fix this. */ 3277 if (bufp->re_nsub) 3278 { 3279 regstart = REGEX_TALLOC (num_regs, const char *); 3280 regend = REGEX_TALLOC (num_regs, const char *); 3281 old_regstart = REGEX_TALLOC (num_regs, const char *); 3282 old_regend = REGEX_TALLOC (num_regs, const char *); 3283 best_regstart = REGEX_TALLOC (num_regs, const char *); 3284 best_regend = REGEX_TALLOC (num_regs, const char *); 3285 reg_info = REGEX_TALLOC (num_regs, register_info_type); 3286 reg_dummy = REGEX_TALLOC (num_regs, const char *); 3287 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 3288 3289 if (!(regstart && regend && old_regstart && old_regend && reg_info 3290 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 3291 { 3292 FREE_VARIABLES (); 3293 return -2; 3294 } 3295 } 3296 #ifdef REGEX_MALLOC 3297 else 3298 { 3299 /* We must initialize all our variables to NULL, so that 3300 `FREE_VARIABLES' doesn't try to free them. */ 3301 regstart = regend = old_regstart = old_regend = best_regstart 3302 = best_regend = reg_dummy = NULL; 3303 reg_info = reg_info_dummy = (register_info_type *) NULL; 3304 } 3305 #endif /* REGEX_MALLOC */ 3306 3307 /* The starting position is bogus. */ 3308 if (pos < 0 || pos > size1 + size2) 3309 { 3310 FREE_VARIABLES (); 3311 return -1; 3312 } 3313 3314 /* Initialize subexpression text positions to -1 to mark ones that no 3315 start_memory/stop_memory has been seen for. Also initialize the 3316 register information struct. */ 3317 for (mcnt = 1; mcnt < num_regs; mcnt++) 3318 { 3319 regstart[mcnt] = regend[mcnt] 3320 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 3321 3322 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 3323 IS_ACTIVE (reg_info[mcnt]) = 0; 3324 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3325 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3326 } 3327 3328 /* We move `string1' into `string2' if the latter's empty -- but not if 3329 `string1' is null. */ 3330 if (size2 == 0 && string1 != NULL) 3331 { 3332 string2 = string1; 3333 size2 = size1; 3334 string1 = 0; 3335 size1 = 0; 3336 } 3337 end1 = string1 + size1; 3338 end2 = string2 + size2; 3339 3340 /* Compute where to stop matching, within the two strings. */ 3341 if (stop <= size1) 3342 { 3343 end_match_1 = string1 + stop; 3344 end_match_2 = string2; 3345 } 3346 else 3347 { 3348 end_match_1 = end1; 3349 end_match_2 = string2 + stop - size1; 3350 } 3351 3352 /* `p' scans through the pattern as `d' scans through the data. 3353 `dend' is the end of the input string that `d' points within. `d' 3354 is advanced into the following input string whenever necessary, but 3355 this happens before fetching; therefore, at the beginning of the 3356 loop, `d' can be pointing at the end of a string, but it cannot 3357 equal `string2'. */ 3358 if (size1 > 0 && pos <= size1) 3359 { 3360 d = string1 + pos; 3361 dend = end_match_1; 3362 } 3363 else 3364 { 3365 d = string2 + pos - size1; 3366 dend = end_match_2; 3367 } 3368 3369 DEBUG_PRINT1 ("The compiled pattern is: "); 3370 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 3371 DEBUG_PRINT1 ("The string to match is: `"); 3372 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 3373 DEBUG_PRINT1 ("'\n"); 3374 3375 /* This loops over pattern commands. It exits by returning from the 3376 function if the match is complete, or it drops through if the match 3377 fails at this starting point in the input data. */ 3378 for (;;) 3379 { 3380 DEBUG_PRINT2 ("\n0x%x: ", p); 3381 3382 if (p == pend) 3383 { /* End of pattern means we might have succeeded. */ 3384 DEBUG_PRINT1 ("end of pattern ... "); 3385 3386 /* If we haven't matched the entire string, and we want the 3387 longest match, try backtracking. */ 3388 if (d != end_match_2) 3389 { 3390 DEBUG_PRINT1 ("backtracking.\n"); 3391 3392 if (!FAIL_STACK_EMPTY ()) 3393 { /* More failure points to try. */ 3394 boolean same_str_p = (FIRST_STRING_P (match_end) 3395 == MATCHING_IN_FIRST_STRING); 3396 3397 /* If exceeds best match so far, save it. */ 3398 if (!best_regs_set 3399 || (same_str_p && d > match_end) 3400 || (!same_str_p && !MATCHING_IN_FIRST_STRING)) 3401 { 3402 best_regs_set = true; 3403 match_end = d; 3404 3405 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 3406 3407 for (mcnt = 1; mcnt < num_regs; mcnt++) 3408 { 3409 best_regstart[mcnt] = regstart[mcnt]; 3410 best_regend[mcnt] = regend[mcnt]; 3411 } 3412 } 3413 goto fail; 3414 } 3415 3416 /* If no failure points, don't restore garbage. */ 3417 else if (best_regs_set) 3418 { 3419 restore_best_regs: 3420 /* Restore best match. It may happen that `dend == 3421 end_match_1' while the restored d is in string2. 3422 For example, the pattern `x.*y.*z' against the 3423 strings `x-' and `y-z-', if the two strings are 3424 not consecutive in memory. */ 3425 DEBUG_PRINT1 ("Restoring best registers.\n"); 3426 3427 d = match_end; 3428 dend = ((d >= string1 && d <= end1) 3429 ? end_match_1 : end_match_2); 3430 3431 for (mcnt = 1; mcnt < num_regs; mcnt++) 3432 { 3433 regstart[mcnt] = best_regstart[mcnt]; 3434 regend[mcnt] = best_regend[mcnt]; 3435 } 3436 } 3437 } /* d != end_match_2 */ 3438 3439 DEBUG_PRINT1 ("Accepting match.\n"); 3440 3441 /* If caller wants register contents data back, do it. */ 3442 if (regs && !bufp->no_sub) 3443 { 3444 /* Have the register data arrays been allocated? */ 3445 if (bufp->regs_allocated == REGS_UNALLOCATED) 3446 { /* No. So allocate them with malloc. We need one 3447 extra element beyond `num_regs' for the `-1' marker 3448 GNU code uses. */ 3449 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 3450 regs->start = TALLOC (regs->num_regs, regoff_t); 3451 regs->end = TALLOC (regs->num_regs, regoff_t); 3452 if (regs->start == NULL || regs->end == NULL) 3453 return -2; 3454 bufp->regs_allocated = REGS_REALLOCATE; 3455 } 3456 else if (bufp->regs_allocated == REGS_REALLOCATE) 3457 { /* Yes. If we need more elements than were already 3458 allocated, reallocate them. If we need fewer, just 3459 leave it alone. */ 3460 if (regs->num_regs < num_regs + 1) 3461 { 3462 regs->num_regs = num_regs + 1; 3463 RETALLOC (regs->start, regs->num_regs, regoff_t); 3464 RETALLOC (regs->end, regs->num_regs, regoff_t); 3465 if (regs->start == NULL || regs->end == NULL) 3466 return -2; 3467 } 3468 } 3469 else 3470 assert (bufp->regs_allocated == REGS_FIXED); 3471 3472 /* Convert the pointer data in `regstart' and `regend' to 3473 indices. Register zero has to be set differently, 3474 since we haven't kept track of any info for it. */ 3475 if (regs->num_regs > 0) 3476 { 3477 regs->start[0] = pos; 3478 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 3479 : d - string2 + size1); 3480 } 3481 3482 /* Go through the first `min (num_regs, regs->num_regs)' 3483 registers, since that is all we initialized. */ 3484 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) 3485 { 3486 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 3487 regs->start[mcnt] = regs->end[mcnt] = -1; 3488 else 3489 { 3490 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); 3491 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); 3492 } 3493 } 3494 3495 /* If the regs structure we return has more elements than 3496 were in the pattern, set the extra elements to -1. If 3497 we (re)allocated the registers, this is the case, 3498 because we always allocate enough to have at least one 3499 -1 at the end. */ 3500 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) 3501 regs->start[mcnt] = regs->end[mcnt] = -1; 3502 } /* regs && !bufp->no_sub */ 3503 3504 FREE_VARIABLES (); 3505 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 3506 nfailure_points_pushed, nfailure_points_popped, 3507 nfailure_points_pushed - nfailure_points_popped); 3508 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 3509 3510 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 3511 ? string1 3512 : string2 - size1); 3513 3514 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 3515 3516 return mcnt; 3517 } 3518 3519 /* Otherwise match next pattern command. */ 3520 #ifdef SWITCH_ENUM_BUG 3521 switch ((int) ((re_opcode_t) *p++)) 3522 #else 3523 switch ((re_opcode_t) *p++) 3524 #endif 3525 { 3526 /* Ignore these. Used to ignore the n of succeed_n's which 3527 currently have n == 0. */ 3528 case no_op: 3529 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 3530 break; 3531 3532 3533 /* Match the next n pattern characters exactly. The following 3534 byte in the pattern defines n, and the n bytes after that 3535 are the characters to match. */ 3536 case exactn: 3537 mcnt = *p++; 3538 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 3539 3540 /* This is written out as an if-else so we don't waste time 3541 testing `translate' inside the loop. */ 3542 if (translate) 3543 { 3544 do 3545 { 3546 PREFETCH (); 3547 if (translate[(unsigned char) *d++] != (char) *p++) 3548 goto fail; 3549 } 3550 while (--mcnt); 3551 } 3552 else 3553 { 3554 do 3555 { 3556 PREFETCH (); 3557 if (*d++ != (char) *p++) goto fail; 3558 } 3559 while (--mcnt); 3560 } 3561 SET_REGS_MATCHED (); 3562 break; 3563 3564 3565 /* Match any character except possibly a newline or a null. */ 3566 case anychar: 3567 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 3568 3569 PREFETCH (); 3570 3571 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 3572 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 3573 goto fail; 3574 3575 SET_REGS_MATCHED (); 3576 DEBUG_PRINT2 (" Matched `%d'.\n", *d); 3577 d++; 3578 break; 3579 3580 3581 case charset: 3582 case charset_not: 3583 { 3584 register unsigned char c; 3585 boolean not = (re_opcode_t) *(p - 1) == charset_not; 3586 3587 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 3588 3589 PREFETCH (); 3590 c = TRANSLATE (*d); /* The character to match. */ 3591 3592 /* Cast to `unsigned' instead of `unsigned char' in case the 3593 bit list is a full 32 bytes long. */ 3594 if (c < (unsigned) (*p * BYTEWIDTH) 3595 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 3596 not = !not; 3597 3598 p += 1 + *p; 3599 3600 if (!not) goto fail; 3601 3602 SET_REGS_MATCHED (); 3603 d++; 3604 break; 3605 } 3606 3607 3608 /* The beginning of a group is represented by start_memory. 3609 The arguments are the register number in the next byte, and the 3610 number of groups inner to this one in the next. The text 3611 matched within the group is recorded (in the internal 3612 registers data structure) under the register number. */ 3613 case start_memory: 3614 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); 3615 3616 /* Find out if this group can match the empty string. */ 3617 p1 = p; /* To send to group_match_null_string_p. */ 3618 3619 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 3620 REG_MATCH_NULL_STRING_P (reg_info[*p]) 3621 = group_match_null_string_p (&p1, pend, reg_info); 3622 3623 /* Save the position in the string where we were the last time 3624 we were at this open-group operator in case the group is 3625 operated upon by a repetition operator, e.g., with `(a*)*b' 3626 against `ab'; then we want to ignore where we are now in 3627 the string in case this attempt to match fails. */ 3628 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3629 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 3630 : regstart[*p]; 3631 DEBUG_PRINT2 (" old_regstart: %d\n", 3632 POINTER_TO_OFFSET (old_regstart[*p])); 3633 3634 regstart[*p] = d; 3635 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 3636 3637 IS_ACTIVE (reg_info[*p]) = 1; 3638 MATCHED_SOMETHING (reg_info[*p]) = 0; 3639 3640 /* This is the new highest active register. */ 3641 highest_active_reg = *p; 3642 3643 /* If nothing was active before, this is the new lowest active 3644 register. */ 3645 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 3646 lowest_active_reg = *p; 3647 3648 /* Move past the register number and inner group count. */ 3649 p += 2; 3650 break; 3651 3652 3653 /* The stop_memory opcode represents the end of a group. Its 3654 arguments are the same as start_memory's: the register 3655 number, and the number of inner groups. */ 3656 case stop_memory: 3657 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); 3658 3659 /* We need to save the string position the last time we were at 3660 this close-group operator in case the group is operated 3661 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 3662 against `aba'; then we want to ignore where we are now in 3663 the string in case this attempt to match fails. */ 3664 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3665 ? REG_UNSET (regend[*p]) ? d : regend[*p] 3666 : regend[*p]; 3667 DEBUG_PRINT2 (" old_regend: %d\n", 3668 POINTER_TO_OFFSET (old_regend[*p])); 3669 3670 regend[*p] = d; 3671 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 3672 3673 /* This register isn't active anymore. */ 3674 IS_ACTIVE (reg_info[*p]) = 0; 3675 3676 /* If this was the only register active, nothing is active 3677 anymore. */ 3678 if (lowest_active_reg == highest_active_reg) 3679 { 3680 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3681 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3682 } 3683 else 3684 { /* We must scan for the new highest active register, since 3685 it isn't necessarily one less than now: consider 3686 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 3687 new highest active register is 1. */ 3688 unsigned char r = *p - 1; 3689 while (r > 0 && !IS_ACTIVE (reg_info[r])) 3690 r--; 3691 3692 /* If we end up at register zero, that means that we saved 3693 the registers as the result of an `on_failure_jump', not 3694 a `start_memory', and we jumped to past the innermost 3695 `stop_memory'. For example, in ((.)*) we save 3696 registers 1 and 2 as a result of the *, but when we pop 3697 back to the second ), we are at the stop_memory 1. 3698 Thus, nothing is active. */ 3699 if (r == 0) 3700 { 3701 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3702 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3703 } 3704 else 3705 highest_active_reg = r; 3706 } 3707 3708 /* If just failed to match something this time around with a 3709 group that's operated on by a repetition operator, try to 3710 force exit from the ``loop'', and restore the register 3711 information for this group that we had before trying this 3712 last match. */ 3713 if ((!MATCHED_SOMETHING (reg_info[*p]) 3714 || (re_opcode_t) p[-3] == start_memory) 3715 && (p + 2) < pend) 3716 { 3717 boolean is_a_jump_n = false; 3718 3719 p1 = p + 2; 3720 mcnt = 0; 3721 switch ((re_opcode_t) *p1++) 3722 { 3723 case jump_n: 3724 is_a_jump_n = true; 3725 case pop_failure_jump: 3726 case maybe_pop_jump: 3727 case jump: 3728 case dummy_failure_jump: 3729 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3730 if (is_a_jump_n) 3731 p1 += 2; 3732 break; 3733 3734 default: 3735 /* do nothing */ ; 3736 } 3737 p1 += mcnt; 3738 3739 /* If the next operation is a jump backwards in the pattern 3740 to an on_failure_jump right before the start_memory 3741 corresponding to this stop_memory, exit from the loop 3742 by forcing a failure after pushing on the stack the 3743 on_failure_jump's jump in the pattern, and d. */ 3744 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 3745 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) 3746 { 3747 /* If this group ever matched anything, then restore 3748 what its registers were before trying this last 3749 failed match, e.g., with `(a*)*b' against `ab' for 3750 regstart[1], and, e.g., with `((a*)*(b*)*)*' 3751 against `aba' for regend[3]. 3752 3753 Also restore the registers for inner groups for, 3754 e.g., `((a*)(b*))*' against `aba' (register 3 would 3755 otherwise get trashed). */ 3756 3757 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 3758 { 3759 unsigned r; 3760 3761 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 3762 3763 /* Restore this and inner groups' (if any) registers. */ 3764 for (r = *p; r < *p + *(p + 1); r++) 3765 { 3766 regstart[r] = old_regstart[r]; 3767 3768 /* xx why this test? */ 3769 if ((int) old_regend[r] >= (int) regstart[r]) 3770 regend[r] = old_regend[r]; 3771 } 3772 } 3773 p1++; 3774 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3775 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 3776 3777 goto fail; 3778 } 3779 } 3780 3781 /* Move past the register number and the inner group count. */ 3782 p += 2; 3783 break; 3784 3785 3786 /* \<digit> has been turned into a `duplicate' command which is 3787 followed by the numeric value of <digit> as the register number. */ 3788 case duplicate: 3789 { 3790 register const char *d2, *dend2; 3791 int regno = *p++; /* Get which register to match against. */ 3792 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 3793 3794 /* Can't back reference a group which we've never matched. */ 3795 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 3796 goto fail; 3797 3798 /* Where in input to try to start matching. */ 3799 d2 = regstart[regno]; 3800 3801 /* Where to stop matching; if both the place to start and 3802 the place to stop matching are in the same string, then 3803 set to the place to stop, otherwise, for now have to use 3804 the end of the first string. */ 3805 3806 dend2 = ((FIRST_STRING_P (regstart[regno]) 3807 == FIRST_STRING_P (regend[regno])) 3808 ? regend[regno] : end_match_1); 3809 for (;;) 3810 { 3811 /* If necessary, advance to next segment in register 3812 contents. */ 3813 while (d2 == dend2) 3814 { 3815 if (dend2 == end_match_2) break; 3816 if (dend2 == regend[regno]) break; 3817 3818 /* End of string1 => advance to string2. */ 3819 d2 = string2; 3820 dend2 = regend[regno]; 3821 } 3822 /* At end of register contents => success */ 3823 if (d2 == dend2) break; 3824 3825 /* If necessary, advance to next segment in data. */ 3826 PREFETCH (); 3827 3828 /* How many characters left in this segment to match. */ 3829 mcnt = dend - d; 3830 3831 /* Want how many consecutive characters we can match in 3832 one shot, so, if necessary, adjust the count. */ 3833 if (mcnt > dend2 - d2) 3834 mcnt = dend2 - d2; 3835 3836 /* Compare that many; failure if mismatch, else move 3837 past them. */ 3838 if (translate 3839 ? bcmp_translate (d, d2, mcnt, translate) 3840 : bcmp (d, d2, mcnt)) 3841 goto fail; 3842 d += mcnt, d2 += mcnt; 3843 } 3844 } 3845 break; 3846 3847 3848 /* begline matches the empty string at the beginning of the string 3849 (unless `not_bol' is set in `bufp'), and, if 3850 `newline_anchor' is set, after newlines. */ 3851 case begline: 3852 DEBUG_PRINT1 ("EXECUTING begline.\n"); 3853 3854 if (AT_STRINGS_BEG (d)) 3855 { 3856 if (!bufp->not_bol) break; 3857 } 3858 else if (d[-1] == '\n' && bufp->newline_anchor) 3859 { 3860 break; 3861 } 3862 /* In all other cases, we fail. */ 3863 goto fail; 3864 3865 3866 /* endline is the dual of begline. */ 3867 case endline: 3868 DEBUG_PRINT1 ("EXECUTING endline.\n"); 3869 3870 if (AT_STRINGS_END (d)) 3871 { 3872 if (!bufp->not_eol) break; 3873 } 3874 3875 /* We have to ``prefetch'' the next character. */ 3876 else if ((d == end1 ? *string2 : *d) == '\n' 3877 && bufp->newline_anchor) 3878 { 3879 break; 3880 } 3881 goto fail; 3882 3883 3884 /* Match at the very beginning of the data. */ 3885 case begbuf: 3886 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 3887 if (AT_STRINGS_BEG (d)) 3888 break; 3889 goto fail; 3890 3891 3892 /* Match at the very end of the data. */ 3893 case endbuf: 3894 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 3895 if (AT_STRINGS_END (d)) 3896 break; 3897 goto fail; 3898 3899 3900 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 3901 pushes NULL as the value for the string on the stack. Then 3902 `pop_failure_point' will keep the current value for the 3903 string, instead of restoring it. To see why, consider 3904 matching `foo\nbar' against `.*\n'. The .* matches the foo; 3905 then the . fails against the \n. But the next thing we want 3906 to do is match the \n against the \n; if we restored the 3907 string value, we would be back at the foo. 3908 3909 Because this is used only in specific cases, we don't need to 3910 check all the things that `on_failure_jump' does, to make 3911 sure the right things get saved on the stack. Hence we don't 3912 share its code. The only reason to push anything on the 3913 stack at all is that otherwise we would have to change 3914 `anychar's code to do something besides goto fail in this 3915 case; that seems worse than this. */ 3916 case on_failure_keep_string_jump: 3917 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 3918 3919 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3920 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 3921 3922 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 3923 break; 3924 3925 3926 /* Uses of on_failure_jump: 3927 3928 Each alternative starts with an on_failure_jump that points 3929 to the beginning of the next alternative. Each alternative 3930 except the last ends with a jump that in effect jumps past 3931 the rest of the alternatives. (They really jump to the 3932 ending jump of the following alternative, because tensioning 3933 these jumps is a hassle.) 3934 3935 Repeats start with an on_failure_jump that points past both 3936 the repetition text and either the following jump or 3937 pop_failure_jump back to this on_failure_jump. */ 3938 case on_failure_jump: 3939 on_failure: 3940 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 3941 3942 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3943 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 3944 3945 /* If this on_failure_jump comes right before a group (i.e., 3946 the original * applied to a group), save the information 3947 for that group and all inner ones, so that if we fail back 3948 to this point, the group's information will be correct. 3949 For example, in \(a*\)*\1, we need the preceding group, 3950 and in \(\(a*\)b*\)\2, we need the inner group. */ 3951 3952 /* We can't use `p' to check ahead because we push 3953 a failure point to `p + mcnt' after we do this. */ 3954 p1 = p; 3955 3956 /* We need to skip no_op's before we look for the 3957 start_memory in case this on_failure_jump is happening as 3958 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 3959 against aba. */ 3960 while (p1 < pend && (re_opcode_t) *p1 == no_op) 3961 p1++; 3962 3963 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 3964 { 3965 /* We have a new highest active register now. This will 3966 get reset at the start_memory we are about to get to, 3967 but we will have saved all the registers relevant to 3968 this repetition op, as described above. */ 3969 highest_active_reg = *(p1 + 1) + *(p1 + 2); 3970 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 3971 lowest_active_reg = *(p1 + 1); 3972 } 3973 3974 DEBUG_PRINT1 (":\n"); 3975 PUSH_FAILURE_POINT (p + mcnt, d, -2); 3976 break; 3977 3978 3979 /* A smart repeat ends with `maybe_pop_jump'. 3980 We change it to either `pop_failure_jump' or `jump'. */ 3981 case maybe_pop_jump: 3982 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3983 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 3984 { 3985 register unsigned char *p2 = p; 3986 3987 /* Compare the beginning of the repeat with what in the 3988 pattern follows its end. If we can establish that there 3989 is nothing that they would both match, i.e., that we 3990 would have to backtrack because of (as in, e.g., `a*a') 3991 then we can change to pop_failure_jump, because we'll 3992 never have to backtrack. 3993 3994 This is not true in the case of alternatives: in 3995 `(a|ab)*' we do need to backtrack to the `ab' alternative 3996 (e.g., if the string was `ab'). But instead of trying to 3997 detect that here, the alternative has put on a dummy 3998 failure point which is what we will end up popping. */ 3999 4000 /* Skip over open/close-group commands. */ 4001 while (p2 + 2 < pend 4002 && ((re_opcode_t) *p2 == stop_memory 4003 || (re_opcode_t) *p2 == start_memory)) 4004 p2 += 3; /* Skip over args, too. */ 4005 4006 /* If we're at the end of the pattern, we can change. */ 4007 if (p2 == pend) 4008 { 4009 /* Consider what happens when matching ":\(.*\)" 4010 against ":/". I don't really understand this code 4011 yet. */ 4012 p[-3] = (unsigned char) pop_failure_jump; 4013 DEBUG_PRINT1 4014 (" End of pattern: change to `pop_failure_jump'.\n"); 4015 } 4016 4017 else if ((re_opcode_t) *p2 == exactn 4018 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 4019 { 4020 register unsigned char c 4021 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 4022 p1 = p + mcnt; 4023 4024 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 4025 to the `maybe_finalize_jump' of this case. Examine what 4026 follows. */ 4027 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) 4028 { 4029 p[-3] = (unsigned char) pop_failure_jump; 4030 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 4031 c, p1[5]); 4032 } 4033 4034 else if ((re_opcode_t) p1[3] == charset 4035 || (re_opcode_t) p1[3] == charset_not) 4036 { 4037 int not = (re_opcode_t) p1[3] == charset_not; 4038 4039 if (c < (unsigned char) (p1[4] * BYTEWIDTH) 4040 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4041 not = !not; 4042 4043 /* `not' is equal to 1 if c would match, which means 4044 that we can't change to pop_failure_jump. */ 4045 if (!not) 4046 { 4047 p[-3] = (unsigned char) pop_failure_jump; 4048 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4049 } 4050 } 4051 } 4052 } 4053 p -= 2; /* Point at relative address again. */ 4054 if ((re_opcode_t) p[-1] != pop_failure_jump) 4055 { 4056 p[-1] = (unsigned char) jump; 4057 DEBUG_PRINT1 (" Match => jump.\n"); 4058 goto unconditional_jump; 4059 } 4060 /* Note fall through. */ 4061 4062 4063 /* The end of a simple repeat has a pop_failure_jump back to 4064 its matching on_failure_jump, where the latter will push a 4065 failure point. The pop_failure_jump takes off failure 4066 points put on by this pop_failure_jump's matching 4067 on_failure_jump; we got through the pattern to here from the 4068 matching on_failure_jump, so didn't fail. */ 4069 case pop_failure_jump: 4070 { 4071 /* We need to pass separate storage for the lowest and 4072 highest registers, even though we don't care about the 4073 actual values. Otherwise, we will restore only one 4074 register from the stack, since lowest will == highest in 4075 `pop_failure_point'. */ 4076 unsigned dummy_low_reg, dummy_high_reg; 4077 unsigned char *pdummy; 4078 const char *sdummy; 4079 4080 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 4081 POP_FAILURE_POINT (sdummy, pdummy, 4082 dummy_low_reg, dummy_high_reg, 4083 reg_dummy, reg_dummy, reg_info_dummy); 4084 } 4085 /* Note fall through. */ 4086 4087 4088 /* Unconditionally jump (without popping any failure points). */ 4089 case jump: 4090 unconditional_jump: 4091 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 4092 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 4093 p += mcnt; /* Do the jump. */ 4094 DEBUG_PRINT2 ("(to 0x%x).\n", p); 4095 break; 4096 4097 4098 /* We need this opcode so we can detect where alternatives end 4099 in `group_match_null_string_p' et al. */ 4100 case jump_past_alt: 4101 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 4102 goto unconditional_jump; 4103 4104 4105 /* Normally, the on_failure_jump pushes a failure point, which 4106 then gets popped at pop_failure_jump. We will end up at 4107 pop_failure_jump, also, and with a pattern of, say, `a+', we 4108 are skipping over the on_failure_jump, so we have to push 4109 something meaningless for pop_failure_jump to pop. */ 4110 case dummy_failure_jump: 4111 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 4112 /* It doesn't matter what we push for the string here. What 4113 the code at `fail' tests is the value for the pattern. */ 4114 PUSH_FAILURE_POINT (0, 0, -2); 4115 goto unconditional_jump; 4116 4117 4118 /* At the end of an alternative, we need to push a dummy failure 4119 point in case we are followed by a `pop_failure_jump', because 4120 we don't want the failure point for the alternative to be 4121 popped. For example, matching `(a|ab)*' against `aab' 4122 requires that we match the `ab' alternative. */ 4123 case push_dummy_failure: 4124 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 4125 /* See comments just above at `dummy_failure_jump' about the 4126 two zeroes. */ 4127 PUSH_FAILURE_POINT (0, 0, -2); 4128 break; 4129 4130 /* Have to succeed matching what follows at least n times. 4131 After that, handle like `on_failure_jump'. */ 4132 case succeed_n: 4133 EXTRACT_NUMBER (mcnt, p + 2); 4134 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 4135 4136 assert (mcnt >= 0); 4137 /* Originally, this is how many times we HAVE to succeed. */ 4138 if (mcnt > 0) 4139 { 4140 mcnt--; 4141 p += 2; 4142 STORE_NUMBER_AND_INCR (p, mcnt); 4143 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); 4144 } 4145 else if (mcnt == 0) 4146 { 4147 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); 4148 p[2] = (unsigned char) no_op; 4149 p[3] = (unsigned char) no_op; 4150 goto on_failure; 4151 } 4152 break; 4153 4154 case jump_n: 4155 EXTRACT_NUMBER (mcnt, p + 2); 4156 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 4157 4158 /* Originally, this is how many times we CAN jump. */ 4159 if (mcnt) 4160 { 4161 mcnt--; 4162 STORE_NUMBER (p + 2, mcnt); 4163 goto unconditional_jump; 4164 } 4165 /* If don't have to jump any more, skip over the rest of command. */ 4166 else 4167 p += 4; 4168 break; 4169 4170 case set_number_at: 4171 { 4172 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 4173 4174 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4175 p1 = p + mcnt; 4176 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4177 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 4178 STORE_NUMBER (p1, mcnt); 4179 break; 4180 } 4181 4182 case wordbound: 4183 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 4184 if (AT_WORD_BOUNDARY (d)) 4185 break; 4186 goto fail; 4187 4188 case notwordbound: 4189 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 4190 if (AT_WORD_BOUNDARY (d)) 4191 goto fail; 4192 break; 4193 4194 case wordbeg: 4195 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 4196 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 4197 break; 4198 goto fail; 4199 4200 case wordend: 4201 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 4202 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 4203 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 4204 break; 4205 goto fail; 4206 4207 #ifdef emacs 4208 #ifdef emacs19 4209 case before_dot: 4210 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 4211 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 4212 goto fail; 4213 break; 4214 4215 case at_dot: 4216 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4217 if (PTR_CHAR_POS ((unsigned char *) d) != point) 4218 goto fail; 4219 break; 4220 4221 case after_dot: 4222 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 4223 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 4224 goto fail; 4225 break; 4226 #else /* not emacs19 */ 4227 case at_dot: 4228 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4229 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) 4230 goto fail; 4231 break; 4232 #endif /* not emacs19 */ 4233 4234 case syntaxspec: 4235 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 4236 mcnt = *p++; 4237 goto matchsyntax; 4238 4239 case wordchar: 4240 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 4241 mcnt = (int) Sword; 4242 matchsyntax: 4243 PREFETCH (); 4244 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) 4245 goto fail; 4246 SET_REGS_MATCHED (); 4247 break; 4248 4249 case notsyntaxspec: 4250 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 4251 mcnt = *p++; 4252 goto matchnotsyntax; 4253 4254 case notwordchar: 4255 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 4256 mcnt = (int) Sword; 4257 matchnotsyntax: 4258 PREFETCH (); 4259 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) 4260 goto fail; 4261 SET_REGS_MATCHED (); 4262 break; 4263 4264 #else /* not emacs */ 4265 case wordchar: 4266 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 4267 PREFETCH (); 4268 if (!WORDCHAR_P (d)) 4269 goto fail; 4270 SET_REGS_MATCHED (); 4271 d++; 4272 break; 4273 4274 case notwordchar: 4275 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 4276 PREFETCH (); 4277 if (WORDCHAR_P (d)) 4278 goto fail; 4279 SET_REGS_MATCHED (); 4280 d++; 4281 break; 4282 #endif /* not emacs */ 4283 4284 default: 4285 abort (); 4286 } 4287 continue; /* Successfully executed one pattern command; keep going. */ 4288 4289 4290 /* We goto here if a matching operation fails. */ 4291 fail: 4292 if (!FAIL_STACK_EMPTY ()) 4293 { /* A restart point is known. Restore to that state. */ 4294 DEBUG_PRINT1 ("\nFAIL:\n"); 4295 POP_FAILURE_POINT (d, p, 4296 lowest_active_reg, highest_active_reg, 4297 regstart, regend, reg_info); 4298 4299 /* If this failure point is a dummy, try the next one. */ 4300 if (!p) 4301 goto fail; 4302 4303 /* If we failed to the end of the pattern, don't examine *p. */ 4304 assert (p <= pend); 4305 if (p < pend) 4306 { 4307 boolean is_a_jump_n = false; 4308 4309 /* If failed to a backwards jump that's part of a repetition 4310 loop, need to pop this failure point and use the next one. */ 4311 switch ((re_opcode_t) *p) 4312 { 4313 case jump_n: 4314 is_a_jump_n = true; 4315 case maybe_pop_jump: 4316 case pop_failure_jump: 4317 case jump: 4318 p1 = p + 1; 4319 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4320 p1 += mcnt; 4321 4322 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 4323 || (!is_a_jump_n 4324 && (re_opcode_t) *p1 == on_failure_jump)) 4325 goto fail; 4326 break; 4327 default: 4328 /* do nothing */ ; 4329 } 4330 } 4331 4332 if (d >= string1 && d <= end1) 4333 dend = end_match_1; 4334 } 4335 else 4336 break; /* Matching at this starting point really fails. */ 4337 } /* for (;;) */ 4338 4339 if (best_regs_set) 4340 goto restore_best_regs; 4341 4342 FREE_VARIABLES (); 4343 4344 return -1; /* Failure to match. */ 4345 } /* re_match_2 */ 4346 4347 /* Subroutine definitions for re_match_2. */ 4348 4349 4350 /* We are passed P pointing to a register number after a start_memory. 4351 4352 Return true if the pattern up to the corresponding stop_memory can 4353 match the empty string, and false otherwise. 4354 4355 If we find the matching stop_memory, sets P to point to one past its number. 4356 Otherwise, sets P to an undefined byte less than or equal to END. 4357 4358 We don't handle duplicates properly (yet). */ 4359 4360 static boolean 4361 group_match_null_string_p (p, end, reg_info) 4362 unsigned char **p, *end; 4363 register_info_type *reg_info; 4364 { 4365 int mcnt; 4366 /* Point to after the args to the start_memory. */ 4367 unsigned char *p1 = *p + 2; 4368 4369 while (p1 < end) 4370 { 4371 /* Skip over opcodes that can match nothing, and return true or 4372 false, as appropriate, when we get to one that can't, or to the 4373 matching stop_memory. */ 4374 4375 switch ((re_opcode_t) *p1) 4376 { 4377 /* Could be either a loop or a series of alternatives. */ 4378 case on_failure_jump: 4379 p1++; 4380 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4381 4382 /* If the next operation is not a jump backwards in the 4383 pattern. */ 4384 4385 if (mcnt >= 0) 4386 { 4387 /* Go through the on_failure_jumps of the alternatives, 4388 seeing if any of the alternatives cannot match nothing. 4389 The last alternative starts with only a jump, 4390 whereas the rest start with on_failure_jump and end 4391 with a jump, e.g., here is the pattern for `a|b|c': 4392 4393 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 4394 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 4395 /exactn/1/c 4396 4397 So, we have to first go through the first (n-1) 4398 alternatives and then deal with the last one separately. */ 4399 4400 4401 /* Deal with the first (n-1) alternatives, which start 4402 with an on_failure_jump (see above) that jumps to right 4403 past a jump_past_alt. */ 4404 4405 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) 4406 { 4407 /* `mcnt' holds how many bytes long the alternative 4408 is, including the ending `jump_past_alt' and 4409 its number. */ 4410 4411 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, 4412 reg_info)) 4413 return false; 4414 4415 /* Move to right after this alternative, including the 4416 jump_past_alt. */ 4417 p1 += mcnt; 4418 4419 /* Break if it's the beginning of an n-th alternative 4420 that doesn't begin with an on_failure_jump. */ 4421 if ((re_opcode_t) *p1 != on_failure_jump) 4422 break; 4423 4424 /* Still have to check that it's not an n-th 4425 alternative that starts with an on_failure_jump. */ 4426 p1++; 4427 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4428 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) 4429 { 4430 /* Get to the beginning of the n-th alternative. */ 4431 p1 -= 3; 4432 break; 4433 } 4434 } 4435 4436 /* Deal with the last alternative: go back and get number 4437 of the `jump_past_alt' just before it. `mcnt' contains 4438 the length of the alternative. */ 4439 EXTRACT_NUMBER (mcnt, p1 - 2); 4440 4441 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 4442 return false; 4443 4444 p1 += mcnt; /* Get past the n-th alternative. */ 4445 } /* if mcnt > 0 */ 4446 break; 4447 4448 4449 case stop_memory: 4450 assert (p1[1] == **p); 4451 *p = p1 + 2; 4452 return true; 4453 4454 4455 default: 4456 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4457 return false; 4458 } 4459 } /* while p1 < end */ 4460 4461 return false; 4462 } /* group_match_null_string_p */ 4463 4464 4465 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: 4466 It expects P to be the first byte of a single alternative and END one 4467 byte past the last. The alternative can contain groups. */ 4468 4469 static boolean 4470 alt_match_null_string_p (p, end, reg_info) 4471 unsigned char *p, *end; 4472 register_info_type *reg_info; 4473 { 4474 int mcnt; 4475 unsigned char *p1 = p; 4476 4477 while (p1 < end) 4478 { 4479 /* Skip over opcodes that can match nothing, and break when we get 4480 to one that can't. */ 4481 4482 switch ((re_opcode_t) *p1) 4483 { 4484 /* It's a loop. */ 4485 case on_failure_jump: 4486 p1++; 4487 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4488 p1 += mcnt; 4489 break; 4490 4491 default: 4492 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4493 return false; 4494 } 4495 } /* while p1 < end */ 4496 4497 return true; 4498 } /* alt_match_null_string_p */ 4499 4500 4501 /* Deals with the ops common to group_match_null_string_p and 4502 alt_match_null_string_p. 4503 4504 Sets P to one after the op and its arguments, if any. */ 4505 4506 static boolean 4507 common_op_match_null_string_p (p, end, reg_info) 4508 unsigned char **p, *end; 4509 register_info_type *reg_info; 4510 { 4511 int mcnt; 4512 boolean ret; 4513 int reg_no; 4514 unsigned char *p1 = *p; 4515 4516 switch ((re_opcode_t) *p1++) 4517 { 4518 case no_op: 4519 case begline: 4520 case endline: 4521 case begbuf: 4522 case endbuf: 4523 case wordbeg: 4524 case wordend: 4525 case wordbound: 4526 case notwordbound: 4527 #ifdef emacs 4528 case before_dot: 4529 case at_dot: 4530 case after_dot: 4531 #endif 4532 break; 4533 4534 case start_memory: 4535 reg_no = *p1; 4536 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 4537 ret = group_match_null_string_p (&p1, end, reg_info); 4538 4539 /* Have to set this here in case we're checking a group which 4540 contains a group and a back reference to it. */ 4541 4542 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 4543 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 4544 4545 if (!ret) 4546 return false; 4547 break; 4548 4549 /* If this is an optimized succeed_n for zero times, make the jump. */ 4550 case jump: 4551 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4552 if (mcnt >= 0) 4553 p1 += mcnt; 4554 else 4555 return false; 4556 break; 4557 4558 case succeed_n: 4559 /* Get to the number of times to succeed. */ 4560 p1 += 2; 4561 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4562 4563 if (mcnt == 0) 4564 { 4565 p1 -= 4; 4566 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4567 p1 += mcnt; 4568 } 4569 else 4570 return false; 4571 break; 4572 4573 case duplicate: 4574 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 4575 return false; 4576 break; 4577 4578 case set_number_at: 4579 p1 += 4; 4580 4581 default: 4582 /* All other opcodes mean we cannot match the empty string. */ 4583 return false; 4584 } 4585 4586 *p = p1; 4587 return true; 4588 } /* common_op_match_null_string_p */ 4589 4590 4591 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 4592 bytes; nonzero otherwise. */ 4593 4594 static int 4595 bcmp_translate (s1, s2, len, translate) 4596 unsigned char *s1, *s2; 4597 register int len; 4598 char *translate; 4599 { 4600 register unsigned char *p1 = s1, *p2 = s2; 4601 while (len) 4602 { 4603 if (translate[*p1++] != translate[*p2++]) return 1; 4604 len--; 4605 } 4606 return 0; 4607 } 4608 4609 /* Entry points for GNU code. */ 4610 4611 /* re_compile_pattern is the GNU regular expression compiler: it 4612 compiles PATTERN (of length SIZE) and puts the result in BUFP. 4613 Returns 0 if the pattern was valid, otherwise an error string. 4614 4615 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 4616 are set in BUFP on entry. 4617 4618 We call regex_compile to do the actual compilation. */ 4619 4620 const char * 4621 re_compile_pattern (pattern, length, bufp) 4622 const char *pattern; 4623 int length; 4624 struct re_pattern_buffer *bufp; 4625 { 4626 reg_errcode_t ret; 4627 4628 /* GNU code is written to assume at least RE_NREGS registers will be set 4629 (and at least one extra will be -1). */ 4630 bufp->regs_allocated = REGS_UNALLOCATED; 4631 4632 /* And GNU code determines whether or not to get register information 4633 by passing null for the REGS argument to re_match, etc., not by 4634 setting no_sub. */ 4635 bufp->no_sub = 0; 4636 4637 /* Match anchors at newline. */ 4638 bufp->newline_anchor = 1; 4639 4640 ret = regex_compile (pattern, length, re_syntax_options, bufp); 4641 4642 return re_error_msg[(int) ret]; 4643 } 4644 4645 /* Entry points compatible with 4.2 BSD regex library. We don't define 4646 them if this is an Emacs or POSIX compilation. */ 4647 4648 #if !defined (emacs) 4649 4650 /* BSD has one and only one pattern buffer. */ 4651 static struct re_pattern_buffer re_comp_buf; 4652 4653 char * 4654 re_comp (s) 4655 const char *s; 4656 { 4657 reg_errcode_t ret; 4658 4659 if (!s) 4660 { 4661 if (!re_comp_buf.buffer) 4662 return "No previous regular expression"; 4663 return 0; 4664 } 4665 4666 if (!re_comp_buf.buffer) 4667 { 4668 re_comp_buf.buffer = (unsigned char *) malloc (200); 4669 if (re_comp_buf.buffer == NULL) 4670 return "Memory exhausted"; 4671 re_comp_buf.allocated = 200; 4672 4673 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 4674 if (re_comp_buf.fastmap == NULL) 4675 return "Memory exhausted"; 4676 } 4677 4678 /* Since `re_exec' always passes NULL for the `regs' argument, we 4679 don't need to initialize the pattern buffer fields which affect it. */ 4680 4681 /* Match anchors at newlines. */ 4682 re_comp_buf.newline_anchor = 1; 4683 4684 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 4685 4686 /* Yes, we're discarding `const' here. */ 4687 return (char *) re_error_msg[(int) ret]; 4688 } 4689 4690 4691 int 4692 re_exec (s) 4693 const char *s; 4694 { 4695 const int len = strlen (s); 4696 return 4697 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 4698 } 4699 #endif /* not emacs and not _POSIX_SOURCE */ 4700 4701 /* POSIX.2 functions. Don't define these for Emacs. */ 4702 4703 #ifndef emacs 4704 4705 /* regcomp takes a regular expression as a string and compiles it. 4706 4707 PREG is a regex_t *. We do not expect any fields to be initialized, 4708 since POSIX says we shouldn't. Thus, we set 4709 4710 `buffer' to the compiled pattern; 4711 `used' to the length of the compiled pattern; 4712 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 4713 REG_EXTENDED bit in CFLAGS is set; otherwise, to 4714 RE_SYNTAX_POSIX_BASIC; 4715 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 4716 `fastmap' and `fastmap_accurate' to zero; 4717 `re_nsub' to the number of subexpressions in PATTERN. 4718 4719 PATTERN is the address of the pattern string. 4720 4721 CFLAGS is a series of bits which affect compilation. 4722 4723 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 4724 use POSIX basic syntax. 4725 4726 If REG_NEWLINE is set, then . and [^...] don't match newline. 4727 Also, regexec will try a match beginning after every newline. 4728 4729 If REG_ICASE is set, then we considers upper- and lowercase 4730 versions of letters to be equivalent when matching. 4731 4732 If REG_NOSUB is set, then when PREG is passed to regexec, that 4733 routine will report only success or failure, and nothing about the 4734 registers. 4735 4736 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 4737 the return codes and their meanings.) */ 4738 4739 int 4740 regcomp (preg, pattern, cflags) 4741 regex_t *preg; 4742 const char *pattern; 4743 int cflags; 4744 { 4745 reg_errcode_t ret; 4746 unsigned syntax 4747 = (cflags & REG_EXTENDED) ? 4748 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 4749 4750 /* regex_compile will allocate the space for the compiled pattern. */ 4751 preg->buffer = 0; 4752 preg->allocated = 0; 4753 4754 /* Don't bother to use a fastmap when searching. This simplifies the 4755 REG_NEWLINE case: if we used a fastmap, we'd have to put all the 4756 characters after newlines into the fastmap. This way, we just try 4757 every character. */ 4758 preg->fastmap = 0; 4759 4760 if (cflags & REG_ICASE) 4761 { 4762 unsigned i; 4763 4764 preg->translate = (char *) malloc (CHAR_SET_SIZE); 4765 if (preg->translate == NULL) 4766 return (int) REG_ESPACE; 4767 4768 /* Map uppercase characters to corresponding lowercase ones. */ 4769 for (i = 0; i < CHAR_SET_SIZE; i++) 4770 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; 4771 } 4772 else 4773 preg->translate = NULL; 4774 4775 /* If REG_NEWLINE is set, newlines are treated differently. */ 4776 if (cflags & REG_NEWLINE) 4777 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 4778 syntax &= ~RE_DOT_NEWLINE; 4779 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 4780 /* It also changes the matching behavior. */ 4781 preg->newline_anchor = 1; 4782 } 4783 else 4784 preg->newline_anchor = 0; 4785 4786 preg->no_sub = !!(cflags & REG_NOSUB); 4787 4788 /* POSIX says a null character in the pattern terminates it, so we 4789 can use strlen here in compiling the pattern. */ 4790 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 4791 4792 /* POSIX doesn't distinguish between an unmatched open-group and an 4793 unmatched close-group: both are REG_EPAREN. */ 4794 if (ret == REG_ERPAREN) ret = REG_EPAREN; 4795 4796 return (int) ret; 4797 } 4798 4799 4800 /* regexec searches for a given pattern, specified by PREG, in the 4801 string STRING. 4802 4803 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 4804 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 4805 least NMATCH elements, and we set them to the offsets of the 4806 corresponding matched substrings. 4807 4808 EFLAGS specifies `execution flags' which affect matching: if 4809 REG_NOTBOL is set, then ^ does not match at the beginning of the 4810 string; if REG_NOTEOL is set, then $ does not match at the end. 4811 4812 We return 0 if we find a match and REG_NOMATCH if not. */ 4813 4814 int 4815 regexec (preg, string, nmatch, pmatch, eflags) 4816 const regex_t *preg; 4817 const char *string; 4818 size_t nmatch; 4819 regmatch_t pmatch[]; 4820 int eflags; 4821 { 4822 int ret; 4823 struct re_registers regs; 4824 regex_t private_preg; 4825 int len = strlen (string); 4826 boolean want_reg_info = !preg->no_sub && nmatch > 0; 4827 4828 private_preg = *preg; 4829 4830 private_preg.not_bol = !!(eflags & REG_NOTBOL); 4831 private_preg.not_eol = !!(eflags & REG_NOTEOL); 4832 4833 /* The user has told us exactly how many registers to return 4834 information about, via `nmatch'. We have to pass that on to the 4835 matching routines. */ 4836 private_preg.regs_allocated = REGS_FIXED; 4837 4838 if (want_reg_info) 4839 { 4840 regs.num_regs = nmatch; 4841 regs.start = TALLOC (nmatch, regoff_t); 4842 regs.end = TALLOC (nmatch, regoff_t); 4843 if (regs.start == NULL || regs.end == NULL) 4844 return (int) REG_NOMATCH; 4845 } 4846 4847 /* Perform the searching operation. */ 4848 ret = re_search (&private_preg, string, len, 4849 /* start: */ 0, /* range: */ len, 4850 want_reg_info ? ®s : (struct re_registers *) 0); 4851 4852 /* Copy the register information to the POSIX structure. */ 4853 if (want_reg_info) 4854 { 4855 if (ret >= 0) 4856 { 4857 unsigned r; 4858 4859 for (r = 0; r < nmatch; r++) 4860 { 4861 pmatch[r].rm_so = regs.start[r]; 4862 pmatch[r].rm_eo = regs.end[r]; 4863 } 4864 } 4865 4866 /* If we needed the temporary register info, free the space now. */ 4867 free (regs.start); 4868 free (regs.end); 4869 } 4870 4871 /* We want zero return to mean success, unlike `re_search'. */ 4872 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 4873 } 4874 4875 4876 /* Returns a message corresponding to an error code, ERRCODE, returned 4877 from either regcomp or regexec. We don't use PREG here. */ 4878 4879 size_t 4880 regerror (errcode, preg, errbuf, errbuf_size) 4881 int errcode; 4882 const regex_t *preg; 4883 char *errbuf; 4884 size_t errbuf_size; 4885 { 4886 const char *msg; 4887 size_t msg_size; 4888 4889 if (errcode < 0 4890 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) 4891 /* Only error codes returned by the rest of the code should be passed 4892 to this routine. If we are given anything else, or if other regex 4893 code generates an invalid error code, then the program has a bug. 4894 Dump core so we can fix it. */ 4895 abort (); 4896 4897 msg = re_error_msg[errcode]; 4898 4899 /* POSIX doesn't require that we do anything in this case, but why 4900 not be nice. */ 4901 if (! msg) 4902 msg = "Success"; 4903 4904 msg_size = strlen (msg) + 1; /* Includes the null. */ 4905 4906 if (errbuf_size != 0) 4907 { 4908 if (msg_size > errbuf_size) 4909 { 4910 strncpy (errbuf, msg, errbuf_size - 1); 4911 errbuf[errbuf_size - 1] = 0; 4912 } 4913 else 4914 strcpy (errbuf, msg); 4915 } 4916 4917 return msg_size; 4918 } 4919 4920 4921 /* Free dynamically allocated space used by PREG. */ 4922 4923 void 4924 regfree (preg) 4925 regex_t *preg; 4926 { 4927 if (preg->buffer != NULL) 4928 free (preg->buffer); 4929 preg->buffer = NULL; 4930 4931 preg->allocated = 0; 4932 preg->used = 0; 4933 4934 if (preg->fastmap != NULL) 4935 free (preg->fastmap); 4936 preg->fastmap = NULL; 4937 preg->fastmap_accurate = 0; 4938 4939 if (preg->translate != NULL) 4940 free (preg->translate); 4941 preg->translate = NULL; 4942 } 4943 4944 #endif /* not emacs */ 4945 4946 /* 4947 Local variables: 4948 make-backup-files: t 4949 version-control: t 4950 trim-versions-without-asking: nil 4951 End: 4952 */ 4953