1 /* regcomp.c 2 */ 3 4 /* 5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee 6 */ 7 8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 9 * confused with the original package (see point 3 below). Thanks, Henry! 10 */ 11 12 /* Additional note: this code is very heavily munged from Henry's version 13 * in places. In some spots I've traded clarity for efficiency, so don't 14 * blame Henry for some of the lack of readability. 15 */ 16 17 /* The names of the functions have been changed from regcomp and 18 * regexec to pregcomp and pregexec in order to avoid conflicts 19 * with the POSIX routines of the same names. 20 */ 21 22 #ifdef PERL_EXT_RE_BUILD 23 /* need to replace pregcomp et al, so enable that */ 24 # ifndef PERL_IN_XSUB_RE 25 # define PERL_IN_XSUB_RE 26 # endif 27 /* need access to debugger hooks */ 28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING) 29 # define DEBUGGING 30 # endif 31 #endif 32 33 #ifdef PERL_IN_XSUB_RE 34 /* We *really* need to overwrite these symbols: */ 35 # define Perl_pregcomp my_regcomp 36 # define Perl_regdump my_regdump 37 # define Perl_regprop my_regprop 38 # define Perl_pregfree my_regfree 39 # define Perl_re_intuit_string my_re_intuit_string 40 /* *These* symbols are masked to allow static link. */ 41 # define Perl_regnext my_regnext 42 # define Perl_save_re_context my_save_re_context 43 # define Perl_reginitcolors my_reginitcolors 44 45 # define PERL_NO_GET_CONTEXT 46 #endif 47 48 /*SUPPRESS 112*/ 49 /* 50 * pregcomp and pregexec -- regsub and regerror are not used in perl 51 * 52 * Copyright (c) 1986 by University of Toronto. 53 * Written by Henry Spencer. Not derived from licensed software. 54 * 55 * Permission is granted to anyone to use this software for any 56 * purpose on any computer system, and to redistribute it freely, 57 * subject to the following restrictions: 58 * 59 * 1. The author is not responsible for the consequences of use of 60 * this software, no matter how awful, even if they arise 61 * from defects in it. 62 * 63 * 2. The origin of this software must not be misrepresented, either 64 * by explicit claim or by omission. 65 * 66 * 3. Altered versions must be plainly marked as such, and must not 67 * be misrepresented as being the original software. 68 * 69 * 70 **** Alterations to Henry's code are... 71 **** 72 **** Copyright (c) 1991-2001, Larry Wall 73 **** 74 **** You may distribute under the terms of either the GNU General Public 75 **** License or the Artistic License, as specified in the README file. 76 77 * 78 * Beware that some of this code is subtly aware of the way operator 79 * precedence is structured in regular expressions. Serious changes in 80 * regular-expression syntax might require a total rethink. 81 */ 82 #include "EXTERN.h" 83 #define PERL_IN_REGCOMP_C 84 #include "perl.h" 85 86 #ifdef PERL_IN_XSUB_RE 87 # if defined(PERL_CAPI) || defined(PERL_OBJECT) 88 # include "XSUB.h" 89 # endif 90 #else 91 # include "INTERN.h" 92 #endif 93 94 #define REG_COMP_C 95 #include "regcomp.h" 96 97 #ifdef op 98 #undef op 99 #endif /* op */ 100 101 #ifdef MSDOS 102 # if defined(BUGGY_MSC6) 103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ 104 # pragma optimize("a",off) 105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ 106 # pragma optimize("w",on ) 107 # endif /* BUGGY_MSC6 */ 108 #endif /* MSDOS */ 109 110 #ifndef STATIC 111 #define STATIC static 112 #endif 113 114 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 115 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 116 ((*s) == '{' && regcurly(s))) 117 118 #ifdef SPSTART 119 #undef SPSTART /* dratted cpp namespace... */ 120 #endif 121 /* 122 * Flags to be passed up and down. 123 */ 124 #define WORST 0 /* Worst case. */ 125 #define HASWIDTH 0x1 /* Known to match non-null strings. */ 126 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ 127 #define SPSTART 0x4 /* Starts with * or +. */ 128 #define TRYAGAIN 0x8 /* Weeded out a declaration. */ 129 130 /* Length of a variant. */ 131 132 typedef struct scan_data_t { 133 I32 len_min; 134 I32 len_delta; 135 I32 pos_min; 136 I32 pos_delta; 137 SV *last_found; 138 I32 last_end; /* min value, <0 unless valid. */ 139 I32 last_start_min; 140 I32 last_start_max; 141 SV **longest; /* Either &l_fixed, or &l_float. */ 142 SV *longest_fixed; 143 I32 offset_fixed; 144 SV *longest_float; 145 I32 offset_float_min; 146 I32 offset_float_max; 147 I32 flags; 148 I32 whilem_c; 149 I32 *last_closep; 150 struct regnode_charclass_class *start_class; 151 } scan_data_t; 152 153 /* 154 * Forward declarations for pregcomp()'s friends. 155 */ 156 157 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158 0, 0, 0, 0, 0, 0}; 159 160 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 161 #define SF_BEFORE_SEOL 0x1 162 #define SF_BEFORE_MEOL 0x2 163 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 164 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 165 166 #ifdef NO_UNARY_PLUS 167 # define SF_FIX_SHIFT_EOL (0+2) 168 # define SF_FL_SHIFT_EOL (0+4) 169 #else 170 # define SF_FIX_SHIFT_EOL (+2) 171 # define SF_FL_SHIFT_EOL (+4) 172 #endif 173 174 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 175 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 176 177 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 178 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 179 #define SF_IS_INF 0x40 180 #define SF_HAS_PAR 0x80 181 #define SF_IN_PAR 0x100 182 #define SF_HAS_EVAL 0x200 183 #define SCF_DO_SUBSTR 0x400 184 #define SCF_DO_STCLASS_AND 0x0800 185 #define SCF_DO_STCLASS_OR 0x1000 186 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 187 #define SCF_WHILEM_VISITED_POS 0x2000 188 189 #define RF_utf8 8 190 #define UTF (PL_reg_flags & RF_utf8) 191 #define LOC (PL_regflags & PMf_LOCALE) 192 #define FOLD (PL_regflags & PMf_FOLD) 193 194 #define OOB_CHAR8 1234 195 #define OOB_UTF8 123456 196 #define OOB_NAMEDCLASS -1 197 198 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 199 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 200 201 202 /* length of regex to show in messages that don't mark a position within */ 203 #define RegexLengthToShowInErrorMessages 127 204 205 /* 206 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 207 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 208 * op/pragma/warn/regcomp. 209 */ 210 #define MARKER1 "HERE" /* marker as it appears in the description */ 211 #define MARKER2 " << HERE " /* marker as it appears within the regex */ 212 213 #define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/" 214 215 /* 216 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 217 * arg. Show regex, up to a maximum length. If it's too long, chop and add 218 * "...". 219 */ 220 #define FAIL(msg) \ 221 STMT_START { \ 222 char *ellipses = ""; \ 223 unsigned len = strlen(PL_regprecomp); \ 224 \ 225 if (!SIZE_ONLY) \ 226 SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ 227 \ 228 if (len > RegexLengthToShowInErrorMessages) { \ 229 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 230 len = RegexLengthToShowInErrorMessages - 10; \ 231 ellipses = "..."; \ 232 } \ 233 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ 234 msg, (int)len, PL_regprecomp, ellipses); \ 235 } STMT_END 236 237 /* 238 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 239 * args. Show regex, up to a maximum length. If it's too long, chop and add 240 * "...". 241 */ 242 #define FAIL2(pat,msg) \ 243 STMT_START { \ 244 char *ellipses = ""; \ 245 unsigned len = strlen(PL_regprecomp); \ 246 \ 247 if (!SIZE_ONLY) \ 248 SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ 249 \ 250 if (len > RegexLengthToShowInErrorMessages) { \ 251 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 252 len = RegexLengthToShowInErrorMessages - 10; \ 253 ellipses = "..."; \ 254 } \ 255 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ 256 msg, (int)len, PL_regprecomp, ellipses); \ 257 } STMT_END 258 259 260 /* 261 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 262 */ 263 #define Simple_vFAIL(m) \ 264 STMT_START { \ 265 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ 266 \ 267 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 268 m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 269 } STMT_END 270 271 /* 272 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 273 */ 274 #define vFAIL(m) \ 275 STMT_START { \ 276 if (!SIZE_ONLY) \ 277 SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ 278 Simple_vFAIL(m); \ 279 } STMT_END 280 281 /* 282 * Like Simple_vFAIL(), but accepts two arguments. 283 */ 284 #define Simple_vFAIL2(m,a1) \ 285 STMT_START { \ 286 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ 287 \ 288 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ 289 (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 290 } STMT_END 291 292 /* 293 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 294 */ 295 #define vFAIL2(m,a1) \ 296 STMT_START { \ 297 if (!SIZE_ONLY) \ 298 SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ 299 Simple_vFAIL2(m, a1); \ 300 } STMT_END 301 302 303 /* 304 * Like Simple_vFAIL(), but accepts three arguments. 305 */ 306 #define Simple_vFAIL3(m, a1, a2) \ 307 STMT_START { \ 308 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ 309 \ 310 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ 311 (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 312 } STMT_END 313 314 /* 315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 316 */ 317 #define vFAIL3(m,a1,a2) \ 318 STMT_START { \ 319 if (!SIZE_ONLY) \ 320 SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ 321 Simple_vFAIL3(m, a1, a2); \ 322 } STMT_END 323 324 /* 325 * Like Simple_vFAIL(), but accepts four arguments. 326 */ 327 #define Simple_vFAIL4(m, a1, a2, a3) \ 328 STMT_START { \ 329 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ 330 \ 331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ 332 (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 333 } STMT_END 334 335 /* 336 * Like Simple_vFAIL(), but accepts five arguments. 337 */ 338 #define Simple_vFAIL5(m, a1, a2, a3, a4) \ 339 STMT_START { \ 340 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ 341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ 342 (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 343 } STMT_END 344 345 346 #define vWARN(loc,m) \ 347 STMT_START { \ 348 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ 349 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ 350 m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 351 } STMT_END \ 352 353 354 #define vWARN2(loc, m, a1) \ 355 STMT_START { \ 356 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ 357 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ 358 a1, \ 359 (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 360 } STMT_END 361 362 #define vWARN3(loc, m, a1, a2) \ 363 STMT_START { \ 364 unsigned offset = strlen(PL_regprecomp) - (PL_regxend - (loc)); \ 365 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ 366 a1, a2, \ 367 (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 368 } STMT_END 369 370 #define vWARN4(loc, m, a1, a2, a3) \ 371 STMT_START { \ 372 unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ 373 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ 374 a1, a2, a3, \ 375 (int)offset, PL_regprecomp, PL_regprecomp + offset); \ 376 } STMT_END 377 378 379 380 /* Allow for side effects in s */ 381 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END 382 383 static void clear_re(pTHXo_ void *r); 384 385 /* Mark that we cannot extend a found fixed substring at this point. 386 Updata the longest found anchored substring and the longest found 387 floating substrings if needed. */ 388 389 STATIC void 390 S_scan_commit(pTHX_ scan_data_t *data) 391 { 392 STRLEN l = CHR_SVLEN(data->last_found); 393 STRLEN old_l = CHR_SVLEN(*data->longest); 394 395 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 396 sv_setsv(*data->longest, data->last_found); 397 if (*data->longest == data->longest_fixed) { 398 data->offset_fixed = l ? data->last_start_min : data->pos_min; 399 if (data->flags & SF_BEFORE_EOL) 400 data->flags 401 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 402 else 403 data->flags &= ~SF_FIX_BEFORE_EOL; 404 } 405 else { 406 data->offset_float_min = l ? data->last_start_min : data->pos_min; 407 data->offset_float_max = (l 408 ? data->last_start_max 409 : data->pos_min + data->pos_delta); 410 if (data->flags & SF_BEFORE_EOL) 411 data->flags 412 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 413 else 414 data->flags &= ~SF_FL_BEFORE_EOL; 415 } 416 } 417 SvCUR_set(data->last_found, 0); 418 data->last_end = -1; 419 data->flags &= ~SF_BEFORE_EOL; 420 } 421 422 /* Can match anything (initialization) */ 423 STATIC void 424 S_cl_anything(pTHX_ struct regnode_charclass_class *cl) 425 { 426 int value; 427 428 ANYOF_CLASS_ZERO(cl); 429 for (value = 0; value < 256; ++value) 430 ANYOF_BITMAP_SET(cl, value); 431 cl->flags = ANYOF_EOS; 432 if (LOC) 433 cl->flags |= ANYOF_LOCALE; 434 } 435 436 /* Can match anything (initialization) */ 437 STATIC int 438 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) 439 { 440 int value; 441 442 for (value = 0; value <= ANYOF_MAX; value += 2) 443 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) 444 return 1; 445 for (value = 0; value < 256; ++value) 446 if (!ANYOF_BITMAP_TEST(cl, value)) 447 return 0; 448 return 1; 449 } 450 451 /* Can match anything (initialization) */ 452 STATIC void 453 S_cl_init(pTHX_ struct regnode_charclass_class *cl) 454 { 455 Zero(cl, 1, struct regnode_charclass_class); 456 cl->type = ANYOF; 457 cl_anything(cl); 458 } 459 460 STATIC void 461 S_cl_init_zero(pTHX_ struct regnode_charclass_class *cl) 462 { 463 Zero(cl, 1, struct regnode_charclass_class); 464 cl->type = ANYOF; 465 cl_anything(cl); 466 if (LOC) 467 cl->flags |= ANYOF_LOCALE; 468 } 469 470 /* 'And' a given class with another one. Can create false positives */ 471 /* We assume that cl is not inverted */ 472 STATIC void 473 S_cl_and(pTHX_ struct regnode_charclass_class *cl, 474 struct regnode_charclass_class *and_with) 475 { 476 if (!(and_with->flags & ANYOF_CLASS) 477 && !(cl->flags & ANYOF_CLASS) 478 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 479 && !(and_with->flags & ANYOF_FOLD) 480 && !(cl->flags & ANYOF_FOLD)) { 481 int i; 482 483 if (and_with->flags & ANYOF_INVERT) 484 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 485 cl->bitmap[i] &= ~and_with->bitmap[i]; 486 else 487 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 488 cl->bitmap[i] &= and_with->bitmap[i]; 489 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ 490 if (!(and_with->flags & ANYOF_EOS)) 491 cl->flags &= ~ANYOF_EOS; 492 } 493 494 /* 'OR' a given class with another one. Can create false positives */ 495 /* We assume that cl is not inverted */ 496 STATIC void 497 S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with) 498 { 499 if (or_with->flags & ANYOF_INVERT) { 500 /* We do not use 501 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) 502 * <= (B1 | !B2) | (CL1 | !CL2) 503 * which is wasteful if CL2 is small, but we ignore CL2: 504 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 505 * XXXX Can we handle case-fold? Unclear: 506 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = 507 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) 508 */ 509 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 510 && !(or_with->flags & ANYOF_FOLD) 511 && !(cl->flags & ANYOF_FOLD) ) { 512 int i; 513 514 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 515 cl->bitmap[i] |= ~or_with->bitmap[i]; 516 } /* XXXX: logic is complicated otherwise */ 517 else { 518 cl_anything(cl); 519 } 520 } else { 521 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ 522 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 523 && (!(or_with->flags & ANYOF_FOLD) 524 || (cl->flags & ANYOF_FOLD)) ) { 525 int i; 526 527 /* OR char bitmap and class bitmap separately */ 528 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 529 cl->bitmap[i] |= or_with->bitmap[i]; 530 if (or_with->flags & ANYOF_CLASS) { 531 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) 532 cl->classflags[i] |= or_with->classflags[i]; 533 cl->flags |= ANYOF_CLASS; 534 } 535 } 536 else { /* XXXX: logic is complicated, leave it along for a moment. */ 537 cl_anything(cl); 538 } 539 } 540 if (or_with->flags & ANYOF_EOS) 541 cl->flags |= ANYOF_EOS; 542 } 543 544 /* REx optimizer. Converts nodes into quickier variants "in place". 545 Finds fixed substrings. */ 546 547 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set 548 to the position after last scanned or to NULL. */ 549 550 STATIC I32 551 S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) 552 /* scanp: Start here (read-write). */ 553 /* deltap: Write maxlen-minlen here. */ 554 /* last: Stop before this one. */ 555 { 556 I32 min = 0, pars = 0, code; 557 regnode *scan = *scanp, *next; 558 I32 delta = 0; 559 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 560 int is_inf_internal = 0; /* The studied chunk is infinite */ 561 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 562 scan_data_t data_fake; 563 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */ 564 565 while (scan && OP(scan) != END && scan < last) { 566 /* Peephole optimizer: */ 567 568 if (PL_regkind[(U8)OP(scan)] == EXACT) { 569 /* Merge several consecutive EXACTish nodes into one. */ 570 regnode *n = regnext(scan); 571 U32 stringok = 1; 572 #ifdef DEBUGGING 573 regnode *stop = scan; 574 #endif 575 576 next = scan + NODE_SZ_STR(scan); 577 /* Skip NOTHING, merge EXACT*. */ 578 while (n && 579 ( PL_regkind[(U8)OP(n)] == NOTHING || 580 (stringok && (OP(n) == OP(scan)))) 581 && NEXT_OFF(n) 582 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { 583 if (OP(n) == TAIL || n > next) 584 stringok = 0; 585 if (PL_regkind[(U8)OP(n)] == NOTHING) { 586 NEXT_OFF(scan) += NEXT_OFF(n); 587 next = n + NODE_STEP_REGNODE; 588 #ifdef DEBUGGING 589 if (stringok) 590 stop = n; 591 #endif 592 n = regnext(n); 593 } 594 else if (stringok) { 595 int oldl = STR_LEN(scan); 596 regnode *nnext = regnext(n); 597 598 if (oldl + STR_LEN(n) > U8_MAX) 599 break; 600 NEXT_OFF(scan) += NEXT_OFF(n); 601 STR_LEN(scan) += STR_LEN(n); 602 next = n + NODE_SZ_STR(n); 603 /* Now we can overwrite *n : */ 604 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 605 #ifdef DEBUGGING 606 stop = next - 1; 607 #endif 608 n = nnext; 609 } 610 } 611 #ifdef DEBUGGING 612 /* Allow dumping */ 613 n = scan + NODE_SZ_STR(scan); 614 while (n <= stop) { 615 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) { 616 OP(n) = OPTIMIZED; 617 NEXT_OFF(n) = 0; 618 } 619 n++; 620 } 621 #endif 622 } 623 /* Follow the next-chain of the current node and optimize 624 away all the NOTHINGs from it. */ 625 if (OP(scan) != CURLYX) { 626 int max = (reg_off_by_arg[OP(scan)] 627 ? I32_MAX 628 /* I32 may be smaller than U16 on CRAYs! */ 629 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 630 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 631 int noff; 632 regnode *n = scan; 633 634 /* Skip NOTHING and LONGJMP. */ 635 while ((n = regnext(n)) 636 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 637 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 638 && off + noff < max) 639 off += noff; 640 if (reg_off_by_arg[OP(scan)]) 641 ARG(scan) = off; 642 else 643 NEXT_OFF(scan) = off; 644 } 645 /* The principal pseudo-switch. Cannot be a switch, since we 646 look into several different things. */ 647 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 648 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) { 649 next = regnext(scan); 650 code = OP(scan); 651 652 if (OP(next) == code || code == IFTHEN || code == SUSPEND) { 653 I32 max1 = 0, min1 = I32_MAX, num = 0; 654 struct regnode_charclass_class accum; 655 656 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ 657 scan_commit(data); /* Cannot merge strings after this. */ 658 if (flags & SCF_DO_STCLASS) 659 cl_init_zero(&accum); 660 while (OP(scan) == code) { 661 I32 deltanext, minnext, f = 0, fake = 0; 662 struct regnode_charclass_class this_class; 663 664 num++; 665 data_fake.flags = 0; 666 if (data) { 667 data_fake.whilem_c = data->whilem_c; 668 data_fake.last_closep = data->last_closep; 669 } 670 else 671 data_fake.last_closep = &fake; 672 next = regnext(scan); 673 scan = NEXTOPER(scan); 674 if (code != BRANCH) 675 scan = NEXTOPER(scan); 676 if (flags & SCF_DO_STCLASS) { 677 cl_init(&this_class); 678 data_fake.start_class = &this_class; 679 f = SCF_DO_STCLASS_AND; 680 } 681 if (flags & SCF_WHILEM_VISITED_POS) 682 f |= SCF_WHILEM_VISITED_POS; 683 /* we suppose the run is continuous, last=next...*/ 684 minnext = study_chunk(&scan, &deltanext, next, 685 &data_fake, f); 686 if (min1 > minnext) 687 min1 = minnext; 688 if (max1 < minnext + deltanext) 689 max1 = minnext + deltanext; 690 if (deltanext == I32_MAX) 691 is_inf = is_inf_internal = 1; 692 scan = next; 693 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 694 pars++; 695 if (data && (data_fake.flags & SF_HAS_EVAL)) 696 data->flags |= SF_HAS_EVAL; 697 if (data) 698 data->whilem_c = data_fake.whilem_c; 699 if (flags & SCF_DO_STCLASS) 700 cl_or(&accum, &this_class); 701 if (code == SUSPEND) 702 break; 703 } 704 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 705 min1 = 0; 706 if (flags & SCF_DO_SUBSTR) { 707 data->pos_min += min1; 708 data->pos_delta += max1 - min1; 709 if (max1 != min1 || is_inf) 710 data->longest = &(data->longest_float); 711 } 712 min += min1; 713 delta += max1 - min1; 714 if (flags & SCF_DO_STCLASS_OR) { 715 cl_or(data->start_class, &accum); 716 if (min1) { 717 cl_and(data->start_class, &and_with); 718 flags &= ~SCF_DO_STCLASS; 719 } 720 } 721 else if (flags & SCF_DO_STCLASS_AND) { 722 if (min1) { 723 cl_and(data->start_class, &accum); 724 flags &= ~SCF_DO_STCLASS; 725 } 726 else { 727 /* Switch to OR mode: cache the old value of 728 * data->start_class */ 729 StructCopy(data->start_class, &and_with, 730 struct regnode_charclass_class); 731 flags &= ~SCF_DO_STCLASS_AND; 732 StructCopy(&accum, data->start_class, 733 struct regnode_charclass_class); 734 flags |= SCF_DO_STCLASS_OR; 735 data->start_class->flags |= ANYOF_EOS; 736 } 737 } 738 } 739 else if (code == BRANCHJ) /* single branch is optimized. */ 740 scan = NEXTOPER(NEXTOPER(scan)); 741 else /* single branch is optimized. */ 742 scan = NEXTOPER(scan); 743 continue; 744 } 745 else if (OP(scan) == EXACT) { 746 I32 l = STR_LEN(scan); 747 if (UTF) { 748 unsigned char *s = (unsigned char *)STRING(scan); 749 unsigned char *e = s + l; 750 I32 newl = 0; 751 while (s < e) { 752 newl++; 753 s += UTF8SKIP(s); 754 } 755 l = newl; 756 } 757 min += l; 758 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 759 /* The code below prefers earlier match for fixed 760 offset, later match for variable offset. */ 761 if (data->last_end == -1) { /* Update the start info. */ 762 data->last_start_min = data->pos_min; 763 data->last_start_max = is_inf 764 ? I32_MAX : data->pos_min + data->pos_delta; 765 } 766 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 767 data->last_end = data->pos_min + l; 768 data->pos_min += l; /* As in the first entry. */ 769 data->flags &= ~SF_BEFORE_EOL; 770 } 771 if (flags & SCF_DO_STCLASS_AND) { 772 /* Check whether it is compatible with what we know already! */ 773 int compat = 1; 774 775 if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 776 && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) 777 && (!(data->start_class->flags & ANYOF_FOLD) 778 || !ANYOF_BITMAP_TEST(data->start_class, 779 PL_fold[*(U8*)STRING(scan)]))) 780 compat = 0; 781 ANYOF_CLASS_ZERO(data->start_class); 782 ANYOF_BITMAP_ZERO(data->start_class); 783 if (compat) 784 ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); 785 data->start_class->flags &= ~ANYOF_EOS; 786 } 787 else if (flags & SCF_DO_STCLASS_OR) { 788 /* false positive possible if the class is case-folded */ 789 ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); 790 data->start_class->flags &= ~ANYOF_EOS; 791 cl_and(data->start_class, &and_with); 792 } 793 flags &= ~SCF_DO_STCLASS; 794 } 795 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */ 796 I32 l = STR_LEN(scan); 797 798 /* Search for fixed substrings supports EXACT only. */ 799 if (flags & SCF_DO_SUBSTR) 800 scan_commit(data); 801 if (UTF) { 802 unsigned char *s = (unsigned char *)STRING(scan); 803 unsigned char *e = s + l; 804 I32 newl = 0; 805 while (s < e) { 806 newl++; 807 s += UTF8SKIP(s); 808 } 809 l = newl; 810 } 811 min += l; 812 if (data && (flags & SCF_DO_SUBSTR)) 813 data->pos_min += l; 814 if (flags & SCF_DO_STCLASS_AND) { 815 /* Check whether it is compatible with what we know already! */ 816 int compat = 1; 817 818 if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 819 && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) 820 && !ANYOF_BITMAP_TEST(data->start_class, 821 PL_fold[*(U8*)STRING(scan)])) 822 compat = 0; 823 ANYOF_CLASS_ZERO(data->start_class); 824 ANYOF_BITMAP_ZERO(data->start_class); 825 if (compat) { 826 ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); 827 data->start_class->flags &= ~ANYOF_EOS; 828 data->start_class->flags |= ANYOF_FOLD; 829 if (OP(scan) == EXACTFL) 830 data->start_class->flags |= ANYOF_LOCALE; 831 } 832 } 833 else if (flags & SCF_DO_STCLASS_OR) { 834 if (data->start_class->flags & ANYOF_FOLD) { 835 /* false positive possible if the class is case-folded. 836 Assume that the locale settings are the same... */ 837 ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); 838 data->start_class->flags &= ~ANYOF_EOS; 839 } 840 cl_and(data->start_class, &and_with); 841 } 842 flags &= ~SCF_DO_STCLASS; 843 } 844 else if (strchr((char*)PL_varies,OP(scan))) { 845 I32 mincount, maxcount, minnext, deltanext, fl; 846 I32 f = flags, pos_before = 0; 847 regnode *oscan = scan; 848 struct regnode_charclass_class this_class; 849 struct regnode_charclass_class *oclass = NULL; 850 851 switch (PL_regkind[(U8)OP(scan)]) { 852 case WHILEM: /* End of (?:...)* . */ 853 scan = NEXTOPER(scan); 854 goto finish; 855 case PLUS: 856 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 857 next = NEXTOPER(scan); 858 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { 859 mincount = 1; 860 maxcount = REG_INFTY; 861 next = regnext(scan); 862 scan = NEXTOPER(scan); 863 goto do_curly; 864 } 865 } 866 if (flags & SCF_DO_SUBSTR) 867 data->pos_min++; 868 min++; 869 /* Fall through. */ 870 case STAR: 871 if (flags & SCF_DO_STCLASS) { 872 mincount = 0; 873 maxcount = REG_INFTY; 874 next = regnext(scan); 875 scan = NEXTOPER(scan); 876 goto do_curly; 877 } 878 is_inf = is_inf_internal = 1; 879 scan = regnext(scan); 880 if (flags & SCF_DO_SUBSTR) { 881 scan_commit(data); /* Cannot extend fixed substrings */ 882 data->longest = &(data->longest_float); 883 } 884 goto optimize_curly_tail; 885 case CURLY: 886 mincount = ARG1(scan); 887 maxcount = ARG2(scan); 888 next = regnext(scan); 889 if (OP(scan) == CURLYX) { 890 I32 lp = (data ? *(data->last_closep) : 0); 891 892 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX); 893 } 894 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 895 do_curly: 896 if (flags & SCF_DO_SUBSTR) { 897 if (mincount == 0) scan_commit(data); /* Cannot extend fixed substrings */ 898 pos_before = data->pos_min; 899 } 900 if (data) { 901 fl = data->flags; 902 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 903 if (is_inf) 904 data->flags |= SF_IS_INF; 905 } 906 if (flags & SCF_DO_STCLASS) { 907 cl_init(&this_class); 908 oclass = data->start_class; 909 data->start_class = &this_class; 910 f |= SCF_DO_STCLASS_AND; 911 f &= ~SCF_DO_STCLASS_OR; 912 } 913 /* These are the cases when once a subexpression 914 fails at a particular position, it cannot succeed 915 even after backtracking at the enclosing scope. 916 917 XXXX what if minimal match and we are at the 918 initial run of {n,m}? */ 919 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) 920 f &= ~SCF_WHILEM_VISITED_POS; 921 922 /* This will finish on WHILEM, setting scan, or on NULL: */ 923 minnext = study_chunk(&scan, &deltanext, last, data, 924 mincount == 0 925 ? (f & ~SCF_DO_SUBSTR) : f); 926 927 if (flags & SCF_DO_STCLASS) 928 data->start_class = oclass; 929 if (mincount == 0 || minnext == 0) { 930 if (flags & SCF_DO_STCLASS_OR) { 931 cl_or(data->start_class, &this_class); 932 } 933 else if (flags & SCF_DO_STCLASS_AND) { 934 /* Switch to OR mode: cache the old value of 935 * data->start_class */ 936 StructCopy(data->start_class, &and_with, 937 struct regnode_charclass_class); 938 flags &= ~SCF_DO_STCLASS_AND; 939 StructCopy(&this_class, data->start_class, 940 struct regnode_charclass_class); 941 flags |= SCF_DO_STCLASS_OR; 942 data->start_class->flags |= ANYOF_EOS; 943 } 944 } else { /* Non-zero len */ 945 if (flags & SCF_DO_STCLASS_OR) { 946 cl_or(data->start_class, &this_class); 947 cl_and(data->start_class, &and_with); 948 } 949 else if (flags & SCF_DO_STCLASS_AND) 950 cl_and(data->start_class, &this_class); 951 flags &= ~SCF_DO_STCLASS; 952 } 953 if (!scan) /* It was not CURLYX, but CURLY. */ 954 scan = next; 955 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) 956 && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 957 && maxcount <= REG_INFTY/3) /* Complement check for big count */ 958 { 959 vWARN(PL_regcomp_parse, 960 "Quantifier unexpected on zero-length expression"); 961 } 962 963 min += minnext * mincount; 964 is_inf_internal |= ((maxcount == REG_INFTY 965 && (minnext + deltanext) > 0) 966 || deltanext == I32_MAX); 967 is_inf |= is_inf_internal; 968 delta += (minnext + deltanext) * maxcount - minnext * mincount; 969 970 /* Try powerful optimization CURLYX => CURLYN. */ 971 if ( OP(oscan) == CURLYX && data 972 && data->flags & SF_IN_PAR 973 && !(data->flags & SF_HAS_EVAL) 974 && !deltanext && minnext == 1 ) { 975 /* Try to optimize to CURLYN. */ 976 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 977 regnode *nxt1 = nxt, *nxt2; 978 979 /* Skip open. */ 980 nxt = regnext(nxt); 981 if (!strchr((char*)PL_simple,OP(nxt)) 982 && !(PL_regkind[(U8)OP(nxt)] == EXACT 983 && STR_LEN(nxt) == 1)) 984 goto nogo; 985 nxt2 = nxt; 986 nxt = regnext(nxt); 987 if (OP(nxt) != CLOSE) 988 goto nogo; 989 /* Now we know that nxt2 is the only contents: */ 990 oscan->flags = ARG(nxt); 991 OP(oscan) = CURLYN; 992 OP(nxt1) = NOTHING; /* was OPEN. */ 993 #ifdef DEBUGGING 994 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 995 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ 996 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ 997 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 998 OP(nxt + 1) = OPTIMIZED; /* was count. */ 999 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ 1000 #endif 1001 } 1002 nogo: 1003 1004 /* Try optimization CURLYX => CURLYM. */ 1005 if ( OP(oscan) == CURLYX && data 1006 && !(data->flags & SF_HAS_PAR) 1007 && !(data->flags & SF_HAS_EVAL) 1008 && !deltanext ) { 1009 /* XXXX How to optimize if data == 0? */ 1010 /* Optimize to a simpler form. */ 1011 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 1012 regnode *nxt2; 1013 1014 OP(oscan) = CURLYM; 1015 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 1016 && (OP(nxt2) != WHILEM)) 1017 nxt = nxt2; 1018 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 1019 /* Need to optimize away parenths. */ 1020 if (data->flags & SF_IN_PAR) { 1021 /* Set the parenth number. */ 1022 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 1023 1024 if (OP(nxt) != CLOSE) 1025 FAIL("Panic opt close"); 1026 oscan->flags = ARG(nxt); 1027 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 1028 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 1029 #ifdef DEBUGGING 1030 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 1031 OP(nxt + 1) = OPTIMIZED; /* was count. */ 1032 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ 1033 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ 1034 #endif 1035 #if 0 1036 while ( nxt1 && (OP(nxt1) != WHILEM)) { 1037 regnode *nnxt = regnext(nxt1); 1038 1039 if (nnxt == nxt) { 1040 if (reg_off_by_arg[OP(nxt1)]) 1041 ARG_SET(nxt1, nxt2 - nxt1); 1042 else if (nxt2 - nxt1 < U16_MAX) 1043 NEXT_OFF(nxt1) = nxt2 - nxt1; 1044 else 1045 OP(nxt) = NOTHING; /* Cannot beautify */ 1046 } 1047 nxt1 = nnxt; 1048 } 1049 #endif 1050 /* Optimize again: */ 1051 study_chunk(&nxt1, &deltanext, nxt, NULL, 0); 1052 } 1053 else 1054 oscan->flags = 0; 1055 } 1056 else if ((OP(oscan) == CURLYX) 1057 && (flags & SCF_WHILEM_VISITED_POS) 1058 /* See the comment on a similar expression above. 1059 However, this time it not a subexpression 1060 we care about, but the expression itself. */ 1061 && (maxcount == REG_INFTY) 1062 && data && ++data->whilem_c < 16) { 1063 /* This stays as CURLYX, we can put the count/of pair. */ 1064 /* Find WHILEM (as in regexec.c) */ 1065 regnode *nxt = oscan + NEXT_OFF(oscan); 1066 1067 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 1068 nxt += ARG(nxt); 1069 PREVOPER(nxt)->flags = data->whilem_c 1070 | (PL_reg_whilem_seen << 4); /* On WHILEM */ 1071 } 1072 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 1073 pars++; 1074 if (flags & SCF_DO_SUBSTR) { 1075 SV *last_str = Nullsv; 1076 int counted = mincount != 0; 1077 1078 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ 1079 I32 b = pos_before >= data->last_start_min 1080 ? pos_before : data->last_start_min; 1081 STRLEN l; 1082 char *s = SvPV(data->last_found, l); 1083 I32 old = b - data->last_start_min; 1084 1085 if (UTF) 1086 old = utf8_hop((U8*)s, old) - (U8*)s; 1087 1088 l -= old; 1089 /* Get the added string: */ 1090 last_str = newSVpvn(s + old, l); 1091 if (deltanext == 0 && pos_before == b) { 1092 /* What was added is a constant string */ 1093 if (mincount > 1) { 1094 SvGROW(last_str, (mincount * l) + 1); 1095 repeatcpy(SvPVX(last_str) + l, 1096 SvPVX(last_str), l, mincount - 1); 1097 SvCUR(last_str) *= mincount; 1098 /* Add additional parts. */ 1099 SvCUR_set(data->last_found, 1100 SvCUR(data->last_found) - l); 1101 sv_catsv(data->last_found, last_str); 1102 data->last_end += l * (mincount - 1); 1103 } 1104 } else { 1105 /* start offset must point into the last copy */ 1106 data->last_start_min += minnext * (mincount - 1); 1107 data->last_start_max += is_inf ? 0 : (maxcount - 1) 1108 * (minnext + data->pos_delta); 1109 } 1110 } 1111 /* It is counted once already... */ 1112 data->pos_min += minnext * (mincount - counted); 1113 data->pos_delta += - counted * deltanext + 1114 (minnext + deltanext) * maxcount - minnext * mincount; 1115 if (mincount != maxcount) { 1116 /* Cannot extend fixed substrings found inside 1117 the group. */ 1118 scan_commit(data); 1119 if (mincount && last_str) { 1120 sv_setsv(data->last_found, last_str); 1121 data->last_end = data->pos_min; 1122 data->last_start_min = 1123 data->pos_min - CHR_SVLEN(last_str); 1124 data->last_start_max = is_inf 1125 ? I32_MAX 1126 : data->pos_min + data->pos_delta 1127 - CHR_SVLEN(last_str); 1128 } 1129 data->longest = &(data->longest_float); 1130 } 1131 SvREFCNT_dec(last_str); 1132 } 1133 if (data && (fl & SF_HAS_EVAL)) 1134 data->flags |= SF_HAS_EVAL; 1135 optimize_curly_tail: 1136 if (OP(oscan) != CURLYX) { 1137 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING 1138 && NEXT_OFF(next)) 1139 NEXT_OFF(oscan) += NEXT_OFF(next); 1140 } 1141 continue; 1142 default: /* REF and CLUMP only? */ 1143 if (flags & SCF_DO_SUBSTR) { 1144 scan_commit(data); /* Cannot expect anything... */ 1145 data->longest = &(data->longest_float); 1146 } 1147 is_inf = is_inf_internal = 1; 1148 if (flags & SCF_DO_STCLASS_OR) 1149 cl_anything(data->start_class); 1150 flags &= ~SCF_DO_STCLASS; 1151 break; 1152 } 1153 } 1154 else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) { 1155 int value; 1156 1157 if (flags & SCF_DO_SUBSTR) { 1158 scan_commit(data); 1159 data->pos_min++; 1160 } 1161 min++; 1162 if (flags & SCF_DO_STCLASS) { 1163 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 1164 1165 /* Some of the logic below assumes that switching 1166 locale on will only add false positives. */ 1167 switch (PL_regkind[(U8)OP(scan)]) { 1168 case ANYUTF8: 1169 case SANY: 1170 case SANYUTF8: 1171 case ALNUMUTF8: 1172 case ANYOFUTF8: 1173 case ALNUMLUTF8: 1174 case NALNUMUTF8: 1175 case NALNUMLUTF8: 1176 case SPACEUTF8: 1177 case NSPACEUTF8: 1178 case SPACELUTF8: 1179 case NSPACELUTF8: 1180 case DIGITUTF8: 1181 case NDIGITUTF8: 1182 default: 1183 do_default: 1184 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ 1185 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 1186 cl_anything(data->start_class); 1187 break; 1188 case REG_ANY: 1189 if (OP(scan) == SANY) 1190 goto do_default; 1191 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ 1192 value = (ANYOF_BITMAP_TEST(data->start_class,'\n') 1193 || (data->start_class->flags & ANYOF_CLASS)); 1194 cl_anything(data->start_class); 1195 } 1196 if (flags & SCF_DO_STCLASS_AND || !value) 1197 ANYOF_BITMAP_CLEAR(data->start_class,'\n'); 1198 break; 1199 case ANYOF: 1200 if (flags & SCF_DO_STCLASS_AND) 1201 cl_and(data->start_class, 1202 (struct regnode_charclass_class*)scan); 1203 else 1204 cl_or(data->start_class, 1205 (struct regnode_charclass_class*)scan); 1206 break; 1207 case ALNUM: 1208 if (flags & SCF_DO_STCLASS_AND) { 1209 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1210 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 1211 for (value = 0; value < 256; value++) 1212 if (!isALNUM(value)) 1213 ANYOF_BITMAP_CLEAR(data->start_class, value); 1214 } 1215 } 1216 else { 1217 if (data->start_class->flags & ANYOF_LOCALE) 1218 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 1219 else { 1220 for (value = 0; value < 256; value++) 1221 if (isALNUM(value)) 1222 ANYOF_BITMAP_SET(data->start_class, value); 1223 } 1224 } 1225 break; 1226 case ALNUML: 1227 if (flags & SCF_DO_STCLASS_AND) { 1228 if (data->start_class->flags & ANYOF_LOCALE) 1229 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 1230 } 1231 else { 1232 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 1233 data->start_class->flags |= ANYOF_LOCALE; 1234 } 1235 break; 1236 case NALNUM: 1237 if (flags & SCF_DO_STCLASS_AND) { 1238 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1239 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 1240 for (value = 0; value < 256; value++) 1241 if (isALNUM(value)) 1242 ANYOF_BITMAP_CLEAR(data->start_class, value); 1243 } 1244 } 1245 else { 1246 if (data->start_class->flags & ANYOF_LOCALE) 1247 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 1248 else { 1249 for (value = 0; value < 256; value++) 1250 if (!isALNUM(value)) 1251 ANYOF_BITMAP_SET(data->start_class, value); 1252 } 1253 } 1254 break; 1255 case NALNUML: 1256 if (flags & SCF_DO_STCLASS_AND) { 1257 if (data->start_class->flags & ANYOF_LOCALE) 1258 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 1259 } 1260 else { 1261 data->start_class->flags |= ANYOF_LOCALE; 1262 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 1263 } 1264 break; 1265 case SPACE: 1266 if (flags & SCF_DO_STCLASS_AND) { 1267 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1268 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 1269 for (value = 0; value < 256; value++) 1270 if (!isSPACE(value)) 1271 ANYOF_BITMAP_CLEAR(data->start_class, value); 1272 } 1273 } 1274 else { 1275 if (data->start_class->flags & ANYOF_LOCALE) 1276 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 1277 else { 1278 for (value = 0; value < 256; value++) 1279 if (isSPACE(value)) 1280 ANYOF_BITMAP_SET(data->start_class, value); 1281 } 1282 } 1283 break; 1284 case SPACEL: 1285 if (flags & SCF_DO_STCLASS_AND) { 1286 if (data->start_class->flags & ANYOF_LOCALE) 1287 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 1288 } 1289 else { 1290 data->start_class->flags |= ANYOF_LOCALE; 1291 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 1292 } 1293 break; 1294 case NSPACE: 1295 if (flags & SCF_DO_STCLASS_AND) { 1296 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1297 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 1298 for (value = 0; value < 256; value++) 1299 if (isSPACE(value)) 1300 ANYOF_BITMAP_CLEAR(data->start_class, value); 1301 } 1302 } 1303 else { 1304 if (data->start_class->flags & ANYOF_LOCALE) 1305 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 1306 else { 1307 for (value = 0; value < 256; value++) 1308 if (!isSPACE(value)) 1309 ANYOF_BITMAP_SET(data->start_class, value); 1310 } 1311 } 1312 break; 1313 case NSPACEL: 1314 if (flags & SCF_DO_STCLASS_AND) { 1315 if (data->start_class->flags & ANYOF_LOCALE) { 1316 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 1317 for (value = 0; value < 256; value++) 1318 if (!isSPACE(value)) 1319 ANYOF_BITMAP_CLEAR(data->start_class, value); 1320 } 1321 } 1322 else { 1323 data->start_class->flags |= ANYOF_LOCALE; 1324 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 1325 } 1326 break; 1327 case DIGIT: 1328 if (flags & SCF_DO_STCLASS_AND) { 1329 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); 1330 for (value = 0; value < 256; value++) 1331 if (!isDIGIT(value)) 1332 ANYOF_BITMAP_CLEAR(data->start_class, value); 1333 } 1334 else { 1335 if (data->start_class->flags & ANYOF_LOCALE) 1336 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); 1337 else { 1338 for (value = 0; value < 256; value++) 1339 if (isDIGIT(value)) 1340 ANYOF_BITMAP_SET(data->start_class, value); 1341 } 1342 } 1343 break; 1344 case NDIGIT: 1345 if (flags & SCF_DO_STCLASS_AND) { 1346 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); 1347 for (value = 0; value < 256; value++) 1348 if (isDIGIT(value)) 1349 ANYOF_BITMAP_CLEAR(data->start_class, value); 1350 } 1351 else { 1352 if (data->start_class->flags & ANYOF_LOCALE) 1353 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); 1354 else { 1355 for (value = 0; value < 256; value++) 1356 if (!isDIGIT(value)) 1357 ANYOF_BITMAP_SET(data->start_class, value); 1358 } 1359 } 1360 break; 1361 } 1362 if (flags & SCF_DO_STCLASS_OR) 1363 cl_and(data->start_class, &and_with); 1364 flags &= ~SCF_DO_STCLASS; 1365 } 1366 } 1367 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 1368 data->flags |= (OP(scan) == MEOL 1369 ? SF_BEFORE_MEOL 1370 : SF_BEFORE_SEOL); 1371 } 1372 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ 1373 /* Lookbehind, or need to calculate parens/evals/stclass: */ 1374 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 1375 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { 1376 /* Lookahead/lookbehind */ 1377 I32 deltanext, minnext, fake = 0; 1378 regnode *nscan; 1379 struct regnode_charclass_class intrnl; 1380 int f = 0; 1381 1382 data_fake.flags = 0; 1383 if (data) { 1384 data_fake.whilem_c = data->whilem_c; 1385 data_fake.last_closep = data->last_closep; 1386 } 1387 else 1388 data_fake.last_closep = &fake; 1389 if ( flags & SCF_DO_STCLASS && !scan->flags 1390 && OP(scan) == IFMATCH ) { /* Lookahead */ 1391 cl_init(&intrnl); 1392 data_fake.start_class = &intrnl; 1393 f |= SCF_DO_STCLASS_AND; 1394 } 1395 if (flags & SCF_WHILEM_VISITED_POS) 1396 f |= SCF_WHILEM_VISITED_POS; 1397 next = regnext(scan); 1398 nscan = NEXTOPER(NEXTOPER(scan)); 1399 minnext = study_chunk(&nscan, &deltanext, last, &data_fake, f); 1400 if (scan->flags) { 1401 if (deltanext) { 1402 vFAIL("Variable length lookbehind not implemented"); 1403 } 1404 else if (minnext > U8_MAX) { 1405 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 1406 } 1407 scan->flags = minnext; 1408 } 1409 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 1410 pars++; 1411 if (data && (data_fake.flags & SF_HAS_EVAL)) 1412 data->flags |= SF_HAS_EVAL; 1413 if (data) 1414 data->whilem_c = data_fake.whilem_c; 1415 if (f & SCF_DO_STCLASS_AND) { 1416 int was = (data->start_class->flags & ANYOF_EOS); 1417 1418 cl_and(data->start_class, &intrnl); 1419 if (was) 1420 data->start_class->flags |= ANYOF_EOS; 1421 } 1422 } 1423 else if (OP(scan) == OPEN) { 1424 pars++; 1425 } 1426 else if (OP(scan) == CLOSE) { 1427 if (ARG(scan) == is_par) { 1428 next = regnext(scan); 1429 1430 if ( next && (OP(next) != WHILEM) && next < last) 1431 is_par = 0; /* Disable optimization */ 1432 } 1433 if (data) 1434 *(data->last_closep) = ARG(scan); 1435 } 1436 else if (OP(scan) == EVAL) { 1437 if (data) 1438 data->flags |= SF_HAS_EVAL; 1439 } 1440 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ 1441 if (flags & SCF_DO_SUBSTR) { 1442 scan_commit(data); 1443 data->longest = &(data->longest_float); 1444 } 1445 is_inf = is_inf_internal = 1; 1446 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 1447 cl_anything(data->start_class); 1448 flags &= ~SCF_DO_STCLASS; 1449 } 1450 /* Else: zero-length, ignore. */ 1451 scan = regnext(scan); 1452 } 1453 1454 finish: 1455 *scanp = scan; 1456 *deltap = is_inf_internal ? I32_MAX : delta; 1457 if (flags & SCF_DO_SUBSTR && is_inf) 1458 data->pos_delta = I32_MAX - data->pos_min; 1459 if (is_par > U8_MAX) 1460 is_par = 0; 1461 if (is_par && pars==1 && data) { 1462 data->flags |= SF_IN_PAR; 1463 data->flags &= ~SF_HAS_PAR; 1464 } 1465 else if (pars && data) { 1466 data->flags |= SF_HAS_PAR; 1467 data->flags &= ~SF_IN_PAR; 1468 } 1469 if (flags & SCF_DO_STCLASS_OR) 1470 cl_and(data->start_class, &and_with); 1471 return min; 1472 } 1473 1474 STATIC I32 1475 S_add_data(pTHX_ I32 n, char *s) 1476 { 1477 if (PL_regcomp_rx->data) { 1478 Renewc(PL_regcomp_rx->data, 1479 sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (PL_regcomp_rx->data->count + n - 1), 1480 char, struct reg_data); 1481 Renew(PL_regcomp_rx->data->what, PL_regcomp_rx->data->count + n, U8); 1482 PL_regcomp_rx->data->count += n; 1483 } 1484 else { 1485 Newc(1207, PL_regcomp_rx->data, sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (n - 1), 1486 char, struct reg_data); 1487 New(1208, PL_regcomp_rx->data->what, n, U8); 1488 PL_regcomp_rx->data->count = n; 1489 } 1490 Copy(s, PL_regcomp_rx->data->what + PL_regcomp_rx->data->count - n, n, U8); 1491 return PL_regcomp_rx->data->count - n; 1492 } 1493 1494 void 1495 Perl_reginitcolors(pTHX) 1496 { 1497 int i = 0; 1498 char *s = PerlEnv_getenv("PERL_RE_COLORS"); 1499 1500 if (s) { 1501 PL_colors[0] = s = savepv(s); 1502 while (++i < 6) { 1503 s = strchr(s, '\t'); 1504 if (s) { 1505 *s = '\0'; 1506 PL_colors[i] = ++s; 1507 } 1508 else 1509 PL_colors[i] = s = ""; 1510 } 1511 } else { 1512 while (i < 6) 1513 PL_colors[i++] = ""; 1514 } 1515 PL_colorset = 1; 1516 } 1517 1518 1519 /* 1520 - pregcomp - compile a regular expression into internal code 1521 * 1522 * We can't allocate space until we know how big the compiled form will be, 1523 * but we can't compile it (and thus know how big it is) until we've got a 1524 * place to put the code. So we cheat: we compile it twice, once with code 1525 * generation turned off and size counting turned on, and once "for real". 1526 * This also means that we don't allocate space until we are sure that the 1527 * thing really will compile successfully, and we never have to move the 1528 * code and thus invalidate pointers into it. (Note that it has to be in 1529 * one piece because free() must be able to free it all.) [NB: not true in perl] 1530 * 1531 * Beware that the optimization-preparation code in here knows about some 1532 * of the structure of the compiled regexp. [I'll say.] 1533 */ 1534 regexp * 1535 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) 1536 { 1537 register regexp *r; 1538 regnode *scan; 1539 regnode *first; 1540 I32 flags; 1541 I32 minlen = 0; 1542 I32 sawplus = 0; 1543 I32 sawopen = 0; 1544 scan_data_t data; 1545 1546 if (exp == NULL) 1547 FAIL("NULL regexp argument"); 1548 1549 if (pm->op_pmdynflags & PMdf_UTF8) { 1550 PL_reg_flags |= RF_utf8; 1551 } 1552 else 1553 PL_reg_flags = 0; 1554 1555 PL_regprecomp = exp; 1556 DEBUG_r(if (!PL_colorset) reginitcolors()); 1557 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", 1558 PL_colors[4],PL_colors[5],PL_colors[0], 1559 (int)(xend - exp), PL_regprecomp, PL_colors[1])); 1560 PL_regflags = pm->op_pmflags; 1561 PL_regsawback = 0; 1562 1563 PL_regseen = 0; 1564 PL_seen_zerolen = *exp == '^' ? -1 : 0; 1565 PL_seen_evals = 0; 1566 PL_extralen = 0; 1567 1568 /* First pass: determine size, legality. */ 1569 PL_regcomp_parse = exp; 1570 PL_regxend = xend; 1571 PL_regnaughty = 0; 1572 PL_regnpar = 1; 1573 PL_regsize = 0L; 1574 PL_regcode = &PL_regdummy; 1575 PL_reg_whilem_seen = 0; 1576 #if 0 /* REGC() is (currently) a NOP at the first pass. 1577 * Clever compilers notice this and complain. --jhi */ 1578 REGC((U8)REG_MAGIC, (char*)PL_regcode); 1579 #endif 1580 if (reg(0, &flags) == NULL) { 1581 PL_regprecomp = Nullch; 1582 return(NULL); 1583 } 1584 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)PL_regsize)); 1585 1586 /* Small enough for pointer-storage convention? 1587 If extralen==0, this means that we will not need long jumps. */ 1588 if (PL_regsize >= 0x10000L && PL_extralen) 1589 PL_regsize += PL_extralen; 1590 else 1591 PL_extralen = 0; 1592 if (PL_reg_whilem_seen > 15) 1593 PL_reg_whilem_seen = 15; 1594 1595 /* Allocate space and initialize. */ 1596 Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), 1597 char, regexp); 1598 if (r == NULL) 1599 FAIL("Regexp out of space"); 1600 1601 #ifdef DEBUGGING 1602 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 1603 Zero(r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char); 1604 #endif 1605 r->refcnt = 1; 1606 r->prelen = xend - exp; 1607 r->precomp = savepvn(PL_regprecomp, r->prelen); 1608 r->subbeg = NULL; 1609 r->reganch = pm->op_pmflags & PMf_COMPILETIME; 1610 r->nparens = PL_regnpar - 1; /* set early to validate backrefs */ 1611 1612 r->substrs = 0; /* Useful during FAIL. */ 1613 r->startp = 0; /* Useful during FAIL. */ 1614 r->endp = 0; /* Useful during FAIL. */ 1615 1616 PL_regcomp_rx = r; 1617 1618 /* Second pass: emit code. */ 1619 PL_regcomp_parse = exp; 1620 PL_regxend = xend; 1621 PL_regnaughty = 0; 1622 PL_regnpar = 1; 1623 PL_regcode = r->program; 1624 /* Store the count of eval-groups for security checks: */ 1625 PL_regcode->next_off = ((PL_seen_evals > U16_MAX) ? U16_MAX : PL_seen_evals); 1626 REGC((U8)REG_MAGIC, (char*) PL_regcode++); 1627 r->data = 0; 1628 if (reg(0, &flags) == NULL) 1629 return(NULL); 1630 1631 /* Dig out information for optimizations. */ 1632 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ 1633 pm->op_pmflags = PL_regflags; 1634 if (UTF) 1635 r->reganch |= ROPT_UTF8; 1636 r->regstclass = NULL; 1637 if (PL_regnaughty >= 10) /* Probably an expensive pattern. */ 1638 r->reganch |= ROPT_NAUGHTY; 1639 scan = r->program + 1; /* First BRANCH. */ 1640 1641 /* XXXX To minimize changes to RE engine we always allocate 1642 3-units-long substrs field. */ 1643 Newz(1004, r->substrs, 1, struct reg_substr_data); 1644 1645 StructCopy(&zero_scan_data, &data, scan_data_t); 1646 /* XXXX Should not we check for something else? Usually it is OPEN1... */ 1647 if (OP(scan) != BRANCH) { /* Only one top-level choice. */ 1648 I32 fake; 1649 STRLEN longest_float_length, longest_fixed_length; 1650 struct regnode_charclass_class ch_class; 1651 int stclass_flag; 1652 I32 last_close = 0; 1653 1654 first = scan; 1655 /* Skip introductions and multiplicators >= 1. */ 1656 while ((OP(first) == OPEN && (sawopen = 1)) || 1657 /* An OR of *one* alternative - should not happen now. */ 1658 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || 1659 (OP(first) == PLUS) || 1660 (OP(first) == MINMOD) || 1661 /* An {n,m} with n>0 */ 1662 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) { 1663 if (OP(first) == PLUS) 1664 sawplus = 1; 1665 else 1666 first += regarglen[(U8)OP(first)]; 1667 first = NEXTOPER(first); 1668 } 1669 1670 /* Starting-point info. */ 1671 again: 1672 if (PL_regkind[(U8)OP(first)] == EXACT) { 1673 if (OP(first) == EXACT); /* Empty, get anchored substr later. */ 1674 else if ((OP(first) == EXACTF || OP(first) == EXACTFL) 1675 && !UTF) 1676 r->regstclass = first; 1677 } 1678 else if (strchr((char*)PL_simple,OP(first))) 1679 r->regstclass = first; 1680 else if (PL_regkind[(U8)OP(first)] == BOUND || 1681 PL_regkind[(U8)OP(first)] == NBOUND) 1682 r->regstclass = first; 1683 else if (PL_regkind[(U8)OP(first)] == BOL) { 1684 r->reganch |= (OP(first) == MBOL 1685 ? ROPT_ANCH_MBOL 1686 : (OP(first) == SBOL 1687 ? ROPT_ANCH_SBOL 1688 : ROPT_ANCH_BOL)); 1689 first = NEXTOPER(first); 1690 goto again; 1691 } 1692 else if (OP(first) == GPOS) { 1693 r->reganch |= ROPT_ANCH_GPOS; 1694 first = NEXTOPER(first); 1695 goto again; 1696 } 1697 else if ((OP(first) == STAR && 1698 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) && 1699 !(r->reganch & ROPT_ANCH) ) 1700 { 1701 /* turn .* into ^.* with an implied $*=1 */ 1702 int type = OP(NEXTOPER(first)); 1703 1704 if (type == REG_ANY || type == ANYUTF8) 1705 type = ROPT_ANCH_MBOL; 1706 else 1707 type = ROPT_ANCH_SBOL; 1708 1709 r->reganch |= type | ROPT_IMPLICIT; 1710 first = NEXTOPER(first); 1711 goto again; 1712 } 1713 if (sawplus && (!sawopen || !PL_regsawback) 1714 && !(PL_regseen & REG_SEEN_EVAL)) /* May examine pos and $& */ 1715 /* x+ must match at the 1st pos of run of x's */ 1716 r->reganch |= ROPT_SKIP; 1717 1718 /* Scan is after the zeroth branch, first is atomic matcher. */ 1719 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 1720 (IV)(first - scan + 1))); 1721 /* 1722 * If there's something expensive in the r.e., find the 1723 * longest literal string that must appear and make it the 1724 * regmust. Resolve ties in favor of later strings, since 1725 * the regstart check works with the beginning of the r.e. 1726 * and avoiding duplication strengthens checking. Not a 1727 * strong reason, but sufficient in the absence of others. 1728 * [Now we resolve ties in favor of the earlier string if 1729 * it happens that c_offset_min has been invalidated, since the 1730 * earlier string may buy us something the later one won't.] 1731 */ 1732 minlen = 0; 1733 1734 data.longest_fixed = newSVpvn("",0); 1735 data.longest_float = newSVpvn("",0); 1736 data.last_found = newSVpvn("",0); 1737 data.longest = &(data.longest_fixed); 1738 first = scan; 1739 if (!r->regstclass) { 1740 cl_init(&ch_class); 1741 data.start_class = &ch_class; 1742 stclass_flag = SCF_DO_STCLASS_AND; 1743 } else /* XXXX Check for BOUND? */ 1744 stclass_flag = 0; 1745 data.last_closep = &last_close; 1746 1747 minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */ 1748 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag); 1749 if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed) 1750 && data.last_start_min == 0 && data.last_end > 0 1751 && !PL_seen_zerolen 1752 && (!(PL_regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) 1753 r->reganch |= ROPT_CHECK_ALL; 1754 scan_commit(&data); 1755 SvREFCNT_dec(data.last_found); 1756 1757 longest_float_length = CHR_SVLEN(data.longest_float); 1758 if (longest_float_length 1759 || (data.flags & SF_FL_BEFORE_EOL 1760 && (!(data.flags & SF_FL_BEFORE_MEOL) 1761 || (PL_regflags & PMf_MULTILINE)))) { 1762 int t; 1763 1764 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 1765 && data.offset_fixed == data.offset_float_min 1766 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) 1767 goto remove_float; /* As in (a)+. */ 1768 1769 r->float_substr = data.longest_float; 1770 r->float_min_offset = data.offset_float_min; 1771 r->float_max_offset = data.offset_float_max; 1772 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ 1773 && (!(data.flags & SF_FL_BEFORE_MEOL) 1774 || (PL_regflags & PMf_MULTILINE))); 1775 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0); 1776 } 1777 else { 1778 remove_float: 1779 r->float_substr = Nullsv; 1780 SvREFCNT_dec(data.longest_float); 1781 longest_float_length = 0; 1782 } 1783 1784 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 1785 if (longest_fixed_length 1786 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ 1787 && (!(data.flags & SF_FIX_BEFORE_MEOL) 1788 || (PL_regflags & PMf_MULTILINE)))) { 1789 int t; 1790 1791 r->anchored_substr = data.longest_fixed; 1792 r->anchored_offset = data.offset_fixed; 1793 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ 1794 && (!(data.flags & SF_FIX_BEFORE_MEOL) 1795 || (PL_regflags & PMf_MULTILINE))); 1796 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0); 1797 } 1798 else { 1799 r->anchored_substr = Nullsv; 1800 SvREFCNT_dec(data.longest_fixed); 1801 longest_fixed_length = 0; 1802 } 1803 if (r->regstclass 1804 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8 1805 || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY)) 1806 r->regstclass = NULL; 1807 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag 1808 && !(data.start_class->flags & ANYOF_EOS) 1809 && !cl_is_anything(data.start_class)) { 1810 SV *sv; 1811 I32 n = add_data(1, "f"); 1812 1813 New(1006, PL_regcomp_rx->data->data[n], 1, 1814 struct regnode_charclass_class); 1815 StructCopy(data.start_class, 1816 (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n], 1817 struct regnode_charclass_class); 1818 r->regstclass = (regnode*)PL_regcomp_rx->data->data[n]; 1819 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ 1820 DEBUG_r((sv = sv_newmortal(), 1821 regprop(sv, (regnode*)data.start_class), 1822 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", 1823 SvPVX(sv)))); 1824 } 1825 1826 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ 1827 if (longest_fixed_length > longest_float_length) { 1828 r->check_substr = r->anchored_substr; 1829 r->check_offset_min = r->check_offset_max = r->anchored_offset; 1830 if (r->reganch & ROPT_ANCH_SINGLE) 1831 r->reganch |= ROPT_NOSCAN; 1832 } 1833 else { 1834 r->check_substr = r->float_substr; 1835 r->check_offset_min = data.offset_float_min; 1836 r->check_offset_max = data.offset_float_max; 1837 } 1838 /* XXXX Currently intuiting is not compatible with ANCH_GPOS. 1839 This should be changed ASAP! */ 1840 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) { 1841 r->reganch |= RE_USE_INTUIT; 1842 if (SvTAIL(r->check_substr)) 1843 r->reganch |= RE_INTUIT_TAIL; 1844 } 1845 } 1846 else { 1847 /* Several toplevels. Best we can is to set minlen. */ 1848 I32 fake; 1849 struct regnode_charclass_class ch_class; 1850 I32 last_close = 0; 1851 1852 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); 1853 scan = r->program + 1; 1854 cl_init(&ch_class); 1855 data.start_class = &ch_class; 1856 data.last_closep = &last_close; 1857 minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); 1858 r->check_substr = r->anchored_substr = r->float_substr = Nullsv; 1859 if (!(data.start_class->flags & ANYOF_EOS) 1860 && !cl_is_anything(data.start_class)) { 1861 SV *sv; 1862 I32 n = add_data(1, "f"); 1863 1864 New(1006, PL_regcomp_rx->data->data[n], 1, 1865 struct regnode_charclass_class); 1866 StructCopy(data.start_class, 1867 (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n], 1868 struct regnode_charclass_class); 1869 r->regstclass = (regnode*)PL_regcomp_rx->data->data[n]; 1870 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ 1871 DEBUG_r((sv = sv_newmortal(), 1872 regprop(sv, (regnode*)data.start_class), 1873 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", 1874 SvPVX(sv)))); 1875 } 1876 } 1877 1878 r->minlen = minlen; 1879 if (PL_regseen & REG_SEEN_GPOS) 1880 r->reganch |= ROPT_GPOS_SEEN; 1881 if (PL_regseen & REG_SEEN_LOOKBEHIND) 1882 r->reganch |= ROPT_LOOKBEHIND_SEEN; 1883 if (PL_regseen & REG_SEEN_EVAL) 1884 r->reganch |= ROPT_EVAL_SEEN; 1885 Newz(1002, r->startp, PL_regnpar, I32); 1886 Newz(1002, r->endp, PL_regnpar, I32); 1887 PL_regdata = r->data; /* for regprop() ANYOFUTF8 */ 1888 DEBUG_r(regdump(r)); 1889 return(r); 1890 } 1891 1892 /* 1893 - reg - regular expression, i.e. main body or parenthesized thing 1894 * 1895 * Caller must absorb opening parenthesis. 1896 * 1897 * Combining parenthesis handling with the base level of regular expression 1898 * is a trifle forced, but the need to tie the tails of the branches to what 1899 * follows makes it hard to avoid. 1900 */ 1901 STATIC regnode * 1902 S_reg(pTHX_ I32 paren, I32 *flagp) 1903 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ 1904 { 1905 register regnode *ret; /* Will be the head of the group. */ 1906 register regnode *br; 1907 register regnode *lastbr; 1908 register regnode *ender = 0; 1909 register I32 parno = 0; 1910 I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0; 1911 char *oregcomp_parse = PL_regcomp_parse; 1912 char c; 1913 1914 *flagp = 0; /* Tentatively. */ 1915 1916 /* Make an OPEN node, if parenthesized. */ 1917 if (paren) { 1918 if (*PL_regcomp_parse == '?') { 1919 U16 posflags = 0, negflags = 0; 1920 U16 *flagsp = &posflags; 1921 int logical = 0; 1922 char *seqstart = PL_regcomp_parse; 1923 1924 PL_regcomp_parse++; 1925 paren = *PL_regcomp_parse++; 1926 ret = NULL; /* For look-ahead/behind. */ 1927 switch (paren) { 1928 case '<': 1929 PL_regseen |= REG_SEEN_LOOKBEHIND; 1930 if (*PL_regcomp_parse == '!') 1931 paren = ','; 1932 if (*PL_regcomp_parse != '=' && *PL_regcomp_parse != '!') 1933 goto unknown; 1934 PL_regcomp_parse++; 1935 case '=': 1936 case '!': 1937 PL_seen_zerolen++; 1938 case ':': 1939 case '>': 1940 break; 1941 case '$': 1942 case '@': 1943 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 1944 break; 1945 case '#': 1946 while (*PL_regcomp_parse && *PL_regcomp_parse != ')') 1947 PL_regcomp_parse++; 1948 if (*PL_regcomp_parse != ')') 1949 FAIL("Sequence (?#... not terminated"); 1950 nextchar(); 1951 *flagp = TRYAGAIN; 1952 return NULL; 1953 case 'p': 1954 if (SIZE_ONLY) 1955 vWARN(PL_regcomp_parse, "(?p{}) is deprecated - use (??{})"); 1956 /* FALL THROUGH*/ 1957 case '?': 1958 logical = 1; 1959 paren = *PL_regcomp_parse++; 1960 /* FALL THROUGH */ 1961 case '{': 1962 { 1963 I32 count = 1, n = 0; 1964 char c; 1965 char *s = PL_regcomp_parse; 1966 SV *sv; 1967 OP_4tree *sop, *rop; 1968 1969 PL_seen_zerolen++; 1970 PL_regseen |= REG_SEEN_EVAL; 1971 while (count && (c = *PL_regcomp_parse)) { 1972 if (c == '\\' && PL_regcomp_parse[1]) 1973 PL_regcomp_parse++; 1974 else if (c == '{') 1975 count++; 1976 else if (c == '}') 1977 count--; 1978 PL_regcomp_parse++; 1979 } 1980 if (*PL_regcomp_parse != ')') 1981 { 1982 PL_regcomp_parse = s; 1983 vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); 1984 } 1985 if (!SIZE_ONLY) { 1986 AV *av; 1987 1988 if (PL_regcomp_parse - 1 - s) 1989 sv = newSVpvn(s, PL_regcomp_parse - 1 - s); 1990 else 1991 sv = newSVpvn("", 0); 1992 1993 ENTER; 1994 Perl_save_re_context(aTHX); 1995 rop = sv_compile_2op(sv, &sop, "re", &av); 1996 LEAVE; 1997 1998 n = add_data(3, "nop"); 1999 PL_regcomp_rx->data->data[n] = (void*)rop; 2000 PL_regcomp_rx->data->data[n+1] = (void*)sop; 2001 PL_regcomp_rx->data->data[n+2] = (void*)av; 2002 SvREFCNT_dec(sv); 2003 } 2004 else { /* First pass */ 2005 if (PL_reginterp_cnt < ++PL_seen_evals 2006 && PL_curcop != &PL_compiling) 2007 /* No compiled RE interpolated, has runtime 2008 components ===> unsafe. */ 2009 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 2010 if (PL_tainted) 2011 FAIL("Eval-group in insecure regular expression"); 2012 } 2013 2014 nextchar(); 2015 if (logical) { 2016 ret = reg_node(LOGICAL); 2017 if (!SIZE_ONLY) 2018 ret->flags = 2; 2019 regtail(ret, reganode(EVAL, n)); 2020 return ret; 2021 } 2022 return reganode(EVAL, n); 2023 } 2024 case '(': 2025 { 2026 if (PL_regcomp_parse[0] == '?') { 2027 if (PL_regcomp_parse[1] == '=' || PL_regcomp_parse[1] == '!' 2028 || PL_regcomp_parse[1] == '<' 2029 || PL_regcomp_parse[1] == '{') { /* Lookahead or eval. */ 2030 I32 flag; 2031 2032 ret = reg_node(LOGICAL); 2033 if (!SIZE_ONLY) 2034 ret->flags = 1; 2035 regtail(ret, reg(1, &flag)); 2036 goto insert_if; 2037 } 2038 } 2039 else if (PL_regcomp_parse[0] >= '1' && PL_regcomp_parse[0] <= '9' ) { 2040 parno = atoi(PL_regcomp_parse++); 2041 2042 while (isDIGIT(*PL_regcomp_parse)) 2043 PL_regcomp_parse++; 2044 ret = reganode(GROUPP, parno); 2045 if ((c = *nextchar()) != ')') 2046 vFAIL("Switch condition not recognized"); 2047 insert_if: 2048 regtail(ret, reganode(IFTHEN, 0)); 2049 br = regbranch(&flags, 1); 2050 if (br == NULL) 2051 br = reganode(LONGJMP, 0); 2052 else 2053 regtail(br, reganode(LONGJMP, 0)); 2054 c = *nextchar(); 2055 if (flags&HASWIDTH) 2056 *flagp |= HASWIDTH; 2057 if (c == '|') { 2058 lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */ 2059 regbranch(&flags, 1); 2060 regtail(ret, lastbr); 2061 if (flags&HASWIDTH) 2062 *flagp |= HASWIDTH; 2063 c = *nextchar(); 2064 } 2065 else 2066 lastbr = NULL; 2067 if (c != ')') 2068 vFAIL("Switch (?(condition)... contains too many branches"); 2069 ender = reg_node(TAIL); 2070 regtail(br, ender); 2071 if (lastbr) { 2072 regtail(lastbr, ender); 2073 regtail(NEXTOPER(NEXTOPER(lastbr)), ender); 2074 } 2075 else 2076 regtail(ret, ender); 2077 return ret; 2078 } 2079 else { 2080 vFAIL2("Unknown switch condition (?(%.2s", PL_regcomp_parse); 2081 } 2082 } 2083 case 0: 2084 PL_regcomp_parse--; /* for vFAIL to print correctly */ 2085 vFAIL("Sequence (? incomplete"); 2086 break; 2087 default: 2088 --PL_regcomp_parse; 2089 parse_flags: 2090 while (*PL_regcomp_parse && strchr("iogcmsx", *PL_regcomp_parse)) { 2091 if (*PL_regcomp_parse != 'o') 2092 pmflag(flagsp, *PL_regcomp_parse); 2093 ++PL_regcomp_parse; 2094 } 2095 if (*PL_regcomp_parse == '-') { 2096 flagsp = &negflags; 2097 ++PL_regcomp_parse; 2098 goto parse_flags; 2099 } 2100 PL_regflags |= posflags; 2101 PL_regflags &= ~negflags; 2102 if (*PL_regcomp_parse == ':') { 2103 PL_regcomp_parse++; 2104 paren = ':'; 2105 break; 2106 } 2107 unknown: 2108 if (*PL_regcomp_parse != ')') { 2109 PL_regcomp_parse++; 2110 vFAIL3("Sequence (%.*s...) not recognized", PL_regcomp_parse-seqstart, seqstart); 2111 } 2112 nextchar(); 2113 *flagp = TRYAGAIN; 2114 return NULL; 2115 } 2116 } 2117 else { 2118 parno = PL_regnpar; 2119 PL_regnpar++; 2120 ret = reganode(OPEN, parno); 2121 open = 1; 2122 } 2123 } 2124 else 2125 ret = NULL; 2126 2127 /* Pick up the branches, linking them together. */ 2128 br = regbranch(&flags, 1); 2129 if (br == NULL) 2130 return(NULL); 2131 if (*PL_regcomp_parse == '|') { 2132 if (!SIZE_ONLY && PL_extralen) { 2133 reginsert(BRANCHJ, br); 2134 } 2135 else 2136 reginsert(BRANCH, br); 2137 have_branch = 1; 2138 if (SIZE_ONLY) 2139 PL_extralen += 1; /* For BRANCHJ-BRANCH. */ 2140 } 2141 else if (paren == ':') { 2142 *flagp |= flags&SIMPLE; 2143 } 2144 if (open) { /* Starts with OPEN. */ 2145 regtail(ret, br); /* OPEN -> first. */ 2146 } 2147 else if (paren != '?') /* Not Conditional */ 2148 ret = br; 2149 if (flags&HASWIDTH) 2150 *flagp |= HASWIDTH; 2151 *flagp |= flags&SPSTART; 2152 lastbr = br; 2153 while (*PL_regcomp_parse == '|') { 2154 if (!SIZE_ONLY && PL_extralen) { 2155 ender = reganode(LONGJMP,0); 2156 regtail(NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ 2157 } 2158 if (SIZE_ONLY) 2159 PL_extralen += 2; /* Account for LONGJMP. */ 2160 nextchar(); 2161 br = regbranch(&flags, 0); 2162 if (br == NULL) 2163 return(NULL); 2164 regtail(lastbr, br); /* BRANCH -> BRANCH. */ 2165 lastbr = br; 2166 if (flags&HASWIDTH) 2167 *flagp |= HASWIDTH; 2168 *flagp |= flags&SPSTART; 2169 } 2170 2171 if (have_branch || paren != ':') { 2172 /* Make a closing node, and hook it on the end. */ 2173 switch (paren) { 2174 case ':': 2175 ender = reg_node(TAIL); 2176 break; 2177 case 1: 2178 ender = reganode(CLOSE, parno); 2179 break; 2180 case '<': 2181 case ',': 2182 case '=': 2183 case '!': 2184 *flagp &= ~HASWIDTH; 2185 /* FALL THROUGH */ 2186 case '>': 2187 ender = reg_node(SUCCEED); 2188 break; 2189 case 0: 2190 ender = reg_node(END); 2191 break; 2192 } 2193 regtail(lastbr, ender); 2194 2195 if (have_branch) { 2196 /* Hook the tails of the branches to the closing node. */ 2197 for (br = ret; br != NULL; br = regnext(br)) { 2198 regoptail(br, ender); 2199 } 2200 } 2201 } 2202 2203 { 2204 char *p; 2205 static char parens[] = "=!<,>"; 2206 2207 if (paren && (p = strchr(parens, paren))) { 2208 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 2209 int flag = (p - parens) > 1; 2210 2211 if (paren == '>') 2212 node = SUSPEND, flag = 0; 2213 reginsert(node,ret); 2214 ret->flags = flag; 2215 regtail(ret, reg_node(TAIL)); 2216 } 2217 } 2218 2219 /* Check for proper termination. */ 2220 if (paren) { 2221 PL_regflags = oregflags; 2222 if (PL_regcomp_parse >= PL_regxend || *nextchar() != ')') { 2223 PL_regcomp_parse = oregcomp_parse; 2224 vFAIL("Unmatched ("); 2225 } 2226 } 2227 else if (!paren && PL_regcomp_parse < PL_regxend) { 2228 if (*PL_regcomp_parse == ')') { 2229 PL_regcomp_parse++; 2230 vFAIL("Unmatched )"); 2231 } 2232 else 2233 FAIL("Junk on end of regexp"); /* "Can't happen". */ 2234 /* NOTREACHED */ 2235 } 2236 2237 return(ret); 2238 } 2239 2240 /* 2241 - regbranch - one alternative of an | operator 2242 * 2243 * Implements the concatenation operator. 2244 */ 2245 STATIC regnode * 2246 S_regbranch(pTHX_ I32 *flagp, I32 first) 2247 { 2248 register regnode *ret; 2249 register regnode *chain = NULL; 2250 register regnode *latest; 2251 I32 flags = 0, c = 0; 2252 2253 if (first) 2254 ret = NULL; 2255 else { 2256 if (!SIZE_ONLY && PL_extralen) 2257 ret = reganode(BRANCHJ,0); 2258 else 2259 ret = reg_node(BRANCH); 2260 } 2261 2262 if (!first && SIZE_ONLY) 2263 PL_extralen += 1; /* BRANCHJ */ 2264 2265 *flagp = WORST; /* Tentatively. */ 2266 2267 PL_regcomp_parse--; 2268 nextchar(); 2269 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '|' && *PL_regcomp_parse != ')') { 2270 flags &= ~TRYAGAIN; 2271 latest = regpiece(&flags); 2272 if (latest == NULL) { 2273 if (flags & TRYAGAIN) 2274 continue; 2275 return(NULL); 2276 } 2277 else if (ret == NULL) 2278 ret = latest; 2279 *flagp |= flags&HASWIDTH; 2280 if (chain == NULL) /* First piece. */ 2281 *flagp |= flags&SPSTART; 2282 else { 2283 PL_regnaughty++; 2284 regtail(chain, latest); 2285 } 2286 chain = latest; 2287 c++; 2288 } 2289 if (chain == NULL) { /* Loop ran zero times. */ 2290 chain = reg_node(NOTHING); 2291 if (ret == NULL) 2292 ret = chain; 2293 } 2294 if (c == 1) { 2295 *flagp |= flags&SIMPLE; 2296 } 2297 2298 return(ret); 2299 } 2300 2301 /* 2302 - regpiece - something followed by possible [*+?] 2303 * 2304 * Note that the branching code sequences used for ? and the general cases 2305 * of * and + are somewhat optimized: they use the same NOTHING node as 2306 * both the endmarker for their branch list and the body of the last branch. 2307 * It might seem that this node could be dispensed with entirely, but the 2308 * endmarker role is not redundant. 2309 */ 2310 STATIC regnode * 2311 S_regpiece(pTHX_ I32 *flagp) 2312 { 2313 register regnode *ret; 2314 register char op; 2315 register char *next; 2316 I32 flags; 2317 char *origparse = PL_regcomp_parse; 2318 char *maxpos; 2319 I32 min; 2320 I32 max = REG_INFTY; 2321 2322 ret = regatom(&flags); 2323 if (ret == NULL) { 2324 if (flags & TRYAGAIN) 2325 *flagp |= TRYAGAIN; 2326 return(NULL); 2327 } 2328 2329 op = *PL_regcomp_parse; 2330 2331 if (op == '{' && regcurly(PL_regcomp_parse)) { 2332 next = PL_regcomp_parse + 1; 2333 maxpos = Nullch; 2334 while (isDIGIT(*next) || *next == ',') { 2335 if (*next == ',') { 2336 if (maxpos) 2337 break; 2338 else 2339 maxpos = next; 2340 } 2341 next++; 2342 } 2343 if (*next == '}') { /* got one */ 2344 if (!maxpos) 2345 maxpos = next; 2346 PL_regcomp_parse++; 2347 min = atoi(PL_regcomp_parse); 2348 if (*maxpos == ',') 2349 maxpos++; 2350 else 2351 maxpos = PL_regcomp_parse; 2352 max = atoi(maxpos); 2353 if (!max && *maxpos != '0') 2354 max = REG_INFTY; /* meaning "infinity" */ 2355 else if (max >= REG_INFTY) 2356 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 2357 PL_regcomp_parse = next; 2358 nextchar(); 2359 2360 do_curly: 2361 if ((flags&SIMPLE)) { 2362 PL_regnaughty += 2 + PL_regnaughty / 2; 2363 reginsert(CURLY, ret); 2364 } 2365 else { 2366 regnode *w = reg_node(WHILEM); 2367 2368 w->flags = 0; 2369 regtail(ret, w); 2370 if (!SIZE_ONLY && PL_extralen) { 2371 reginsert(LONGJMP,ret); 2372 reginsert(NOTHING,ret); 2373 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 2374 } 2375 reginsert(CURLYX,ret); 2376 if (!SIZE_ONLY && PL_extralen) 2377 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 2378 regtail(ret, reg_node(NOTHING)); 2379 if (SIZE_ONLY) 2380 PL_reg_whilem_seen++, PL_extralen += 3; 2381 PL_regnaughty += 4 + PL_regnaughty; /* compound interest */ 2382 } 2383 ret->flags = 0; 2384 2385 if (min > 0) 2386 *flagp = WORST; 2387 if (max > 0) 2388 *flagp |= HASWIDTH; 2389 if (max && max < min) 2390 vFAIL("Can't do {n,m} with n > m"); 2391 if (!SIZE_ONLY) { 2392 ARG1_SET(ret, min); 2393 ARG2_SET(ret, max); 2394 } 2395 2396 goto nest_check; 2397 } 2398 } 2399 2400 if (!ISMULT1(op)) { 2401 *flagp = flags; 2402 return(ret); 2403 } 2404 2405 #if 0 /* Now runtime fix should be reliable. */ 2406 2407 /* if this is reinstated, don't forget to put this back into perldiag: 2408 2409 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 2410 2411 (F) The part of the regexp subject to either the * or + quantifier 2412 could match an empty string. The {#} shows in the regular 2413 expression about where the problem was discovered. 2414 2415 */ 2416 2417 if (!(flags&HASWIDTH) && op != '?') 2418 vFAIL("Regexp *+ operand could be empty"); 2419 #endif 2420 2421 nextchar(); 2422 2423 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 2424 2425 if (op == '*' && (flags&SIMPLE)) { 2426 reginsert(STAR, ret); 2427 ret->flags = 0; 2428 PL_regnaughty += 4; 2429 } 2430 else if (op == '*') { 2431 min = 0; 2432 goto do_curly; 2433 } 2434 else if (op == '+' && (flags&SIMPLE)) { 2435 reginsert(PLUS, ret); 2436 ret->flags = 0; 2437 PL_regnaughty += 3; 2438 } 2439 else if (op == '+') { 2440 min = 1; 2441 goto do_curly; 2442 } 2443 else if (op == '?') { 2444 min = 0; max = 1; 2445 goto do_curly; 2446 } 2447 nest_check: 2448 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { 2449 vWARN3(PL_regcomp_parse, 2450 "%.*s matches null string many times", 2451 PL_regcomp_parse - origparse, 2452 origparse); 2453 } 2454 2455 if (*PL_regcomp_parse == '?') { 2456 nextchar(); 2457 reginsert(MINMOD, ret); 2458 regtail(ret, ret + NODE_STEP_REGNODE); 2459 } 2460 if (ISMULT2(PL_regcomp_parse)) { 2461 PL_regcomp_parse++; 2462 vFAIL("Nested quantifiers"); 2463 } 2464 2465 return(ret); 2466 } 2467 2468 /* 2469 - regatom - the lowest level 2470 * 2471 * Optimization: gobbles an entire sequence of ordinary characters so that 2472 * it can turn them into a single node, which is smaller to store and 2473 * faster to run. Backslashed characters are exceptions, each becoming a 2474 * separate node; the code is simpler that way and it's not worth fixing. 2475 * 2476 * [Yes, it is worth fixing, some scripts can run twice the speed.] */ 2477 STATIC regnode * 2478 S_regatom(pTHX_ I32 *flagp) 2479 { 2480 register regnode *ret = 0; 2481 I32 flags; 2482 2483 *flagp = WORST; /* Tentatively. */ 2484 2485 tryagain: 2486 switch (*PL_regcomp_parse) { 2487 case '^': 2488 PL_seen_zerolen++; 2489 nextchar(); 2490 if (PL_regflags & PMf_MULTILINE) 2491 ret = reg_node(MBOL); 2492 else if (PL_regflags & PMf_SINGLELINE) 2493 ret = reg_node(SBOL); 2494 else 2495 ret = reg_node(BOL); 2496 break; 2497 case '$': 2498 nextchar(); 2499 if (*PL_regcomp_parse) 2500 PL_seen_zerolen++; 2501 if (PL_regflags & PMf_MULTILINE) 2502 ret = reg_node(MEOL); 2503 else if (PL_regflags & PMf_SINGLELINE) 2504 ret = reg_node(SEOL); 2505 else 2506 ret = reg_node(EOL); 2507 break; 2508 case '.': 2509 nextchar(); 2510 if (UTF) { 2511 if (PL_regflags & PMf_SINGLELINE) 2512 ret = reg_node(SANYUTF8); 2513 else 2514 ret = reg_node(ANYUTF8); 2515 *flagp |= HASWIDTH; 2516 } 2517 else { 2518 if (PL_regflags & PMf_SINGLELINE) 2519 ret = reg_node(SANY); 2520 else 2521 ret = reg_node(REG_ANY); 2522 *flagp |= HASWIDTH|SIMPLE; 2523 } 2524 PL_regnaughty++; 2525 break; 2526 case '[': 2527 { 2528 char *oregcomp_parse = ++PL_regcomp_parse; 2529 ret = (UTF ? regclassutf8() : regclass()); 2530 if (*PL_regcomp_parse != ']') { 2531 PL_regcomp_parse = oregcomp_parse; 2532 vFAIL("Unmatched ["); 2533 } 2534 nextchar(); 2535 *flagp |= HASWIDTH|SIMPLE; 2536 break; 2537 } 2538 case '(': 2539 nextchar(); 2540 ret = reg(1, &flags); 2541 if (ret == NULL) { 2542 if (flags & TRYAGAIN) { 2543 if (PL_regcomp_parse == PL_regxend) { 2544 /* Make parent create an empty node if needed. */ 2545 *flagp |= TRYAGAIN; 2546 return(NULL); 2547 } 2548 goto tryagain; 2549 } 2550 return(NULL); 2551 } 2552 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); 2553 break; 2554 case '|': 2555 case ')': 2556 if (flags & TRYAGAIN) { 2557 *flagp |= TRYAGAIN; 2558 return NULL; 2559 } 2560 vFAIL("Internal urp"); 2561 /* Supposed to be caught earlier. */ 2562 break; 2563 case '{': 2564 if (!regcurly(PL_regcomp_parse)) { 2565 PL_regcomp_parse++; 2566 goto defchar; 2567 } 2568 /* FALL THROUGH */ 2569 case '?': 2570 case '+': 2571 case '*': 2572 PL_regcomp_parse++; 2573 vFAIL("Quantifier follows nothing"); 2574 break; 2575 case '\\': 2576 switch (*++PL_regcomp_parse) { 2577 case 'A': 2578 PL_seen_zerolen++; 2579 ret = reg_node(SBOL); 2580 *flagp |= SIMPLE; 2581 nextchar(); 2582 break; 2583 case 'G': 2584 ret = reg_node(GPOS); 2585 PL_regseen |= REG_SEEN_GPOS; 2586 *flagp |= SIMPLE; 2587 nextchar(); 2588 break; 2589 case 'Z': 2590 ret = reg_node(SEOL); 2591 *flagp |= SIMPLE; 2592 nextchar(); 2593 break; 2594 case 'z': 2595 ret = reg_node(EOS); 2596 *flagp |= SIMPLE; 2597 PL_seen_zerolen++; /* Do not optimize RE away */ 2598 nextchar(); 2599 break; 2600 case 'C': 2601 ret = reg_node(SANY); 2602 *flagp |= HASWIDTH|SIMPLE; 2603 nextchar(); 2604 break; 2605 case 'X': 2606 ret = reg_node(CLUMP); 2607 *flagp |= HASWIDTH; 2608 nextchar(); 2609 if (UTF && !PL_utf8_mark) 2610 is_utf8_mark((U8*)"~"); /* preload table */ 2611 break; 2612 case 'w': 2613 ret = reg_node( 2614 UTF 2615 ? (LOC ? ALNUMLUTF8 : ALNUMUTF8) 2616 : (LOC ? ALNUML : ALNUM)); 2617 *flagp |= HASWIDTH|SIMPLE; 2618 nextchar(); 2619 if (UTF && !PL_utf8_alnum) 2620 is_utf8_alnum((U8*)"a"); /* preload table */ 2621 break; 2622 case 'W': 2623 ret = reg_node( 2624 UTF 2625 ? (LOC ? NALNUMLUTF8 : NALNUMUTF8) 2626 : (LOC ? NALNUML : NALNUM)); 2627 *flagp |= HASWIDTH|SIMPLE; 2628 nextchar(); 2629 if (UTF && !PL_utf8_alnum) 2630 is_utf8_alnum((U8*)"a"); /* preload table */ 2631 break; 2632 case 'b': 2633 PL_seen_zerolen++; 2634 PL_regseen |= REG_SEEN_LOOKBEHIND; 2635 ret = reg_node( 2636 UTF 2637 ? (LOC ? BOUNDLUTF8 : BOUNDUTF8) 2638 : (LOC ? BOUNDL : BOUND)); 2639 *flagp |= SIMPLE; 2640 nextchar(); 2641 if (UTF && !PL_utf8_alnum) 2642 is_utf8_alnum((U8*)"a"); /* preload table */ 2643 break; 2644 case 'B': 2645 PL_seen_zerolen++; 2646 PL_regseen |= REG_SEEN_LOOKBEHIND; 2647 ret = reg_node( 2648 UTF 2649 ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8) 2650 : (LOC ? NBOUNDL : NBOUND)); 2651 *flagp |= SIMPLE; 2652 nextchar(); 2653 if (UTF && !PL_utf8_alnum) 2654 is_utf8_alnum((U8*)"a"); /* preload table */ 2655 break; 2656 case 's': 2657 ret = reg_node( 2658 UTF 2659 ? (LOC ? SPACELUTF8 : SPACEUTF8) 2660 : (LOC ? SPACEL : SPACE)); 2661 *flagp |= HASWIDTH|SIMPLE; 2662 nextchar(); 2663 if (UTF && !PL_utf8_space) 2664 is_utf8_space((U8*)" "); /* preload table */ 2665 break; 2666 case 'S': 2667 ret = reg_node( 2668 UTF 2669 ? (LOC ? NSPACELUTF8 : NSPACEUTF8) 2670 : (LOC ? NSPACEL : NSPACE)); 2671 *flagp |= HASWIDTH|SIMPLE; 2672 nextchar(); 2673 if (UTF && !PL_utf8_space) 2674 is_utf8_space((U8*)" "); /* preload table */ 2675 break; 2676 case 'd': 2677 ret = reg_node(UTF ? DIGITUTF8 : DIGIT); 2678 *flagp |= HASWIDTH|SIMPLE; 2679 nextchar(); 2680 if (UTF && !PL_utf8_digit) 2681 is_utf8_digit((U8*)"1"); /* preload table */ 2682 break; 2683 case 'D': 2684 ret = reg_node(UTF ? NDIGITUTF8 : NDIGIT); 2685 *flagp |= HASWIDTH|SIMPLE; 2686 nextchar(); 2687 if (UTF && !PL_utf8_digit) 2688 is_utf8_digit((U8*)"1"); /* preload table */ 2689 break; 2690 case 'p': 2691 case 'P': 2692 { /* a lovely hack--pretend we saw [\pX] instead */ 2693 char* oldregxend = PL_regxend; 2694 2695 if (PL_regcomp_parse[1] == '{') { 2696 PL_regxend = strchr(PL_regcomp_parse, '}'); 2697 if (!PL_regxend) { 2698 PL_regcomp_parse += 2; 2699 PL_regxend = oldregxend; 2700 vFAIL("Missing right brace on \\p{}"); 2701 } 2702 PL_regxend++; 2703 } 2704 else 2705 PL_regxend = PL_regcomp_parse + 2; 2706 PL_regcomp_parse--; 2707 2708 ret = regclassutf8(); 2709 2710 PL_regxend = oldregxend; 2711 PL_regcomp_parse--; 2712 nextchar(); 2713 *flagp |= HASWIDTH|SIMPLE; 2714 } 2715 break; 2716 case 'n': 2717 case 'r': 2718 case 't': 2719 case 'f': 2720 case 'e': 2721 case 'a': 2722 case 'x': 2723 case 'c': 2724 case '0': 2725 goto defchar; 2726 case '1': case '2': case '3': case '4': 2727 case '5': case '6': case '7': case '8': case '9': 2728 { 2729 I32 num = atoi(PL_regcomp_parse); 2730 2731 if (num > 9 && num >= PL_regnpar) 2732 goto defchar; 2733 else { 2734 while (isDIGIT(*PL_regcomp_parse)) 2735 PL_regcomp_parse++; 2736 2737 if (!SIZE_ONLY && num > PL_regcomp_rx->nparens) 2738 vFAIL("Reference to nonexistent group"); 2739 PL_regsawback = 1; 2740 ret = reganode(FOLD 2741 ? (LOC ? REFFL : REFF) 2742 : REF, num); 2743 *flagp |= HASWIDTH; 2744 PL_regcomp_parse--; 2745 nextchar(); 2746 } 2747 } 2748 break; 2749 case '\0': 2750 if (PL_regcomp_parse >= PL_regxend) 2751 FAIL("Trailing \\"); 2752 /* FALL THROUGH */ 2753 default: 2754 /* Do not generate `unrecognized' warnings here, we fall 2755 back into the quick-grab loop below */ 2756 goto defchar; 2757 } 2758 break; 2759 2760 case '#': 2761 if (PL_regflags & PMf_EXTENDED) { 2762 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '\n') PL_regcomp_parse++; 2763 if (PL_regcomp_parse < PL_regxend) 2764 goto tryagain; 2765 } 2766 /* FALL THROUGH */ 2767 2768 default: { 2769 register STRLEN len; 2770 register UV ender; 2771 register char *p; 2772 char *oldp, *s; 2773 STRLEN numlen; 2774 2775 PL_regcomp_parse++; 2776 2777 defchar: 2778 ret = reg_node(FOLD 2779 ? (LOC ? EXACTFL : EXACTF) 2780 : EXACT); 2781 s = STRING(ret); 2782 for (len = 0, p = PL_regcomp_parse - 1; 2783 len < 127 && p < PL_regxend; 2784 len++) 2785 { 2786 oldp = p; 2787 2788 if (PL_regflags & PMf_EXTENDED) 2789 p = regwhite(p, PL_regxend); 2790 switch (*p) { 2791 case '^': 2792 case '$': 2793 case '.': 2794 case '[': 2795 case '(': 2796 case ')': 2797 case '|': 2798 goto loopdone; 2799 case '\\': 2800 switch (*++p) { 2801 case 'A': 2802 case 'G': 2803 case 'Z': 2804 case 'z': 2805 case 'w': 2806 case 'W': 2807 case 'b': 2808 case 'B': 2809 case 's': 2810 case 'S': 2811 case 'd': 2812 case 'D': 2813 case 'p': 2814 case 'P': 2815 --p; 2816 goto loopdone; 2817 case 'n': 2818 ender = '\n'; 2819 p++; 2820 break; 2821 case 'r': 2822 ender = '\r'; 2823 p++; 2824 break; 2825 case 't': 2826 ender = '\t'; 2827 p++; 2828 break; 2829 case 'f': 2830 ender = '\f'; 2831 p++; 2832 break; 2833 case 'e': 2834 #ifdef ASCIIish 2835 ender = '\033'; 2836 #else 2837 ender = '\047'; 2838 #endif 2839 p++; 2840 break; 2841 case 'a': 2842 #ifdef ASCIIish 2843 ender = '\007'; 2844 #else 2845 ender = '\057'; 2846 #endif 2847 p++; 2848 break; 2849 case 'x': 2850 if (*++p == '{') { 2851 char* e = strchr(p, '}'); 2852 2853 if (!e) { 2854 PL_regcomp_parse = p + 1; 2855 vFAIL("Missing right brace on \\x{}"); 2856 } 2857 else { 2858 numlen = 1; /* allow underscores */ 2859 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen); 2860 /* numlen is generous */ 2861 if (numlen + len >= 127) { 2862 p--; 2863 goto loopdone; 2864 } 2865 p = e + 1; 2866 } 2867 } 2868 else { 2869 numlen = 0; /* disallow underscores */ 2870 ender = (UV)scan_hex(p, 2, &numlen); 2871 p += numlen; 2872 } 2873 break; 2874 case 'c': 2875 p++; 2876 ender = UCHARAT(p++); 2877 ender = toCTRL(ender); 2878 break; 2879 case '0': case '1': case '2': case '3':case '4': 2880 case '5': case '6': case '7': case '8':case '9': 2881 if (*p == '0' || 2882 (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) { 2883 numlen = 0; /* disallow underscores */ 2884 ender = (UV)scan_oct(p, 3, &numlen); 2885 p += numlen; 2886 } 2887 else { 2888 --p; 2889 goto loopdone; 2890 } 2891 break; 2892 case '\0': 2893 if (p >= PL_regxend) 2894 FAIL("Trailing \\"); 2895 /* FALL THROUGH */ 2896 default: 2897 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) 2898 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); 2899 goto normal_default; 2900 } 2901 break; 2902 default: 2903 normal_default: 2904 if (UTF8_IS_START(*p) && UTF) { 2905 ender = utf8_to_uv((U8*)p, PL_regxend - p, 2906 &numlen, 0); 2907 p += numlen; 2908 } 2909 else 2910 ender = *p++; 2911 break; 2912 } 2913 if (PL_regflags & PMf_EXTENDED) 2914 p = regwhite(p, PL_regxend); 2915 if (UTF && FOLD) { 2916 if (LOC) 2917 ender = toLOWER_LC_uni(ender); 2918 else 2919 ender = toLOWER_uni(ender); 2920 } 2921 if (ISMULT2(p)) { /* Back off on ?+*. */ 2922 if (len) 2923 p = oldp; 2924 /* ender is a Unicode value so it can be > 0xff -- 2925 * in other words, do not use UTF8_IS_CONTINUED(). */ 2926 else if (ender >= 0x80 && UTF) { 2927 reguni(ender, s, &numlen); 2928 s += numlen; 2929 len += numlen; 2930 } 2931 else { 2932 len++; 2933 REGC(ender, s++); 2934 } 2935 break; 2936 } 2937 /* ender is a Unicode value so it can be > 0xff -- 2938 * in other words, do not use UTF8_IS_CONTINUED(). */ 2939 if (ender >= 0x80 && UTF) { 2940 reguni(ender, s, &numlen); 2941 s += numlen; 2942 len += numlen - 1; 2943 } 2944 else 2945 REGC(ender, s++); 2946 } 2947 loopdone: 2948 PL_regcomp_parse = p - 1; 2949 nextchar(); 2950 { 2951 /* len is STRLEN which is unsigned, need to copy to signed */ 2952 IV iv = len; 2953 if (iv < 0) 2954 vFAIL("Internal disaster"); 2955 } 2956 if (len > 0) 2957 *flagp |= HASWIDTH; 2958 if (len == 1) 2959 *flagp |= SIMPLE; 2960 if (!SIZE_ONLY) 2961 STR_LEN(ret) = len; 2962 if (SIZE_ONLY) 2963 PL_regsize += STR_SZ(len); 2964 else 2965 PL_regcode += STR_SZ(len); 2966 } 2967 break; 2968 } 2969 2970 return(ret); 2971 } 2972 2973 STATIC char * 2974 S_regwhite(pTHX_ char *p, char *e) 2975 { 2976 while (p < e) { 2977 if (isSPACE(*p)) 2978 ++p; 2979 else if (*p == '#') { 2980 do { 2981 p++; 2982 } while (p < e && *p != '\n'); 2983 } 2984 else 2985 break; 2986 } 2987 return p; 2988 } 2989 2990 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 2991 Character classes ([:foo:]) can also be negated ([:^foo:]). 2992 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 2993 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 2994 but trigger warnings because they are currently unimplemented. */ 2995 STATIC I32 2996 S_regpposixcc(pTHX_ I32 value) 2997 { 2998 char *posixcc = 0; 2999 I32 namedclass = OOB_NAMEDCLASS; 3000 3001 if (value == '[' && PL_regcomp_parse + 1 < PL_regxend && 3002 /* I smell either [: or [= or [. -- POSIX has been here, right? */ 3003 (*PL_regcomp_parse == ':' || 3004 *PL_regcomp_parse == '=' || 3005 *PL_regcomp_parse == '.')) { 3006 char c = *PL_regcomp_parse; 3007 char* s = PL_regcomp_parse++; 3008 3009 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != c) 3010 PL_regcomp_parse++; 3011 if (PL_regcomp_parse == PL_regxend) 3012 /* Grandfather lone [:, [=, [. */ 3013 PL_regcomp_parse = s; 3014 else { 3015 char* t = PL_regcomp_parse++; /* skip over the c */ 3016 3017 if (*PL_regcomp_parse == ']') { 3018 PL_regcomp_parse++; /* skip over the ending ] */ 3019 posixcc = s + 1; 3020 if (*s == ':') { 3021 I32 complement = *posixcc == '^' ? *posixcc++ : 0; 3022 I32 skip = 5; /* the most common skip */ 3023 3024 switch (*posixcc) { 3025 case 'a': 3026 if (strnEQ(posixcc, "alnum", 5)) 3027 namedclass = 3028 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; 3029 else if (strnEQ(posixcc, "alpha", 5)) 3030 namedclass = 3031 complement ? ANYOF_NALPHA : ANYOF_ALPHA; 3032 else if (strnEQ(posixcc, "ascii", 5)) 3033 namedclass = 3034 complement ? ANYOF_NASCII : ANYOF_ASCII; 3035 break; 3036 case 'b': 3037 if (strnEQ(posixcc, "blank", 5)) 3038 namedclass = 3039 complement ? ANYOF_NBLANK : ANYOF_BLANK; 3040 break; 3041 case 'c': 3042 if (strnEQ(posixcc, "cntrl", 5)) 3043 namedclass = 3044 complement ? ANYOF_NCNTRL : ANYOF_CNTRL; 3045 break; 3046 case 'd': 3047 if (strnEQ(posixcc, "digit", 5)) 3048 namedclass = 3049 complement ? ANYOF_NDIGIT : ANYOF_DIGIT; 3050 break; 3051 case 'g': 3052 if (strnEQ(posixcc, "graph", 5)) 3053 namedclass = 3054 complement ? ANYOF_NGRAPH : ANYOF_GRAPH; 3055 break; 3056 case 'l': 3057 if (strnEQ(posixcc, "lower", 5)) 3058 namedclass = 3059 complement ? ANYOF_NLOWER : ANYOF_LOWER; 3060 break; 3061 case 'p': 3062 if (strnEQ(posixcc, "print", 5)) 3063 namedclass = 3064 complement ? ANYOF_NPRINT : ANYOF_PRINT; 3065 else if (strnEQ(posixcc, "punct", 5)) 3066 namedclass = 3067 complement ? ANYOF_NPUNCT : ANYOF_PUNCT; 3068 break; 3069 case 's': 3070 if (strnEQ(posixcc, "space", 5)) 3071 namedclass = 3072 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; 3073 break; 3074 case 'u': 3075 if (strnEQ(posixcc, "upper", 5)) 3076 namedclass = 3077 complement ? ANYOF_NUPPER : ANYOF_UPPER; 3078 break; 3079 case 'w': /* this is not POSIX, this is the Perl \w */ 3080 if (strnEQ(posixcc, "word", 4)) { 3081 namedclass = 3082 complement ? ANYOF_NALNUM : ANYOF_ALNUM; 3083 skip = 4; 3084 } 3085 break; 3086 case 'x': 3087 if (strnEQ(posixcc, "xdigit", 6)) { 3088 namedclass = 3089 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; 3090 skip = 6; 3091 } 3092 break; 3093 } 3094 if (namedclass == OOB_NAMEDCLASS || 3095 posixcc[skip] != ':' || 3096 posixcc[skip+1] != ']') 3097 { 3098 Simple_vFAIL3("POSIX class [:%.*s:] unknown", 3099 t - s - 1, s + 1); 3100 } 3101 } else if (!SIZE_ONLY) { 3102 /* [[=foo=]] and [[.foo.]] are still future. */ 3103 3104 /* adjust PL_regcomp_parse so the warning shows after 3105 the class closes */ 3106 while (*PL_regcomp_parse && *PL_regcomp_parse != ']') 3107 PL_regcomp_parse++; 3108 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 3109 } 3110 } else { 3111 /* Maternal grandfather: 3112 * "[:" ending in ":" but not in ":]" */ 3113 PL_regcomp_parse = s; 3114 } 3115 } 3116 } 3117 3118 return namedclass; 3119 } 3120 3121 STATIC void 3122 S_checkposixcc(pTHX) 3123 { 3124 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && 3125 (*PL_regcomp_parse == ':' || 3126 *PL_regcomp_parse == '=' || 3127 *PL_regcomp_parse == '.')) { 3128 char *s = PL_regcomp_parse; 3129 char c = *s++; 3130 3131 while(*s && isALNUM(*s)) 3132 s++; 3133 if (*s && c == *s && s[1] == ']') { 3134 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); 3135 3136 /* [[=foo=]] and [[.foo.]] are still future. */ 3137 if (c == '=' || c == '.') 3138 { 3139 /* adjust PL_regcomp_parse so the error shows after 3140 the class closes */ 3141 while (*PL_regcomp_parse && *PL_regcomp_parse++ != ']') 3142 ; 3143 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 3144 } 3145 } 3146 } 3147 } 3148 3149 STATIC regnode * 3150 S_regclass(pTHX) 3151 { 3152 register U32 value; 3153 register I32 lastvalue = OOB_CHAR8; 3154 register I32 range = 0; 3155 register regnode *ret; 3156 STRLEN numlen; 3157 I32 namedclass; 3158 char *rangebegin; 3159 bool need_class = 0; 3160 3161 ret = reg_node(ANYOF); 3162 if (SIZE_ONLY) 3163 PL_regsize += ANYOF_SKIP; 3164 else { 3165 ret->flags = 0; 3166 ANYOF_BITMAP_ZERO(ret); 3167 PL_regcode += ANYOF_SKIP; 3168 if (FOLD) 3169 ANYOF_FLAGS(ret) |= ANYOF_FOLD; 3170 if (LOC) 3171 ANYOF_FLAGS(ret) |= ANYOF_LOCALE; 3172 } 3173 if (*PL_regcomp_parse == '^') { /* Complement of range. */ 3174 PL_regnaughty++; 3175 PL_regcomp_parse++; 3176 if (!SIZE_ONLY) 3177 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 3178 } 3179 3180 if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) 3181 checkposixcc(); 3182 3183 if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') 3184 goto skipcond; /* allow 1st char to be ] or - */ 3185 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { 3186 skipcond: 3187 namedclass = OOB_NAMEDCLASS; 3188 if (!range) 3189 rangebegin = PL_regcomp_parse; 3190 value = UCHARAT(PL_regcomp_parse++); 3191 if (value == '[') 3192 namedclass = regpposixcc(value); 3193 else if (value == '\\') { 3194 value = UCHARAT(PL_regcomp_parse++); 3195 /* Some compilers cannot handle switching on 64-bit integer 3196 * values, therefore the 'value' cannot be an UV. --jhi */ 3197 switch (value) { 3198 case 'w': namedclass = ANYOF_ALNUM; break; 3199 case 'W': namedclass = ANYOF_NALNUM; break; 3200 case 's': namedclass = ANYOF_SPACE; break; 3201 case 'S': namedclass = ANYOF_NSPACE; break; 3202 case 'd': namedclass = ANYOF_DIGIT; break; 3203 case 'D': namedclass = ANYOF_NDIGIT; break; 3204 case 'n': value = '\n'; break; 3205 case 'r': value = '\r'; break; 3206 case 't': value = '\t'; break; 3207 case 'f': value = '\f'; break; 3208 case 'b': value = '\b'; break; 3209 #ifdef ASCIIish 3210 case 'e': value = '\033'; break; 3211 case 'a': value = '\007'; break; 3212 #else 3213 case 'e': value = '\047'; break; 3214 case 'a': value = '\057'; break; 3215 #endif 3216 case 'x': 3217 numlen = 0; /* disallow underscores */ 3218 value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); 3219 PL_regcomp_parse += numlen; 3220 break; 3221 case 'c': 3222 value = UCHARAT(PL_regcomp_parse++); 3223 value = toCTRL(value); 3224 break; 3225 case '0': case '1': case '2': case '3': case '4': 3226 case '5': case '6': case '7': case '8': case '9': 3227 numlen = 0; /* disallow underscores */ 3228 value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); 3229 PL_regcomp_parse += numlen; 3230 break; 3231 default: 3232 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) 3233 3234 vWARN2(PL_regcomp_parse, "Unrecognized escape \\%c in character class passed through", (int)value); 3235 break; 3236 } 3237 } 3238 if (namedclass > OOB_NAMEDCLASS) { 3239 if (!need_class && !SIZE_ONLY) 3240 ANYOF_CLASS_ZERO(ret); 3241 need_class = 1; 3242 if (range) { /* a-\d, a-[:digit:] */ 3243 if (!SIZE_ONLY) { 3244 if (ckWARN(WARN_REGEXP)) 3245 vWARN4(PL_regcomp_parse, 3246 "False [] range \"%*.*s\"", 3247 PL_regcomp_parse - rangebegin, 3248 PL_regcomp_parse - rangebegin, 3249 rangebegin); 3250 ANYOF_BITMAP_SET(ret, lastvalue); 3251 ANYOF_BITMAP_SET(ret, '-'); 3252 } 3253 range = 0; /* this is not a true range */ 3254 } 3255 if (!SIZE_ONLY) { 3256 switch (namedclass) { 3257 case ANYOF_ALNUM: 3258 if (LOC) 3259 ANYOF_CLASS_SET(ret, ANYOF_ALNUM); 3260 else { 3261 for (value = 0; value < 256; value++) 3262 if (isALNUM(value)) 3263 ANYOF_BITMAP_SET(ret, value); 3264 } 3265 break; 3266 case ANYOF_NALNUM: 3267 if (LOC) 3268 ANYOF_CLASS_SET(ret, ANYOF_NALNUM); 3269 else { 3270 for (value = 0; value < 256; value++) 3271 if (!isALNUM(value)) 3272 ANYOF_BITMAP_SET(ret, value); 3273 } 3274 break; 3275 case ANYOF_SPACE: 3276 if (LOC) 3277 ANYOF_CLASS_SET(ret, ANYOF_SPACE); 3278 else { 3279 for (value = 0; value < 256; value++) 3280 if (isSPACE(value)) 3281 ANYOF_BITMAP_SET(ret, value); 3282 } 3283 break; 3284 case ANYOF_NSPACE: 3285 if (LOC) 3286 ANYOF_CLASS_SET(ret, ANYOF_NSPACE); 3287 else { 3288 for (value = 0; value < 256; value++) 3289 if (!isSPACE(value)) 3290 ANYOF_BITMAP_SET(ret, value); 3291 } 3292 break; 3293 case ANYOF_DIGIT: 3294 if (LOC) 3295 ANYOF_CLASS_SET(ret, ANYOF_DIGIT); 3296 else { 3297 for (value = '0'; value <= '9'; value++) 3298 ANYOF_BITMAP_SET(ret, value); 3299 } 3300 break; 3301 case ANYOF_NDIGIT: 3302 if (LOC) 3303 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); 3304 else { 3305 for (value = 0; value < '0'; value++) 3306 ANYOF_BITMAP_SET(ret, value); 3307 for (value = '9' + 1; value < 256; value++) 3308 ANYOF_BITMAP_SET(ret, value); 3309 } 3310 break; 3311 case ANYOF_NALNUMC: 3312 if (LOC) 3313 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC); 3314 else { 3315 for (value = 0; value < 256; value++) 3316 if (!isALNUMC(value)) 3317 ANYOF_BITMAP_SET(ret, value); 3318 } 3319 break; 3320 case ANYOF_ALNUMC: 3321 if (LOC) 3322 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); 3323 else { 3324 for (value = 0; value < 256; value++) 3325 if (isALNUMC(value)) 3326 ANYOF_BITMAP_SET(ret, value); 3327 } 3328 break; 3329 case ANYOF_ALPHA: 3330 if (LOC) 3331 ANYOF_CLASS_SET(ret, ANYOF_ALPHA); 3332 else { 3333 for (value = 0; value < 256; value++) 3334 if (isALPHA(value)) 3335 ANYOF_BITMAP_SET(ret, value); 3336 } 3337 break; 3338 case ANYOF_NALPHA: 3339 if (LOC) 3340 ANYOF_CLASS_SET(ret, ANYOF_NALPHA); 3341 else { 3342 for (value = 0; value < 256; value++) 3343 if (!isALPHA(value)) 3344 ANYOF_BITMAP_SET(ret, value); 3345 } 3346 break; 3347 case ANYOF_ASCII: 3348 if (LOC) 3349 ANYOF_CLASS_SET(ret, ANYOF_ASCII); 3350 else { 3351 #ifdef ASCIIish 3352 for (value = 0; value < 128; value++) 3353 ANYOF_BITMAP_SET(ret, value); 3354 #else /* EBCDIC */ 3355 for (value = 0; value < 256; value++) 3356 if (isASCII(value)) 3357 ANYOF_BITMAP_SET(ret, value); 3358 #endif /* EBCDIC */ 3359 } 3360 break; 3361 case ANYOF_NASCII: 3362 if (LOC) 3363 ANYOF_CLASS_SET(ret, ANYOF_NASCII); 3364 else { 3365 #ifdef ASCIIish 3366 for (value = 128; value < 256; value++) 3367 ANYOF_BITMAP_SET(ret, value); 3368 #else /* EBCDIC */ 3369 for (value = 0; value < 256; value++) 3370 if (!isASCII(value)) 3371 ANYOF_BITMAP_SET(ret, value); 3372 #endif /* EBCDIC */ 3373 } 3374 break; 3375 case ANYOF_BLANK: 3376 if (LOC) 3377 ANYOF_CLASS_SET(ret, ANYOF_BLANK); 3378 else { 3379 for (value = 0; value < 256; value++) 3380 if (isBLANK(value)) 3381 ANYOF_BITMAP_SET(ret, value); 3382 } 3383 break; 3384 case ANYOF_NBLANK: 3385 if (LOC) 3386 ANYOF_CLASS_SET(ret, ANYOF_NBLANK); 3387 else { 3388 for (value = 0; value < 256; value++) 3389 if (!isBLANK(value)) 3390 ANYOF_BITMAP_SET(ret, value); 3391 } 3392 break; 3393 case ANYOF_CNTRL: 3394 if (LOC) 3395 ANYOF_CLASS_SET(ret, ANYOF_CNTRL); 3396 else { 3397 for (value = 0; value < 256; value++) 3398 if (isCNTRL(value)) 3399 ANYOF_BITMAP_SET(ret, value); 3400 } 3401 lastvalue = OOB_CHAR8; 3402 break; 3403 case ANYOF_NCNTRL: 3404 if (LOC) 3405 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL); 3406 else { 3407 for (value = 0; value < 256; value++) 3408 if (!isCNTRL(value)) 3409 ANYOF_BITMAP_SET(ret, value); 3410 } 3411 break; 3412 case ANYOF_GRAPH: 3413 if (LOC) 3414 ANYOF_CLASS_SET(ret, ANYOF_GRAPH); 3415 else { 3416 for (value = 0; value < 256; value++) 3417 if (isGRAPH(value)) 3418 ANYOF_BITMAP_SET(ret, value); 3419 } 3420 break; 3421 case ANYOF_NGRAPH: 3422 if (LOC) 3423 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH); 3424 else { 3425 for (value = 0; value < 256; value++) 3426 if (!isGRAPH(value)) 3427 ANYOF_BITMAP_SET(ret, value); 3428 } 3429 break; 3430 case ANYOF_LOWER: 3431 if (LOC) 3432 ANYOF_CLASS_SET(ret, ANYOF_LOWER); 3433 else { 3434 for (value = 0; value < 256; value++) 3435 if (isLOWER(value)) 3436 ANYOF_BITMAP_SET(ret, value); 3437 } 3438 break; 3439 case ANYOF_NLOWER: 3440 if (LOC) 3441 ANYOF_CLASS_SET(ret, ANYOF_NLOWER); 3442 else { 3443 for (value = 0; value < 256; value++) 3444 if (!isLOWER(value)) 3445 ANYOF_BITMAP_SET(ret, value); 3446 } 3447 break; 3448 case ANYOF_PRINT: 3449 if (LOC) 3450 ANYOF_CLASS_SET(ret, ANYOF_PRINT); 3451 else { 3452 for (value = 0; value < 256; value++) 3453 if (isPRINT(value)) 3454 ANYOF_BITMAP_SET(ret, value); 3455 } 3456 break; 3457 case ANYOF_NPRINT: 3458 if (LOC) 3459 ANYOF_CLASS_SET(ret, ANYOF_NPRINT); 3460 else { 3461 for (value = 0; value < 256; value++) 3462 if (!isPRINT(value)) 3463 ANYOF_BITMAP_SET(ret, value); 3464 } 3465 break; 3466 case ANYOF_PSXSPC: 3467 if (LOC) 3468 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); 3469 else { 3470 for (value = 0; value < 256; value++) 3471 if (isPSXSPC(value)) 3472 ANYOF_BITMAP_SET(ret, value); 3473 } 3474 break; 3475 case ANYOF_NPSXSPC: 3476 if (LOC) 3477 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); 3478 else { 3479 for (value = 0; value < 256; value++) 3480 if (!isPSXSPC(value)) 3481 ANYOF_BITMAP_SET(ret, value); 3482 } 3483 break; 3484 case ANYOF_PUNCT: 3485 if (LOC) 3486 ANYOF_CLASS_SET(ret, ANYOF_PUNCT); 3487 else { 3488 for (value = 0; value < 256; value++) 3489 if (isPUNCT(value)) 3490 ANYOF_BITMAP_SET(ret, value); 3491 } 3492 break; 3493 case ANYOF_NPUNCT: 3494 if (LOC) 3495 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT); 3496 else { 3497 for (value = 0; value < 256; value++) 3498 if (!isPUNCT(value)) 3499 ANYOF_BITMAP_SET(ret, value); 3500 } 3501 break; 3502 case ANYOF_UPPER: 3503 if (LOC) 3504 ANYOF_CLASS_SET(ret, ANYOF_UPPER); 3505 else { 3506 for (value = 0; value < 256; value++) 3507 if (isUPPER(value)) 3508 ANYOF_BITMAP_SET(ret, value); 3509 } 3510 break; 3511 case ANYOF_NUPPER: 3512 if (LOC) 3513 ANYOF_CLASS_SET(ret, ANYOF_NUPPER); 3514 else { 3515 for (value = 0; value < 256; value++) 3516 if (!isUPPER(value)) 3517 ANYOF_BITMAP_SET(ret, value); 3518 } 3519 break; 3520 case ANYOF_XDIGIT: 3521 if (LOC) 3522 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT); 3523 else { 3524 for (value = 0; value < 256; value++) 3525 if (isXDIGIT(value)) 3526 ANYOF_BITMAP_SET(ret, value); 3527 } 3528 break; 3529 case ANYOF_NXDIGIT: 3530 if (LOC) 3531 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT); 3532 else { 3533 for (value = 0; value < 256; value++) 3534 if (!isXDIGIT(value)) 3535 ANYOF_BITMAP_SET(ret, value); 3536 } 3537 break; 3538 default: 3539 vFAIL("Invalid [::] class"); 3540 break; 3541 } 3542 if (LOC) 3543 ANYOF_FLAGS(ret) |= ANYOF_CLASS; 3544 continue; 3545 } 3546 } 3547 if (range) { 3548 if (lastvalue > value) /* b-a */ { 3549 Simple_vFAIL4("Invalid [] range \"%*.*s\"", 3550 PL_regcomp_parse - rangebegin, 3551 PL_regcomp_parse - rangebegin, 3552 rangebegin); 3553 } 3554 range = 0; 3555 } 3556 else { 3557 lastvalue = value; 3558 if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && 3559 PL_regcomp_parse[1] != ']') { 3560 PL_regcomp_parse++; 3561 if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ 3562 if (ckWARN(WARN_REGEXP)) 3563 vWARN4(PL_regcomp_parse, 3564 "False [] range \"%*.*s\"", 3565 PL_regcomp_parse - rangebegin, 3566 PL_regcomp_parse - rangebegin, 3567 rangebegin); 3568 if (!SIZE_ONLY) 3569 ANYOF_BITMAP_SET(ret, '-'); 3570 } else 3571 range = 1; 3572 continue; /* do it next time */ 3573 } 3574 } 3575 /* now is the next time */ 3576 if (!SIZE_ONLY) { 3577 #ifndef ASCIIish /* EBCDIC, for example. */ 3578 if ((isLOWER(lastvalue) && isLOWER(value)) || 3579 (isUPPER(lastvalue) && isUPPER(value))) 3580 { 3581 I32 i; 3582 if (isLOWER(lastvalue)) { 3583 for (i = lastvalue; i <= value; i++) 3584 if (isLOWER(i)) 3585 ANYOF_BITMAP_SET(ret, i); 3586 } else { 3587 for (i = lastvalue; i <= value; i++) 3588 if (isUPPER(i)) 3589 ANYOF_BITMAP_SET(ret, i); 3590 } 3591 } 3592 else 3593 #endif 3594 for ( ; lastvalue <= value; lastvalue++) 3595 ANYOF_BITMAP_SET(ret, lastvalue); 3596 } 3597 range = 0; 3598 } 3599 if (need_class) { 3600 if (SIZE_ONLY) 3601 PL_regsize += ANYOF_CLASS_ADD_SKIP; 3602 else 3603 PL_regcode += ANYOF_CLASS_ADD_SKIP; 3604 } 3605 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ 3606 if (!SIZE_ONLY && 3607 (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) { 3608 for (value = 0; value < 256; ++value) { 3609 if (ANYOF_BITMAP_TEST(ret, value)) { 3610 I32 cf = PL_fold[value]; 3611 ANYOF_BITMAP_SET(ret, cf); 3612 } 3613 } 3614 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; 3615 } 3616 /* optimize inverted simple patterns (e.g. [^a-z]) */ 3617 if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { 3618 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) 3619 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; 3620 ANYOF_FLAGS(ret) = 0; 3621 } 3622 return ret; 3623 } 3624 3625 STATIC regnode * 3626 S_regclassutf8(pTHX) 3627 { 3628 register char *e; 3629 register U32 value; 3630 register U32 lastvalue = OOB_UTF8; 3631 register I32 range = 0; 3632 register regnode *ret; 3633 STRLEN numlen; 3634 I32 n; 3635 SV *listsv; 3636 U8 flags = 0; 3637 I32 namedclass; 3638 char *rangebegin; 3639 3640 if (*PL_regcomp_parse == '^') { /* Complement of range. */ 3641 PL_regnaughty++; 3642 PL_regcomp_parse++; 3643 if (!SIZE_ONLY) 3644 flags |= ANYOF_INVERT; 3645 } 3646 if (!SIZE_ONLY) { 3647 if (FOLD) 3648 flags |= ANYOF_FOLD; 3649 if (LOC) 3650 flags |= ANYOF_LOCALE; 3651 listsv = newSVpvn("# comment\n", 10); 3652 } 3653 3654 if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) 3655 checkposixcc(); 3656 3657 if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') 3658 goto skipcond; /* allow 1st char to be ] or - */ 3659 3660 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { 3661 skipcond: 3662 namedclass = OOB_NAMEDCLASS; 3663 if (!range) 3664 rangebegin = PL_regcomp_parse; 3665 value = utf8_to_uv((U8*)PL_regcomp_parse, 3666 PL_regxend - PL_regcomp_parse, 3667 &numlen, 0); 3668 PL_regcomp_parse += numlen; 3669 if (value == '[') 3670 namedclass = regpposixcc(value); 3671 else if (value == '\\') { 3672 value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, 3673 PL_regxend - PL_regcomp_parse, 3674 &numlen, 0); 3675 PL_regcomp_parse += numlen; 3676 /* Some compilers cannot handle switching on 64-bit integer 3677 * values, therefore value cannot be an UV. Yes, this will 3678 * be a problem later if we want switch on Unicode. --jhi */ 3679 switch (value) { 3680 case 'w': namedclass = ANYOF_ALNUM; break; 3681 case 'W': namedclass = ANYOF_NALNUM; break; 3682 case 's': namedclass = ANYOF_SPACE; break; 3683 case 'S': namedclass = ANYOF_NSPACE; break; 3684 case 'd': namedclass = ANYOF_DIGIT; break; 3685 case 'D': namedclass = ANYOF_NDIGIT; break; 3686 case 'p': 3687 case 'P': 3688 if (*PL_regcomp_parse == '{') { 3689 e = strchr(PL_regcomp_parse++, '}'); 3690 if (!e) 3691 vFAIL("Missing right brace on \\p{}"); 3692 n = e - PL_regcomp_parse; 3693 } 3694 else { 3695 e = PL_regcomp_parse; 3696 n = 1; 3697 } 3698 if (!SIZE_ONLY) { 3699 if (value == 'p') 3700 Perl_sv_catpvf(aTHX_ listsv, 3701 "+utf8::%.*s\n", (int)n, PL_regcomp_parse); 3702 else 3703 Perl_sv_catpvf(aTHX_ listsv, 3704 "!utf8::%.*s\n", (int)n, PL_regcomp_parse); 3705 } 3706 PL_regcomp_parse = e + 1; 3707 lastvalue = OOB_UTF8; 3708 continue; 3709 case 'n': value = '\n'; break; 3710 case 'r': value = '\r'; break; 3711 case 't': value = '\t'; break; 3712 case 'f': value = '\f'; break; 3713 case 'b': value = '\b'; break; 3714 #ifdef ASCIIish 3715 case 'e': value = '\033'; break; 3716 case 'a': value = '\007'; break; 3717 #else 3718 case 'e': value = '\047'; break; 3719 case 'a': value = '\057'; break; 3720 #endif 3721 case 'x': 3722 if (*PL_regcomp_parse == '{') { 3723 e = strchr(PL_regcomp_parse++, '}'); 3724 if (!e) 3725 vFAIL("Missing right brace on \\x{}"); 3726 numlen = 1; /* allow underscores */ 3727 value = (UV)scan_hex(PL_regcomp_parse, 3728 e - PL_regcomp_parse, 3729 &numlen); 3730 PL_regcomp_parse = e + 1; 3731 } 3732 else { 3733 numlen = 0; /* disallow underscores */ 3734 value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); 3735 PL_regcomp_parse += numlen; 3736 } 3737 break; 3738 case 'c': 3739 value = UCHARAT(PL_regcomp_parse++); 3740 value = toCTRL(value); 3741 break; 3742 case '0': case '1': case '2': case '3': case '4': 3743 case '5': case '6': case '7': case '8': case '9': 3744 numlen = 0; /* disallow underscores */ 3745 value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); 3746 PL_regcomp_parse += numlen; 3747 break; 3748 default: 3749 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) 3750 vWARN2(PL_regcomp_parse, 3751 "Unrecognized escape \\%c in character class passed through", 3752 (int)value); 3753 break; 3754 } 3755 } 3756 if (namedclass > OOB_NAMEDCLASS) { 3757 if (range) { /* a-\d, a-[:digit:] */ 3758 if (!SIZE_ONLY) { 3759 if (ckWARN(WARN_REGEXP)) 3760 vWARN4(PL_regcomp_parse, 3761 "False [] range \"%*.*s\"", 3762 PL_regcomp_parse - rangebegin, 3763 PL_regcomp_parse - rangebegin, 3764 rangebegin); 3765 Perl_sv_catpvf(aTHX_ listsv, 3766 /* 0x002D is Unicode for '-' */ 3767 "%04"UVxf"\n002D\n", (UV)lastvalue); 3768 } 3769 range = 0; 3770 } 3771 if (!SIZE_ONLY) { 3772 switch (namedclass) { 3773 case ANYOF_ALNUM: 3774 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; 3775 case ANYOF_NALNUM: 3776 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; 3777 case ANYOF_ALNUMC: 3778 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; 3779 case ANYOF_NALNUMC: 3780 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; 3781 case ANYOF_ALPHA: 3782 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; 3783 case ANYOF_NALPHA: 3784 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; 3785 case ANYOF_ASCII: 3786 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; 3787 case ANYOF_NASCII: 3788 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; 3789 case ANYOF_CNTRL: 3790 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; 3791 case ANYOF_NCNTRL: 3792 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; 3793 case ANYOF_GRAPH: 3794 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; 3795 case ANYOF_NGRAPH: 3796 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; 3797 case ANYOF_DIGIT: 3798 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; 3799 case ANYOF_NDIGIT: 3800 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; 3801 case ANYOF_LOWER: 3802 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; 3803 case ANYOF_NLOWER: 3804 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; 3805 case ANYOF_PRINT: 3806 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; 3807 case ANYOF_NPRINT: 3808 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; 3809 case ANYOF_PUNCT: 3810 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; 3811 case ANYOF_NPUNCT: 3812 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; 3813 case ANYOF_SPACE: 3814 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break; 3815 case ANYOF_NSPACE: 3816 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break; 3817 case ANYOF_BLANK: 3818 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; 3819 case ANYOF_NBLANK: 3820 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; 3821 case ANYOF_PSXSPC: 3822 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; 3823 case ANYOF_NPSXSPC: 3824 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; 3825 case ANYOF_UPPER: 3826 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; 3827 case ANYOF_NUPPER: 3828 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; 3829 case ANYOF_XDIGIT: 3830 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; 3831 case ANYOF_NXDIGIT: 3832 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; 3833 } 3834 continue; 3835 } 3836 } 3837 if (range) { 3838 if (lastvalue > value) { /* b-a */ 3839 Simple_vFAIL4("Invalid [] range \"%*.*s\"", 3840 PL_regcomp_parse - rangebegin, 3841 PL_regcomp_parse - rangebegin, 3842 rangebegin); 3843 } 3844 range = 0; 3845 } 3846 else { 3847 lastvalue = value; 3848 if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && 3849 PL_regcomp_parse[1] != ']') { 3850 PL_regcomp_parse++; 3851 if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ 3852 if (ckWARN(WARN_REGEXP)) 3853 vWARN4(PL_regcomp_parse, 3854 "False [] range \"%*.*s\"", 3855 PL_regcomp_parse - rangebegin, 3856 PL_regcomp_parse - rangebegin, 3857 rangebegin); 3858 if (!SIZE_ONLY) 3859 Perl_sv_catpvf(aTHX_ listsv, 3860 /* 0x002D is Unicode for '-' */ 3861 "002D\n"); 3862 } else 3863 range = 1; 3864 continue; /* do it next time */ 3865 } 3866 } 3867 /* now is the next time */ 3868 if (!SIZE_ONLY) 3869 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", 3870 (UV)lastvalue, (UV)value); 3871 range = 0; 3872 } 3873 3874 ret = reganode(ANYOFUTF8, 0); 3875 3876 if (!SIZE_ONLY) { 3877 SV *rv = swash_init("utf8", "", listsv, 1, 0); 3878 #ifdef DEBUGGING 3879 AV *av = newAV(); 3880 av_push(av, rv); 3881 av_push(av, listsv); 3882 rv = newRV_noinc((SV*)av); 3883 #else 3884 SvREFCNT_dec(listsv); 3885 #endif 3886 n = add_data(1,"s"); 3887 PL_regcomp_rx->data->data[n] = (void*)rv; 3888 ARG1_SET(ret, flags); 3889 ARG2_SET(ret, n); 3890 } 3891 3892 return ret; 3893 } 3894 3895 STATIC char* 3896 S_nextchar(pTHX) 3897 { 3898 char* retval = PL_regcomp_parse++; 3899 3900 for (;;) { 3901 if (*PL_regcomp_parse == '(' && PL_regcomp_parse[1] == '?' && 3902 PL_regcomp_parse[2] == '#') { 3903 while (*PL_regcomp_parse && *PL_regcomp_parse != ')') 3904 PL_regcomp_parse++; 3905 PL_regcomp_parse++; 3906 continue; 3907 } 3908 if (PL_regflags & PMf_EXTENDED) { 3909 if (isSPACE(*PL_regcomp_parse)) { 3910 PL_regcomp_parse++; 3911 continue; 3912 } 3913 else if (*PL_regcomp_parse == '#') { 3914 while (*PL_regcomp_parse && *PL_regcomp_parse != '\n') 3915 PL_regcomp_parse++; 3916 PL_regcomp_parse++; 3917 continue; 3918 } 3919 } 3920 return retval; 3921 } 3922 } 3923 3924 /* 3925 - reg_node - emit a node 3926 */ 3927 STATIC regnode * /* Location. */ 3928 S_reg_node(pTHX_ U8 op) 3929 { 3930 register regnode *ret; 3931 register regnode *ptr; 3932 3933 ret = PL_regcode; 3934 if (SIZE_ONLY) { 3935 SIZE_ALIGN(PL_regsize); 3936 PL_regsize += 1; 3937 return(ret); 3938 } 3939 3940 NODE_ALIGN_FILL(ret); 3941 ptr = ret; 3942 FILL_ADVANCE_NODE(ptr, op); 3943 PL_regcode = ptr; 3944 3945 return(ret); 3946 } 3947 3948 /* 3949 - reganode - emit a node with an argument 3950 */ 3951 STATIC regnode * /* Location. */ 3952 S_reganode(pTHX_ U8 op, U32 arg) 3953 { 3954 register regnode *ret; 3955 register regnode *ptr; 3956 3957 ret = PL_regcode; 3958 if (SIZE_ONLY) { 3959 SIZE_ALIGN(PL_regsize); 3960 PL_regsize += 2; 3961 return(ret); 3962 } 3963 3964 NODE_ALIGN_FILL(ret); 3965 ptr = ret; 3966 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 3967 PL_regcode = ptr; 3968 3969 return(ret); 3970 } 3971 3972 /* 3973 - reguni - emit (if appropriate) a Unicode character 3974 */ 3975 STATIC void 3976 S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp) 3977 { 3978 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s); 3979 } 3980 3981 /* 3982 - reginsert - insert an operator in front of already-emitted operand 3983 * 3984 * Means relocating the operand. 3985 */ 3986 STATIC void 3987 S_reginsert(pTHX_ U8 op, regnode *opnd) 3988 { 3989 register regnode *src; 3990 register regnode *dst; 3991 register regnode *place; 3992 register int offset = regarglen[(U8)op]; 3993 3994 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 3995 3996 if (SIZE_ONLY) { 3997 PL_regsize += NODE_STEP_REGNODE + offset; 3998 return; 3999 } 4000 4001 src = PL_regcode; 4002 PL_regcode += NODE_STEP_REGNODE + offset; 4003 dst = PL_regcode; 4004 while (src > opnd) 4005 StructCopy(--src, --dst, regnode); 4006 4007 place = opnd; /* Op node, where operand used to be. */ 4008 src = NEXTOPER(place); 4009 FILL_ADVANCE_NODE(place, op); 4010 Zero(src, offset, regnode); 4011 } 4012 4013 /* 4014 - regtail - set the next-pointer at the end of a node chain of p to val. 4015 */ 4016 STATIC void 4017 S_regtail(pTHX_ regnode *p, regnode *val) 4018 { 4019 register regnode *scan; 4020 register regnode *temp; 4021 4022 if (SIZE_ONLY) 4023 return; 4024 4025 /* Find last node. */ 4026 scan = p; 4027 for (;;) { 4028 temp = regnext(scan); 4029 if (temp == NULL) 4030 break; 4031 scan = temp; 4032 } 4033 4034 if (reg_off_by_arg[OP(scan)]) { 4035 ARG_SET(scan, val - scan); 4036 } 4037 else { 4038 NEXT_OFF(scan) = val - scan; 4039 } 4040 } 4041 4042 /* 4043 - regoptail - regtail on operand of first argument; nop if operandless 4044 */ 4045 STATIC void 4046 S_regoptail(pTHX_ regnode *p, regnode *val) 4047 { 4048 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 4049 if (p == NULL || SIZE_ONLY) 4050 return; 4051 if (PL_regkind[(U8)OP(p)] == BRANCH) { 4052 regtail(NEXTOPER(p), val); 4053 } 4054 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) { 4055 regtail(NEXTOPER(NEXTOPER(p)), val); 4056 } 4057 else 4058 return; 4059 } 4060 4061 /* 4062 - regcurly - a little FSA that accepts {\d+,?\d*} 4063 */ 4064 STATIC I32 4065 S_regcurly(pTHX_ register char *s) 4066 { 4067 if (*s++ != '{') 4068 return FALSE; 4069 if (!isDIGIT(*s)) 4070 return FALSE; 4071 while (isDIGIT(*s)) 4072 s++; 4073 if (*s == ',') 4074 s++; 4075 while (isDIGIT(*s)) 4076 s++; 4077 if (*s != '}') 4078 return FALSE; 4079 return TRUE; 4080 } 4081 4082 4083 STATIC regnode * 4084 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) 4085 { 4086 #ifdef DEBUGGING 4087 register U8 op = EXACT; /* Arbitrary non-END op. */ 4088 register regnode *next; 4089 4090 while (op != END && (!last || node < last)) { 4091 /* While that wasn't END last time... */ 4092 4093 NODE_ALIGN(node); 4094 op = OP(node); 4095 if (op == CLOSE) 4096 l--; 4097 next = regnext(node); 4098 /* Where, what. */ 4099 if (OP(node) == OPTIMIZED) 4100 goto after_print; 4101 regprop(sv, node); 4102 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), 4103 (int)(2*l + 1), "", SvPVX(sv)); 4104 if (next == NULL) /* Next ptr. */ 4105 PerlIO_printf(Perl_debug_log, "(0)"); 4106 else 4107 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start)); 4108 (void)PerlIO_putc(Perl_debug_log, '\n'); 4109 after_print: 4110 if (PL_regkind[(U8)op] == BRANCHJ) { 4111 register regnode *nnode = (OP(next) == LONGJMP 4112 ? regnext(next) 4113 : next); 4114 if (last && nnode > last) 4115 nnode = last; 4116 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1); 4117 } 4118 else if (PL_regkind[(U8)op] == BRANCH) { 4119 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1); 4120 } 4121 else if ( op == CURLY) { /* `next' might be very big: optimizer */ 4122 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, 4123 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1); 4124 } 4125 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 4126 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, 4127 next, sv, l + 1); 4128 } 4129 else if ( op == PLUS || op == STAR) { 4130 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1); 4131 } 4132 else if (op == ANYOF) { 4133 node = NEXTOPER(node); 4134 node += ANYOF_SKIP; 4135 } 4136 else if (PL_regkind[(U8)op] == EXACT) { 4137 /* Literal string, where present. */ 4138 node += NODE_SZ_STR(node) - 1; 4139 node = NEXTOPER(node); 4140 } 4141 else { 4142 node = NEXTOPER(node); 4143 node += regarglen[(U8)op]; 4144 } 4145 if (op == CURLYX || op == OPEN) 4146 l++; 4147 else if (op == WHILEM) 4148 l--; 4149 } 4150 #endif /* DEBUGGING */ 4151 return node; 4152 } 4153 4154 /* 4155 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 4156 */ 4157 void 4158 Perl_regdump(pTHX_ regexp *r) 4159 { 4160 #ifdef DEBUGGING 4161 SV *sv = sv_newmortal(); 4162 4163 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); 4164 4165 /* Header fields of interest. */ 4166 if (r->anchored_substr) 4167 PerlIO_printf(Perl_debug_log, 4168 "anchored `%s%.*s%s'%s at %"IVdf" ", 4169 PL_colors[0], 4170 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)), 4171 SvPVX(r->anchored_substr), 4172 PL_colors[1], 4173 SvTAIL(r->anchored_substr) ? "$" : "", 4174 (IV)r->anchored_offset); 4175 if (r->float_substr) 4176 PerlIO_printf(Perl_debug_log, 4177 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", 4178 PL_colors[0], 4179 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)), 4180 SvPVX(r->float_substr), 4181 PL_colors[1], 4182 SvTAIL(r->float_substr) ? "$" : "", 4183 (IV)r->float_min_offset, (UV)r->float_max_offset); 4184 if (r->check_substr) 4185 PerlIO_printf(Perl_debug_log, 4186 r->check_substr == r->float_substr 4187 ? "(checking floating" : "(checking anchored"); 4188 if (r->reganch & ROPT_NOSCAN) 4189 PerlIO_printf(Perl_debug_log, " noscan"); 4190 if (r->reganch & ROPT_CHECK_ALL) 4191 PerlIO_printf(Perl_debug_log, " isall"); 4192 if (r->check_substr) 4193 PerlIO_printf(Perl_debug_log, ") "); 4194 4195 if (r->regstclass) { 4196 regprop(sv, r->regstclass); 4197 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv)); 4198 } 4199 if (r->reganch & ROPT_ANCH) { 4200 PerlIO_printf(Perl_debug_log, "anchored"); 4201 if (r->reganch & ROPT_ANCH_BOL) 4202 PerlIO_printf(Perl_debug_log, "(BOL)"); 4203 if (r->reganch & ROPT_ANCH_MBOL) 4204 PerlIO_printf(Perl_debug_log, "(MBOL)"); 4205 if (r->reganch & ROPT_ANCH_SBOL) 4206 PerlIO_printf(Perl_debug_log, "(SBOL)"); 4207 if (r->reganch & ROPT_ANCH_GPOS) 4208 PerlIO_printf(Perl_debug_log, "(GPOS)"); 4209 PerlIO_putc(Perl_debug_log, ' '); 4210 } 4211 if (r->reganch & ROPT_GPOS_SEEN) 4212 PerlIO_printf(Perl_debug_log, "GPOS "); 4213 if (r->reganch & ROPT_SKIP) 4214 PerlIO_printf(Perl_debug_log, "plus "); 4215 if (r->reganch & ROPT_IMPLICIT) 4216 PerlIO_printf(Perl_debug_log, "implicit "); 4217 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen); 4218 if (r->reganch & ROPT_EVAL_SEEN) 4219 PerlIO_printf(Perl_debug_log, "with eval "); 4220 PerlIO_printf(Perl_debug_log, "\n"); 4221 #endif /* DEBUGGING */ 4222 } 4223 4224 STATIC void 4225 S_put_byte(pTHX_ SV *sv, int c) 4226 { 4227 if (isCNTRL(c) || c == 127 || c == 255) 4228 Perl_sv_catpvf(aTHX_ sv, "\\%o", c); 4229 else if (c == '-' || c == ']' || c == '\\' || c == '^') 4230 Perl_sv_catpvf(aTHX_ sv, "\\%c", c); 4231 else 4232 Perl_sv_catpvf(aTHX_ sv, "%c", c); 4233 } 4234 4235 /* 4236 - regprop - printable representation of opcode 4237 */ 4238 void 4239 Perl_regprop(pTHX_ SV *sv, regnode *o) 4240 { 4241 #ifdef DEBUGGING 4242 register int k; 4243 4244 sv_setpvn(sv, "", 0); 4245 if (OP(o) >= reg_num) /* regnode.type is unsigned */ 4246 FAIL("Corrupted regexp opcode"); 4247 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */ 4248 4249 k = PL_regkind[(U8)OP(o)]; 4250 4251 if (k == EXACT) 4252 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0], 4253 STR_LEN(o), STRING(o), PL_colors[1]); 4254 else if (k == CURLY) { 4255 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 4256 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 4257 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); 4258 } 4259 else if (k == WHILEM && o->flags) /* Ordinal/of */ 4260 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 4261 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) 4262 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ 4263 else if (k == LOGICAL) 4264 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ 4265 else if (k == ANYOF) { 4266 int i, rangestart = -1; 4267 bool anyofutf8 = OP(o) == ANYOFUTF8; 4268 U8 flags = anyofutf8 ? ARG1(o) : o->flags; 4269 const char * const anyofs[] = { /* Should be syncronized with 4270 * ANYOF_ #xdefines in regcomp.h */ 4271 "\\w", 4272 "\\W", 4273 "\\s", 4274 "\\S", 4275 "\\d", 4276 "\\D", 4277 "[:alnum:]", 4278 "[:^alnum:]", 4279 "[:alpha:]", 4280 "[:^alpha:]", 4281 "[:ascii:]", 4282 "[:^ascii:]", 4283 "[:ctrl:]", 4284 "[:^ctrl:]", 4285 "[:graph:]", 4286 "[:^graph:]", 4287 "[:lower:]", 4288 "[:^lower:]", 4289 "[:print:]", 4290 "[:^print:]", 4291 "[:punct:]", 4292 "[:^punct:]", 4293 "[:upper:]", 4294 "[:^upper:]", 4295 "[:xdigit:]", 4296 "[:^xdigit:]", 4297 "[:space:]", 4298 "[:^space:]", 4299 "[:blank:]", 4300 "[:^blank:]" 4301 }; 4302 4303 if (flags & ANYOF_LOCALE) 4304 sv_catpv(sv, "{loc}"); 4305 if (flags & ANYOF_FOLD) 4306 sv_catpv(sv, "{i}"); 4307 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 4308 if (flags & ANYOF_INVERT) 4309 sv_catpv(sv, "^"); 4310 if (OP(o) == ANYOF) { 4311 for (i = 0; i <= 256; i++) { 4312 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { 4313 if (rangestart == -1) 4314 rangestart = i; 4315 } else if (rangestart != -1) { 4316 if (i <= rangestart + 3) 4317 for (; rangestart < i; rangestart++) 4318 put_byte(sv, rangestart); 4319 else { 4320 put_byte(sv, rangestart); 4321 sv_catpv(sv, "-"); 4322 put_byte(sv, i - 1); 4323 } 4324 rangestart = -1; 4325 } 4326 } 4327 if (o->flags & ANYOF_CLASS) 4328 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) 4329 if (ANYOF_CLASS_TEST(o,i)) 4330 sv_catpv(sv, anyofs[i]); 4331 } 4332 else { 4333 SV *rv = (SV*)PL_regdata->data[ARG2(o)]; 4334 AV *av = (AV*)SvRV((SV*)rv); 4335 SV *sw = *av_fetch(av, 0, FALSE); 4336 SV *lv = *av_fetch(av, 1, FALSE); 4337 UV i; 4338 U8 s[UTF8_MAXLEN+1]; 4339 for (i = 0; i <= 256; i++) { /* just the first 256 */ 4340 U8 *e = uv_to_utf8(s, i); 4341 if (i < 256 && swash_fetch(sw, s)) { 4342 if (rangestart == -1) 4343 rangestart = i; 4344 } else if (rangestart != -1) { 4345 U8 *p; 4346 4347 if (i <= rangestart + 3) 4348 for (; rangestart < i; rangestart++) { 4349 for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) 4350 put_byte(sv, *p); 4351 } 4352 else { 4353 for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) 4354 put_byte(sv, *p); 4355 sv_catpv(sv, "-"); 4356 for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) 4357 put_byte(sv, *p); 4358 } 4359 rangestart = -1; 4360 } 4361 } 4362 sv_catpv(sv, "..."); 4363 { 4364 char *s = savepv(SvPVX(lv)); 4365 4366 while(*s && *s != '\n') s++; 4367 if (*s == '\n') { 4368 char *t = ++s; 4369 4370 while (*s) { 4371 if (*s == '\n') 4372 *s = ' '; 4373 s++; 4374 } 4375 if (s[-1] == ' ') 4376 s[-1] = 0; 4377 4378 sv_catpv(sv, t); 4379 } 4380 } 4381 } 4382 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 4383 } 4384 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 4385 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags); 4386 #endif /* DEBUGGING */ 4387 } 4388 4389 SV * 4390 Perl_re_intuit_string(pTHX_ regexp *prog) 4391 { /* Assume that RE_INTUIT is set */ 4392 DEBUG_r( 4393 { STRLEN n_a; 4394 char *s = SvPV(prog->check_substr,n_a); 4395 4396 if (!PL_colorset) reginitcolors(); 4397 PerlIO_printf(Perl_debug_log, 4398 "%sUsing REx substr:%s `%s%.60s%s%s'\n", 4399 PL_colors[4],PL_colors[5],PL_colors[0], 4400 s, 4401 PL_colors[1], 4402 (strlen(s) > 60 ? "..." : "")); 4403 } ); 4404 4405 return prog->check_substr; 4406 } 4407 4408 void 4409 Perl_pregfree(pTHX_ struct regexp *r) 4410 { 4411 DEBUG_r(if (!PL_colorset) reginitcolors()); 4412 4413 if (!r || (--r->refcnt > 0)) 4414 return; 4415 DEBUG_r(PerlIO_printf(Perl_debug_log, 4416 "%sFreeing REx:%s `%s%.60s%s%s'\n", 4417 PL_colors[4],PL_colors[5],PL_colors[0], 4418 r->precomp, 4419 PL_colors[1], 4420 (strlen(r->precomp) > 60 ? "..." : ""))); 4421 4422 if (r->precomp) 4423 Safefree(r->precomp); 4424 if (RX_MATCH_COPIED(r)) 4425 Safefree(r->subbeg); 4426 if (r->substrs) { 4427 if (r->anchored_substr) 4428 SvREFCNT_dec(r->anchored_substr); 4429 if (r->float_substr) 4430 SvREFCNT_dec(r->float_substr); 4431 Safefree(r->substrs); 4432 } 4433 if (r->data) { 4434 int n = r->data->count; 4435 AV* new_comppad = NULL; 4436 AV* old_comppad; 4437 SV** old_curpad; 4438 4439 while (--n >= 0) { 4440 switch (r->data->what[n]) { 4441 case 's': 4442 SvREFCNT_dec((SV*)r->data->data[n]); 4443 break; 4444 case 'f': 4445 Safefree(r->data->data[n]); 4446 break; 4447 case 'p': 4448 new_comppad = (AV*)r->data->data[n]; 4449 break; 4450 case 'o': 4451 if (new_comppad == NULL) 4452 Perl_croak(aTHX_ "panic: pregfree comppad"); 4453 old_comppad = PL_comppad; 4454 old_curpad = PL_curpad; 4455 /* Watch out for global destruction's random ordering. */ 4456 if (SvTYPE(new_comppad) == SVt_PVAV) { 4457 PL_comppad = new_comppad; 4458 PL_curpad = AvARRAY(new_comppad); 4459 } 4460 else 4461 PL_curpad = NULL; 4462 op_free((OP_4tree*)r->data->data[n]); 4463 PL_comppad = old_comppad; 4464 PL_curpad = old_curpad; 4465 SvREFCNT_dec((SV*)new_comppad); 4466 new_comppad = NULL; 4467 break; 4468 case 'n': 4469 break; 4470 default: 4471 FAIL2("panic: regfree data code '%c'", r->data->what[n]); 4472 } 4473 } 4474 Safefree(r->data->what); 4475 Safefree(r->data); 4476 } 4477 Safefree(r->startp); 4478 Safefree(r->endp); 4479 Safefree(r); 4480 } 4481 4482 /* 4483 - regnext - dig the "next" pointer out of a node 4484 * 4485 * [Note, when REGALIGN is defined there are two places in regmatch() 4486 * that bypass this code for speed.] 4487 */ 4488 regnode * 4489 Perl_regnext(pTHX_ register regnode *p) 4490 { 4491 register I32 offset; 4492 4493 if (p == &PL_regdummy) 4494 return(NULL); 4495 4496 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 4497 if (offset == 0) 4498 return(NULL); 4499 4500 return(p+offset); 4501 } 4502 4503 STATIC void 4504 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) 4505 { 4506 va_list args; 4507 STRLEN l1 = strlen(pat1); 4508 STRLEN l2 = strlen(pat2); 4509 char buf[512]; 4510 SV *msv; 4511 char *message; 4512 4513 if (l1 > 510) 4514 l1 = 510; 4515 if (l1 + l2 > 510) 4516 l2 = 510 - l1; 4517 Copy(pat1, buf, l1 , char); 4518 Copy(pat2, buf + l1, l2 , char); 4519 buf[l1 + l2] = '\n'; 4520 buf[l1 + l2 + 1] = '\0'; 4521 #ifdef I_STDARG 4522 /* ANSI variant takes additional second argument */ 4523 va_start(args, pat2); 4524 #else 4525 va_start(args); 4526 #endif 4527 msv = vmess(buf, &args); 4528 va_end(args); 4529 message = SvPV(msv,l1); 4530 if (l1 > 512) 4531 l1 = 512; 4532 Copy(message, buf, l1 , char); 4533 buf[l1] = '\0'; /* Overwrite \n */ 4534 Perl_croak(aTHX_ "%s", buf); 4535 } 4536 4537 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 4538 4539 void 4540 Perl_save_re_context(pTHX) 4541 { 4542 SAVEPPTR(PL_bostr); 4543 SAVEPPTR(PL_regprecomp); /* uncompiled string. */ 4544 SAVEI32(PL_regnpar); /* () count. */ 4545 SAVEI32(PL_regsize); /* Code size. */ 4546 SAVEI16(PL_regflags); /* are we folding, multilining? */ 4547 SAVEPPTR(PL_reginput); /* String-input pointer. */ 4548 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */ 4549 SAVEPPTR(PL_regeol); /* End of input, for $ check. */ 4550 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */ 4551 SAVEVPTR(PL_regendp); /* Ditto for endp. */ 4552 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ 4553 SAVEPPTR(PL_regtill); /* How far we are required to go. */ 4554 SAVEI8(PL_regprev); /* char before regbol, \n if none */ 4555 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */ 4556 PL_reg_start_tmp = 0; 4557 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */ 4558 PL_reg_start_tmpl = 0; 4559 SAVEVPTR(PL_regdata); 4560 SAVEI32(PL_reg_flags); /* from regexec.c */ 4561 SAVEI32(PL_reg_eval_set); /* from regexec.c */ 4562 SAVEI32(PL_regnarrate); /* from regexec.c */ 4563 SAVEVPTR(PL_regprogram); /* from regexec.c */ 4564 SAVEINT(PL_regindent); /* from regexec.c */ 4565 SAVEVPTR(PL_regcc); /* from regexec.c */ 4566 SAVEVPTR(PL_curcop); 4567 SAVEVPTR(PL_regcomp_rx); /* from regcomp.c */ 4568 SAVEI32(PL_regseen); /* from regcomp.c */ 4569 SAVEI32(PL_regsawback); /* Did we see \1, ...? */ 4570 SAVEI32(PL_regnaughty); /* How bad is this pattern? */ 4571 SAVEVPTR(PL_regcode); /* Code-emit pointer; ®dummy = don't */ 4572 SAVEPPTR(PL_regxend); /* End of input for compile */ 4573 SAVEPPTR(PL_regcomp_parse); /* Input-scan pointer. */ 4574 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */ 4575 SAVEVPTR(PL_reg_re); /* from regexec.c */ 4576 SAVEPPTR(PL_reg_ganch); /* from regexec.c */ 4577 SAVESPTR(PL_reg_sv); /* from regexec.c */ 4578 SAVEVPTR(PL_reg_magic); /* from regexec.c */ 4579 SAVEI32(PL_reg_oldpos); /* from regexec.c */ 4580 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ 4581 SAVEVPTR(PL_reg_curpm); /* from regexec.c */ 4582 SAVEI32(PL_regnpar); /* () count. */ 4583 #ifdef DEBUGGING 4584 SAVEPPTR(PL_reg_starttry); /* from regexec.c */ 4585 #endif 4586 } 4587 4588 #ifdef PERL_OBJECT 4589 #include "XSUB.h" 4590 #undef this 4591 #define this pPerl 4592 #endif 4593 4594 static void 4595 clear_re(pTHXo_ void *r) 4596 { 4597 ReREFCNT_dec((regexp *)r); 4598 } 4599 4600