1 #ifdef PERL_EXT_RE_BUILD 2 #include "re_top.h" 3 #endif 4 5 #include "EXTERN.h" 6 #define PERL_IN_REGEX_ENGINE 7 #define PERL_IN_REGCOMP_ANY 8 #define PERL_IN_REGCOMP_STUDY_C 9 #include "perl.h" 10 11 #ifdef PERL_IN_XSUB_RE 12 # include "re_comp.h" 13 #else 14 # include "regcomp.h" 15 #endif 16 17 #include "invlist_inline.h" 18 #include "unicode_constants.h" 19 #include "regcomp_internal.h" 20 21 #define INIT_AND_WITHP \ 22 assert(!and_withp); \ 23 Newx(and_withp, 1, regnode_ssc); \ 24 SAVEFREEPV(and_withp) 25 26 27 STATIC void 28 S_unwind_scan_frames(pTHX_ const void *p) 29 { 30 PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES; 31 scan_frame *f= (scan_frame *)p; 32 do { 33 scan_frame *n= f->next_frame; 34 Safefree(f); 35 f= n; 36 } while (f); 37 } 38 39 /* Follow the next-chain of the current node and optimize away 40 all the NOTHINGs from it. 41 */ 42 STATIC void 43 S_rck_elide_nothing(pTHX_ regnode *node) 44 { 45 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING; 46 47 if (OP(node) != CURLYX) { 48 const int max = (REGNODE_OFF_BY_ARG(OP(node)) 49 ? I32_MAX 50 /* I32 may be smaller than U16 on CRAYs! */ 51 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 52 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG1u(node) : NEXT_OFF(node)); 53 int noff; 54 regnode *n = node; 55 56 /* Skip NOTHING and LONGJMP. */ 57 while ( 58 (n = regnext(n)) 59 && ( 60 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n))) 61 || ((OP(n) == LONGJMP) && (noff = ARG1u(n))) 62 ) 63 && off + noff < max 64 ) { 65 off += noff; 66 } 67 if (REGNODE_OFF_BY_ARG(OP(node))) 68 ARG1u(node) = off; 69 else 70 NEXT_OFF(node) = off; 71 } 72 return; 73 } 74 75 76 /* 77 * As best we can, determine the characters that can match the start of 78 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there 79 * can be false positive matches 80 * 81 * Returns the invlist as a new SV*; it is the caller's responsibility to 82 * call SvREFCNT_dec() when done with it. 83 */ 84 STATIC SV* 85 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) 86 { 87 const U8 * s = (U8*)STRING(node); 88 SSize_t bytelen = STR_LEN(node); 89 UV uc; 90 /* Start out big enough for 2 separate code points */ 91 SV* invlist = _new_invlist(4); 92 93 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST; 94 95 if (! UTF) { 96 uc = *s; 97 98 /* We punt and assume can match anything if the node begins 99 * with a multi-character fold. Things are complicated. For 100 * example, /ffi/i could match any of: 101 * "\N{LATIN SMALL LIGATURE FFI}" 102 * "\N{LATIN SMALL LIGATURE FF}I" 103 * "F\N{LATIN SMALL LIGATURE FI}" 104 * plus several other things; and making sure we have all the 105 * possibilities is hard. */ 106 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { 107 invlist = _add_range_to_invlist(invlist, 0, UV_MAX); 108 } 109 else { 110 /* Any Latin1 range character can potentially match any 111 * other depending on the locale, and in Turkic locales, 'I' and 112 * 'i' can match U+130 and U+131 */ 113 if (OP(node) == EXACTFL) { 114 _invlist_union(invlist, PL_Latin1, &invlist); 115 if (isALPHA_FOLD_EQ(uc, 'I')) { 116 invlist = add_cp_to_invlist(invlist, 117 LATIN_SMALL_LETTER_DOTLESS_I); 118 invlist = add_cp_to_invlist(invlist, 119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 120 } 121 } 122 else { 123 /* But otherwise, it matches at least itself. We can 124 * quickly tell if it has a distinct fold, and if so, 125 * it matches that as well */ 126 invlist = add_cp_to_invlist(invlist, uc); 127 if (IS_IN_SOME_FOLD_L1(uc)) 128 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); 129 } 130 131 /* Some characters match above-Latin1 ones under /i. This 132 * is true of EXACTFL ones when the locale is UTF-8 */ 133 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) 134 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA, 135 EXACTFAA_NO_TRIE))) 136 { 137 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); 138 } 139 } 140 } 141 else { /* Pattern is UTF-8 */ 142 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; 143 const U8* e = s + bytelen; 144 IV fc; 145 146 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); 147 148 /* The only code points that aren't folded in a UTF EXACTFish 149 * node are the problematic ones in EXACTFL nodes */ 150 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { 151 /* We need to check for the possibility that this EXACTFL 152 * node begins with a multi-char fold. Therefore we fold 153 * the first few characters of it so that we can make that 154 * check */ 155 U8 *d = folded; 156 int i; 157 158 fc = -1; 159 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { 160 if (isASCII(*s)) { 161 *(d++) = (U8) toFOLD(*s); 162 if (fc < 0) { /* Save the first fold */ 163 fc = *(d-1); 164 } 165 s++; 166 } 167 else { 168 STRLEN len; 169 UV fold = toFOLD_utf8_safe(s, e, d, &len); 170 if (fc < 0) { /* Save the first fold */ 171 fc = fold; 172 } 173 d += len; 174 s += UTF8SKIP(s); 175 } 176 } 177 178 /* And set up so the code below that looks in this folded 179 * buffer instead of the node's string */ 180 e = d; 181 s = folded; 182 } 183 184 /* When we reach here 's' points to the fold of the first 185 * character(s) of the node; and 'e' points to far enough along 186 * the folded string to be just past any possible multi-char 187 * fold. 188 * 189 * Like the non-UTF case above, we punt if the node begins with a 190 * multi-char fold */ 191 192 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { 193 invlist = _add_range_to_invlist(invlist, 0, UV_MAX); 194 } 195 else { /* Single char fold */ 196 unsigned int k; 197 U32 first_fold; 198 const U32 * remaining_folds; 199 Size_t folds_count; 200 201 /* It matches itself */ 202 invlist = add_cp_to_invlist(invlist, fc); 203 204 /* ... plus all the things that fold to it, which are found in 205 * PL_utf8_foldclosures */ 206 folds_count = _inverse_folds(fc, &first_fold, 207 &remaining_folds); 208 for (k = 0; k < folds_count; k++) { 209 UV c = (k == 0) ? first_fold : remaining_folds[k-1]; 210 211 /* /aa doesn't allow folds between ASCII and non- */ 212 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE) 213 && isASCII(c) != isASCII(fc)) 214 { 215 continue; 216 } 217 218 invlist = add_cp_to_invlist(invlist, c); 219 } 220 221 if (OP(node) == EXACTFL) { 222 223 /* If either [iI] are present in an EXACTFL node the above code 224 * should have added its normal case pair, but under a Turkish 225 * locale they could match instead the case pairs from it. Add 226 * those as potential matches as well */ 227 if (isALPHA_FOLD_EQ(fc, 'I')) { 228 invlist = add_cp_to_invlist(invlist, 229 LATIN_SMALL_LETTER_DOTLESS_I); 230 invlist = add_cp_to_invlist(invlist, 231 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 232 } 233 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) { 234 invlist = add_cp_to_invlist(invlist, 'I'); 235 } 236 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 237 invlist = add_cp_to_invlist(invlist, 'i'); 238 } 239 } 240 } 241 } 242 243 return invlist; 244 } 245 246 247 /* Mark that we cannot extend a found fixed substring at this point. 248 Update the longest found anchored substring or the longest found 249 floating substrings if needed. */ 250 251 void 252 Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, 253 SSize_t *minlenp, int is_inf) 254 { 255 const STRLEN l = CHR_SVLEN(data->last_found); 256 SV * const longest_sv = data->substrs[data->cur_is_floating].str; 257 const STRLEN old_l = CHR_SVLEN(longest_sv); 258 DECLARE_AND_GET_RE_DEBUG_FLAGS; 259 260 PERL_ARGS_ASSERT_SCAN_COMMIT; 261 262 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 263 const U8 i = data->cur_is_floating; 264 SvSetMagicSV(longest_sv, data->last_found); 265 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min; 266 267 if (!i) /* fixed */ 268 data->substrs[0].max_offset = data->substrs[0].min_offset; 269 else { /* float */ 270 data->substrs[1].max_offset = 271 (is_inf) 272 ? OPTIMIZE_INFTY 273 : (l 274 ? data->last_start_max 275 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min 276 ? OPTIMIZE_INFTY 277 : data->pos_min + data->pos_delta)); 278 } 279 280 data->substrs[i].flags &= ~SF_BEFORE_EOL; 281 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL; 282 data->substrs[i].minlenp = minlenp; 283 data->substrs[i].lookbehind = 0; 284 } 285 286 SvCUR_set(data->last_found, 0); 287 { 288 SV * const sv = data->last_found; 289 if (SvUTF8(sv) && SvMAGICAL(sv)) { 290 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); 291 if (mg) 292 mg->mg_len = 0; 293 } 294 } 295 data->last_end = -1; 296 data->flags &= ~SF_BEFORE_EOL; 297 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1); 298 } 299 300 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion 301 * list that describes which code points it matches */ 302 303 STATIC void 304 S_ssc_anything(pTHX_ regnode_ssc *ssc) 305 { 306 /* Set the SSC 'ssc' to match an empty string or any code point */ 307 308 PERL_ARGS_ASSERT_SSC_ANYTHING; 309 310 assert(is_ANYOF_SYNTHETIC(ssc)); 311 312 /* mortalize so won't leak */ 313 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); 314 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ 315 } 316 317 STATIC int 318 S_ssc_is_anything(const regnode_ssc *ssc) 319 { 320 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code 321 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys 322 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted 323 * in any way, so there's no point in using it */ 324 325 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ 326 bool ret; 327 328 PERL_ARGS_ASSERT_SSC_IS_ANYTHING; 329 330 assert(is_ANYOF_SYNTHETIC(ssc)); 331 332 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { 333 return FALSE; 334 } 335 336 /* See if the list consists solely of the range 0 - Infinity */ 337 invlist_iterinit(ssc->invlist); 338 ret = invlist_iternext(ssc->invlist, &start, &end) 339 && start == 0 340 && end == UV_MAX; 341 342 invlist_iterfinish(ssc->invlist); 343 344 if (ret) { 345 return TRUE; 346 } 347 348 /* If e.g., both \w and \W are set, matches everything */ 349 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 350 int i; 351 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { 352 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { 353 return TRUE; 354 } 355 } 356 } 357 358 return FALSE; 359 } 360 361 void 362 Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) 363 { 364 /* Initializes the SSC 'ssc'. This includes setting it to match an empty 365 * string, any code point, or any posix class under locale */ 366 367 PERL_ARGS_ASSERT_SSC_INIT; 368 369 Zero(ssc, 1, regnode_ssc); 370 set_ANYOF_SYNTHETIC(ssc); 371 ARG1u_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); 372 ssc_anything(ssc); 373 374 /* If any portion of the regex is to operate under locale rules that aren't 375 * fully known at compile time, initialization includes it. The reason 376 * this isn't done for all regexes is that the optimizer was written under 377 * the assumption that locale was all-or-nothing. Given the complexity and 378 * lack of documentation in the optimizer, and that there are inadequate 379 * test cases for locale, many parts of it may not work properly, it is 380 * safest to avoid locale unless necessary. */ 381 if (RExC_contains_locale) { 382 ANYOF_POSIXL_SETALL(ssc); 383 } 384 else { 385 ANYOF_POSIXL_ZERO(ssc); 386 } 387 } 388 389 STATIC int 390 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, 391 const regnode_ssc *ssc) 392 { 393 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only 394 * to the list of code points matched, and locale posix classes; hence does 395 * not check its flags) */ 396 397 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ 398 bool ret; 399 400 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; 401 402 assert(is_ANYOF_SYNTHETIC(ssc)); 403 404 invlist_iterinit(ssc->invlist); 405 ret = invlist_iternext(ssc->invlist, &start, &end) 406 && start == 0 407 && end == UV_MAX; 408 409 invlist_iterfinish(ssc->invlist); 410 411 if (! ret) { 412 return FALSE; 413 } 414 415 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { 416 return FALSE; 417 } 418 419 return TRUE; 420 } 421 422 423 STATIC SV* 424 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, 425 const regnode_charclass* const node) 426 { 427 /* Returns a mortal inversion list defining which code points are matched 428 * by 'node', which is of ANYOF-ish type . Handles complementing the 429 * result if appropriate. If some code points aren't knowable at this 430 * time, the returned list must, and will, contain every code point that is 431 * a possibility. */ 432 433 SV* invlist = NULL; 434 SV* only_utf8_locale_invlist = NULL; 435 bool new_node_has_latin1 = FALSE; 436 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF) 437 ? ANYOF_FLAGS(node) 438 : 0; 439 440 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; 441 442 /* Look at the data structure created by S_set_ANYOF_arg() */ 443 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { 444 invlist = sv_2mortal(_new_invlist(1)); 445 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX); 446 } 447 else if (ANYOF_HAS_AUX(node)) { 448 const U32 n = ARG1u(node); 449 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); 450 AV * const av = MUTABLE_AV(SvRV(rv)); 451 SV **const ary = AvARRAY(av); 452 453 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { 454 455 /* Here there are things that won't be known until runtime -- we 456 * have to assume it could be anything */ 457 invlist = sv_2mortal(_new_invlist(1)); 458 return _add_range_to_invlist(invlist, 0, UV_MAX); 459 } 460 else if (ary[INVLIST_INDEX]) { 461 462 /* Use the node's inversion list */ 463 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL)); 464 } 465 466 /* Get the code points valid only under UTF-8 locales */ 467 if ( (flags & ANYOFL_FOLD) 468 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) 469 { 470 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX]; 471 } 472 } 473 474 if (! invlist) { 475 invlist = sv_2mortal(_new_invlist(0)); 476 } 477 478 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS 479 * code points, and an inversion list for the others, but if there are code 480 * points that should match only conditionally on the target string being 481 * UTF-8, those are placed in the inversion list, and not the bitmap. 482 * Since there are circumstances under which they could match, they are 483 * included in the SSC. But if the ANYOF node is to be inverted, we have 484 * to exclude them here, so that when we invert below, the end result 485 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We 486 * have to do this here before we add the unconditionally matched code 487 * points */ 488 if (flags & ANYOF_INVERT) { 489 _invlist_intersection_complement_2nd(invlist, 490 PL_UpperLatin1, 491 &invlist); 492 } 493 494 /* Add in the points from the bit map */ 495 if (REGNODE_TYPE(OP(node)) == ANYOF){ 496 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 497 if (ANYOF_BITMAP_TEST(node, i)) { 498 unsigned int start = i++; 499 500 for (; i < NUM_ANYOF_CODE_POINTS 501 && ANYOF_BITMAP_TEST(node, i); ++i) 502 { 503 /* empty */ 504 } 505 invlist = _add_range_to_invlist(invlist, start, i-1); 506 new_node_has_latin1 = TRUE; 507 } 508 } 509 } 510 511 /* If this can match all upper Latin1 code points, have to add them 512 * as well. But don't add them if inverting, as when that gets done below, 513 * it would exclude all these characters, including the ones it shouldn't 514 * that were added just above */ 515 if ( ! (flags & ANYOF_INVERT) 516 && OP(node) == ANYOFD 517 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)) 518 { 519 _invlist_union(invlist, PL_UpperLatin1, &invlist); 520 } 521 522 /* Similarly for these */ 523 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { 524 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); 525 } 526 527 if (flags & ANYOF_INVERT) { 528 _invlist_invert(invlist); 529 } 530 else if (flags & ANYOFL_FOLD) { 531 if (new_node_has_latin1) { 532 533 /* These folds are potential in Turkic locales */ 534 if (_invlist_contains_cp(invlist, 'i')) { 535 invlist = add_cp_to_invlist(invlist, 536 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 537 } 538 if (_invlist_contains_cp(invlist, 'I')) { 539 invlist = add_cp_to_invlist(invlist, 540 LATIN_SMALL_LETTER_DOTLESS_I); 541 } 542 543 /* Under /li, any 0-255 could fold to any other 0-255, depending on 544 * the locale. We can skip this if there are no 0-255 at all. */ 545 _invlist_union(invlist, PL_Latin1, &invlist); 546 } 547 else { 548 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) { 549 invlist = add_cp_to_invlist(invlist, 'I'); 550 } 551 if (_invlist_contains_cp(invlist, 552 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 553 { 554 invlist = add_cp_to_invlist(invlist, 'i'); 555 } 556 } 557 } 558 559 /* Similarly add the UTF-8 locale possible matches. These have to be 560 * deferred until after the non-UTF-8 locale ones are taken care of just 561 * above, or it leads to wrong results under ANYOF_INVERT */ 562 if (only_utf8_locale_invlist) { 563 _invlist_union_maybe_complement_2nd(invlist, 564 only_utf8_locale_invlist, 565 flags & ANYOF_INVERT, 566 &invlist); 567 } 568 569 return invlist; 570 } 571 572 /* 'AND' a given class with another one. Can create false positives. 'ssc' 573 * should not be inverted. */ 574 575 STATIC void 576 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 577 const regnode_charclass *and_with) 578 { 579 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either 580 * another SSC or a regular ANYOF class. Can create false positives. */ 581 582 SV* anded_cp_list; 583 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF) 584 ? ANYOF_FLAGS(and_with) 585 : 0; 586 U8 anded_flags; 587 588 PERL_ARGS_ASSERT_SSC_AND; 589 590 assert(is_ANYOF_SYNTHETIC(ssc)); 591 592 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract 593 * the code point inversion list and just the relevant flags */ 594 if (is_ANYOF_SYNTHETIC(and_with)) { 595 anded_cp_list = ((regnode_ssc *)and_with)->invlist; 596 anded_flags = and_with_flags; 597 598 /* XXX This is a kludge around what appears to be deficiencies in the 599 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, 600 * there are paths through the optimizer where it doesn't get weeded 601 * out when it should. And if we don't make some extra provision for 602 * it like the code just below, it doesn't get added when it should. 603 * This solution is to add it only when AND'ing, which is here, and 604 * only when what is being AND'ed is the pristine, original node 605 * matching anything. Thus it is like adding it to ssc_anything() but 606 * only when the result is to be AND'ed. Probably the same solution 607 * could be adopted for the same problem we have with /l matching, 608 * which is solved differently in S_ssc_init(), and that would lead to 609 * fewer false positives than that solution has. But if this solution 610 * creates bugs, the consequences are only that a warning isn't raised 611 * that should be; while the consequences for having /l bugs is 612 * incorrect matches */ 613 if (ssc_is_anything((regnode_ssc *)and_with)) { 614 anded_flags |= ANYOF_WARN_SUPER__shared; 615 } 616 } 617 else { 618 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); 619 if (OP(and_with) == ANYOFD) { 620 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS; 621 } 622 else { 623 anded_flags = and_with_flags 624 & ( ANYOF_COMMON_FLAGS 625 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared 626 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); 627 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) { 628 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 629 } 630 } 631 } 632 633 ANYOF_FLAGS(ssc) &= anded_flags; 634 635 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 636 * C2 is the list of code points in 'and-with'; P2, its posix classes. 637 * 'and_with' may be inverted. When not inverted, we have the situation of 638 * computing: 639 * (C1 | P1) & (C2 | P2) 640 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) 641 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 642 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) 643 * <= ((C1 & C2) | P1 | P2) 644 * Alternatively, the last few steps could be: 645 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 646 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) 647 * <= (C1 | C2 | (P1 & P2)) 648 * We favor the second approach if either P1 or P2 is non-empty. This is 649 * because these components are a barrier to doing optimizations, as what 650 * they match cannot be known until the moment of matching as they are 651 * dependent on the current locale, 'AND"ing them likely will reduce or 652 * eliminate them. 653 * But we can do better if we know that C1,P1 are in their initial state (a 654 * frequent occurrence), each matching everything: 655 * (<everything>) & (C2 | P2) = C2 | P2 656 * Similarly, if C2,P2 are in their initial state (again a frequent 657 * occurrence), the result is a no-op 658 * (C1 | P1) & (<everything>) = C1 | P1 659 * 660 * Inverted, we have 661 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) 662 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) 663 * <= (C1 & ~C2) | (P1 & ~P2) 664 * */ 665 666 if ((and_with_flags & ANYOF_INVERT) 667 && ! is_ANYOF_SYNTHETIC(and_with)) 668 { 669 unsigned int i; 670 671 ssc_intersection(ssc, 672 anded_cp_list, 673 FALSE /* Has already been inverted */ 674 ); 675 676 /* If either P1 or P2 is empty, the intersection will be also; can skip 677 * the loop */ 678 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) { 679 ANYOF_POSIXL_ZERO(ssc); 680 } 681 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 682 683 /* Note that the Posix class component P from 'and_with' actually 684 * looks like: 685 * P = Pa | Pb | ... | Pn 686 * where each component is one posix class, such as in [\w\s]. 687 * Thus 688 * ~P = ~(Pa | Pb | ... | Pn) 689 * = ~Pa & ~Pb & ... & ~Pn 690 * <= ~Pa | ~Pb | ... | ~Pn 691 * The last is something we can easily calculate, but unfortunately 692 * is likely to have many false positives. We could do better 693 * in some (but certainly not all) instances if two classes in 694 * P have known relationships. For example 695 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: 696 * So 697 * :lower: & :print: = :lower: 698 * And similarly for classes that must be disjoint. For example, 699 * since \s and \w can have no elements in common based on rules in 700 * the POSIX standard, 701 * \w & ^\S = nothing 702 * Unfortunately, some vendor locales do not meet the Posix 703 * standard, in particular almost everything by Microsoft. 704 * The loop below just changes e.g., \w into \W and vice versa */ 705 706 regnode_charclass_posixl temp; 707 int add = 1; /* To calculate the index of the complement */ 708 709 Zero(&temp, 1, regnode_charclass_posixl); 710 ANYOF_POSIXL_ZERO(&temp); 711 for (i = 0; i < ANYOF_MAX; i++) { 712 assert(i % 2 != 0 713 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) 714 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); 715 716 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { 717 ANYOF_POSIXL_SET(&temp, i + add); 718 } 719 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ 720 } 721 ANYOF_POSIXL_AND(&temp, ssc); 722 723 } /* else ssc already has no posixes */ 724 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC 725 in its initial state */ 726 else if (! is_ANYOF_SYNTHETIC(and_with) 727 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) 728 { 729 /* But if 'ssc' is in its initial state, the result is just 'and_with'; 730 * copy it over 'ssc' */ 731 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { 732 if (is_ANYOF_SYNTHETIC(and_with)) { 733 StructCopy(and_with, ssc, regnode_ssc); 734 } 735 else { 736 ssc->invlist = anded_cp_list; 737 ANYOF_POSIXL_ZERO(ssc); 738 if (and_with_flags & ANYOF_MATCHES_POSIXL) { 739 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); 740 } 741 } 742 } 743 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) 744 || (and_with_flags & ANYOF_MATCHES_POSIXL)) 745 { 746 /* One or the other of P1, P2 is non-empty. */ 747 if (and_with_flags & ANYOF_MATCHES_POSIXL) { 748 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); 749 } 750 ssc_union(ssc, anded_cp_list, FALSE); 751 } 752 else { /* P1 = P2 = empty */ 753 ssc_intersection(ssc, anded_cp_list, FALSE); 754 } 755 } 756 } 757 758 STATIC void 759 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 760 const regnode_charclass *or_with) 761 { 762 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either 763 * another SSC or a regular ANYOF class. Can create false positives if 764 * 'or_with' is to be inverted. */ 765 766 SV* ored_cp_list; 767 U8 ored_flags; 768 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF) 769 ? ANYOF_FLAGS(or_with) 770 : 0; 771 772 PERL_ARGS_ASSERT_SSC_OR; 773 774 assert(is_ANYOF_SYNTHETIC(ssc)); 775 776 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract 777 * the code point inversion list and just the relevant flags */ 778 if (is_ANYOF_SYNTHETIC(or_with)) { 779 ored_cp_list = ((regnode_ssc*) or_with)->invlist; 780 ored_flags = or_with_flags; 781 } 782 else { 783 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); 784 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS; 785 if (OP(or_with) != ANYOFD) { 786 ored_flags |= 787 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared 788 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); 789 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) { 790 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 791 } 792 } 793 } 794 795 ANYOF_FLAGS(ssc) |= ored_flags; 796 797 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 798 * C2 is the list of code points in 'or-with'; P2, its posix classes. 799 * 'or_with' may be inverted. When not inverted, we have the simple 800 * situation of computing: 801 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) 802 * If P1|P2 yields a situation with both a class and its complement are 803 * set, like having both \w and \W, this matches all code points, and we 804 * can delete these from the P component of the ssc going forward. XXX We 805 * might be able to delete all the P components, but I (khw) am not certain 806 * about this, and it is better to be safe. 807 * 808 * Inverted, we have 809 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) 810 * <= (C1 | P1) | ~C2 811 * <= (C1 | ~C2) | P1 812 * (which results in actually simpler code than the non-inverted case) 813 * */ 814 815 if ((or_with_flags & ANYOF_INVERT) 816 && ! is_ANYOF_SYNTHETIC(or_with)) 817 { 818 /* We ignore P2, leaving P1 going forward */ 819 } /* else Not inverted */ 820 else if (or_with_flags & ANYOF_MATCHES_POSIXL) { 821 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); 822 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 823 unsigned int i; 824 for (i = 0; i < ANYOF_MAX; i += 2) { 825 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) 826 { 827 ssc_match_all_cp(ssc); 828 ANYOF_POSIXL_CLEAR(ssc, i); 829 ANYOF_POSIXL_CLEAR(ssc, i+1); 830 } 831 } 832 } 833 } 834 835 ssc_union(ssc, 836 ored_cp_list, 837 FALSE /* Already has been inverted */ 838 ); 839 } 840 841 STATIC void 842 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) 843 { 844 PERL_ARGS_ASSERT_SSC_UNION; 845 846 assert(is_ANYOF_SYNTHETIC(ssc)); 847 848 _invlist_union_maybe_complement_2nd(ssc->invlist, 849 invlist, 850 invert2nd, 851 &ssc->invlist); 852 } 853 854 STATIC void 855 S_ssc_intersection(pTHX_ regnode_ssc *ssc, 856 SV* const invlist, 857 const bool invert2nd) 858 { 859 PERL_ARGS_ASSERT_SSC_INTERSECTION; 860 861 assert(is_ANYOF_SYNTHETIC(ssc)); 862 863 _invlist_intersection_maybe_complement_2nd(ssc->invlist, 864 invlist, 865 invert2nd, 866 &ssc->invlist); 867 } 868 869 STATIC void 870 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) 871 { 872 PERL_ARGS_ASSERT_SSC_ADD_RANGE; 873 874 assert(is_ANYOF_SYNTHETIC(ssc)); 875 876 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); 877 } 878 879 STATIC void 880 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) 881 { 882 /* AND just the single code point 'cp' into the SSC 'ssc' */ 883 884 SV* cp_list = _new_invlist(2); 885 886 PERL_ARGS_ASSERT_SSC_CP_AND; 887 888 assert(is_ANYOF_SYNTHETIC(ssc)); 889 890 cp_list = add_cp_to_invlist(cp_list, cp); 891 ssc_intersection(ssc, cp_list, 892 FALSE /* Not inverted */ 893 ); 894 SvREFCNT_dec_NN(cp_list); 895 } 896 897 STATIC void 898 S_ssc_clear_locale(regnode_ssc *ssc) 899 { 900 /* Set the SSC 'ssc' to not match any locale things */ 901 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; 902 903 assert(is_ANYOF_SYNTHETIC(ssc)); 904 905 ANYOF_POSIXL_ZERO(ssc); 906 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; 907 } 908 909 910 911 /* The below joins as many adjacent EXACTish nodes as possible into a single 912 * one. The regop may be changed if the node(s) contain certain sequences that 913 * require special handling. The joining is only done if: 914 * 1) there is room in the current conglomerated node to entirely contain the 915 * next one. 916 * 2) they are compatible node types 917 * 918 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and 919 * these get optimized out 920 * 921 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full 922 * as possible, even if that means splitting an existing node so that its first 923 * part is moved to the preceding node. This would maximise the efficiency of 924 * memEQ during matching. 925 * 926 * If a node is to match under /i (folded), the number of characters it matches 927 * can be different than its character length if it contains a multi-character 928 * fold. *min_subtract is set to the total delta number of characters of the 929 * input nodes. 930 * 931 * And *unfolded_multi_char is set to indicate whether or not the node contains 932 * an unfolded multi-char fold. This happens when it won't be known until 933 * runtime whether the fold is valid or not; namely 934 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the 935 * target string being matched against turns out to be UTF-8 is that fold 936 * valid; or 937 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at 938 * runtime. 939 * (Multi-char folds whose components are all above the Latin1 range are not 940 * run-time locale dependent, and have already been folded by the time this 941 * function is called.) 942 * 943 * This is as good a place as any to discuss the design of handling these 944 * multi-character fold sequences. It's been wrong in Perl for a very long 945 * time. There are three code points in Unicode whose multi-character folds 946 * were long ago discovered to mess things up. The previous designs for 947 * dealing with these involved assigning a special node for them. This 948 * approach doesn't always work, as evidenced by this example: 949 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches 950 * Both sides fold to "sss", but if the pattern is parsed to create a node that 951 * would match just the \xDF, it won't be able to handle the case where a 952 * successful match would have to cross the node's boundary. The new approach 953 * that hopefully generally solves the problem generates an EXACTFUP node 954 * that is "sss" in this case. 955 * 956 * It turns out that there are problems with all multi-character folds, and not 957 * just these three. Now the code is general, for all such cases. The 958 * approach taken is: 959 * 1) This routine examines each EXACTFish node that could contain multi- 960 * character folded sequences. Since a single character can fold into 961 * such a sequence, the minimum match length for this node is less than 962 * the number of characters in the node. This routine returns in 963 * *min_subtract how many characters to subtract from the actual 964 * length of the string to get a real minimum match length; it is 0 if 965 * there are no multi-char foldeds. This delta is used by the caller to 966 * adjust the min length of the match, and the delta between min and max, 967 * so that the optimizer doesn't reject these possibilities based on size 968 * constraints. 969 * 970 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF) 971 * under /u, we fold it to 'ss' in regatom(), and in this routine, after 972 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8 973 * EXACTFU nodes. The node type of such nodes is then changed to 974 * EXACTFUP, indicating it is problematic, and needs careful handling. 975 * (The procedures in step 1) above are sufficient to handle this case in 976 * UTF-8 encoded nodes.) The reason this is problematic is that this is 977 * the only case where there is a possible fold length change in non-UTF-8 978 * patterns. By reserving a special node type for problematic cases, the 979 * far more common regular EXACTFU nodes can be processed faster. 980 * regexec.c takes advantage of this. 981 * 982 * EXACTFUP has been created as a grab-bag for (hopefully uncommon) 983 * problematic cases. These all only occur when the pattern is not 984 * UTF-8. In addition to the 'ss' sequence where there is a possible fold 985 * length change, it handles the situation where the string cannot be 986 * entirely folded. The strings in an EXACTFish node are folded as much 987 * as possible during compilation in regcomp.c. This saves effort in 988 * regex matching. By using an EXACTFUP node when it is not possible to 989 * fully fold at compile time, regexec.c can know that everything in an 990 * EXACTFU node is folded, so folding can be skipped at runtime. The only 991 * case where folding in EXACTFU nodes can't be done at compile time is 992 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This 993 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes 994 * handle two very different cases. Alternatively, there could have been 995 * a node type where there are length changes, one for unfolded, and one 996 * for both. If yet another special case needed to be created, the number 997 * of required node types would have to go to 7. khw figures that even 998 * though there are plenty of node types to spare, that the maintenance 999 * cost wasn't worth the small speedup of doing it that way, especially 1000 * since he thinks the MICRO SIGN is rarely encountered in practice. 1001 * 1002 * There are other cases where folding isn't done at compile time, but 1003 * none of them are under /u, and hence not for EXACTFU nodes. The folds 1004 * in EXACTFL nodes aren't known until runtime, and vary as the locale 1005 * changes. Some folds in EXACTF depend on if the runtime target string 1006 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di 1007 * when no fold in it depends on the UTF-8ness of the target string.) 1008 * 1009 * 3) A problem remains for unfolded multi-char folds. (These occur when the 1010 * validity of the fold won't be known until runtime, and so must remain 1011 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA 1012 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot 1013 * be an EXACTF node with a UTF-8 pattern.) They also occur for various 1014 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) 1015 * The reason this is a problem is that the optimizer part of regexec.c 1016 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption 1017 * that a character in the pattern corresponds to at most a single 1018 * character in the target string. (And I do mean character, and not byte 1019 * here, unlike other parts of the documentation that have never been 1020 * updated to account for multibyte Unicode.) Sharp s in EXACTF and 1021 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA 1022 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in 1023 * EXACTFL nodes, violate the assumption, and they are the only instances 1024 * where it is violated. I'm reluctant to try to change the assumption, 1025 * as the code involved is impenetrable to me (khw), so instead the code 1026 * here punts. This routine examines EXACTFL nodes, and (when the pattern 1027 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a 1028 * boolean indicating whether or not the node contains such a fold. When 1029 * it is true, the caller sets a flag that later causes the optimizer in 1030 * this file to not set values for the floating and fixed string lengths, 1031 * and thus avoids the optimizer code in regexec.c that makes the invalid 1032 * assumption. Thus, there is no optimization based on string lengths for 1033 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern 1034 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the 1035 * assumption is wrong only in these cases is that all other non-UTF-8 1036 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to 1037 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in 1038 * EXACTF nodes because we don't know at compile time if it actually 1039 * matches 'ss' or not. For EXACTF nodes it will match iff the target 1040 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it 1041 * always matches; and EXACTFAA where it never does. In an EXACTFAA node 1042 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the 1043 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 1044 * string would require the pattern to be forced into UTF-8, the overhead 1045 * of which we want to avoid. Similarly the unfolded multi-char folds in 1046 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 1047 * locale.) 1048 * 1049 * Similarly, the code that generates tries doesn't currently handle 1050 * not-already-folded multi-char folds, and it looks like a pain to change 1051 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s 1052 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode, 1053 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people 1054 * using /iaa matching will be doing so almost entirely with ASCII 1055 * strings, so this should rarely be encountered in practice */ 1056 1057 U32 1058 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, 1059 UV *min_subtract, bool *unfolded_multi_char, 1060 U32 flags, regnode *val, U32 depth) 1061 { 1062 /* Merge several consecutive EXACTish nodes into one. */ 1063 1064 regnode *n = regnext(scan); 1065 U32 stringok = 1; 1066 regnode *next = REGNODE_AFTER_varies(scan); 1067 U32 stopnow = 0; 1068 #ifdef DEBUGGING 1069 U32 merged = 0; 1070 regnode *stop = scan; 1071 DECLARE_AND_GET_RE_DEBUG_FLAGS; 1072 #else 1073 PERL_UNUSED_ARG(depth); 1074 #endif 1075 1076 PERL_ARGS_ASSERT_JOIN_EXACT; 1077 #ifndef EXPERIMENTAL_INPLACESCAN 1078 PERL_UNUSED_ARG(flags); 1079 PERL_UNUSED_ARG(val); 1080 #endif 1081 DEBUG_PEEP("join", scan, depth, 0); 1082 1083 assert(REGNODE_TYPE(OP(scan)) == EXACT); 1084 1085 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge 1086 * EXACT ones that are mergeable to the current one. */ 1087 while ( n 1088 && ( REGNODE_TYPE(OP(n)) == NOTHING 1089 || (stringok && REGNODE_TYPE(OP(n)) == EXACT)) 1090 && NEXT_OFF(n) 1091 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) 1092 { 1093 1094 if (OP(n) == TAIL || n > next) 1095 stringok = 0; 1096 if (REGNODE_TYPE(OP(n)) == NOTHING) { 1097 DEBUG_PEEP("skip:", n, depth, 0); 1098 NEXT_OFF(scan) += NEXT_OFF(n); 1099 next = n + NODE_STEP_REGNODE; 1100 #ifdef DEBUGGING 1101 if (stringok) 1102 stop = n; 1103 #endif 1104 n = regnext(n); 1105 } 1106 else if (stringok) { 1107 const unsigned int oldl = STR_LEN(scan); 1108 regnode * const nnext = regnext(n); 1109 1110 /* XXX I (khw) kind of doubt that this works on platforms (should 1111 * Perl ever run on one) where U8_MAX is above 255 because of lots 1112 * of other assumptions */ 1113 /* Don't join if the sum can't fit into a single node */ 1114 if (oldl + STR_LEN(n) > U8_MAX) 1115 break; 1116 1117 /* Joining something that requires UTF-8 with something that 1118 * doesn't, means the result requires UTF-8. */ 1119 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) { 1120 OP(scan) = EXACT_REQ8; 1121 } 1122 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) { 1123 ; /* join is compatible, no need to change OP */ 1124 } 1125 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) { 1126 OP(scan) = EXACTFU_REQ8; 1127 } 1128 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) { 1129 ; /* join is compatible, no need to change OP */ 1130 } 1131 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { 1132 ; /* join is compatible, no need to change OP */ 1133 } 1134 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) { 1135 1136 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated, 1137 * which can join with EXACTFU ones. We check for this case 1138 * here. These need to be resolved to either EXACTFU or 1139 * EXACTF at joining time. They have nothing in them that 1140 * would forbid them from being the more desirable EXACTFU 1141 * nodes except that they begin and/or end with a single [Ss]. 1142 * The reason this is problematic is because they could be 1143 * joined in this loop with an adjacent node that ends and/or 1144 * begins with [Ss] which would then form the sequence 'ss', 1145 * which matches differently under /di than /ui, in which case 1146 * EXACTFU can't be used. If the 'ss' sequence doesn't get 1147 * formed, the nodes get absorbed into any adjacent EXACTFU 1148 * node. And if the only adjacent node is EXACTF, they get 1149 * absorbed into that, under the theory that a longer node is 1150 * better than two shorter ones, even if one is EXACTFU. Note 1151 * that EXACTFU_REQ8 is generated only for UTF-8 patterns, 1152 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ 1153 1154 if (STRING(n)[STR_LEN(n)-1] == 's') { 1155 1156 /* Here the joined node would end with 's'. If the node 1157 * following the combination is an EXACTF one, it's better to 1158 * join this trailing edge 's' node with that one, leaving the 1159 * current one in 'scan' be the more desirable EXACTFU */ 1160 if (OP(nnext) == EXACTF) { 1161 break; 1162 } 1163 1164 OP(scan) = EXACTFU_S_EDGE; 1165 1166 } /* Otherwise, the beginning 's' of the 2nd node just 1167 becomes an interior 's' in 'scan' */ 1168 } 1169 else if (OP(scan) == EXACTF && OP(n) == EXACTF) { 1170 ; /* join is compatible, no need to change OP */ 1171 } 1172 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) { 1173 1174 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE 1175 * nodes. But the latter nodes can be also joined with EXACTFU 1176 * ones, and that is a better outcome, so if the node following 1177 * 'n' is EXACTFU, quit now so that those two can be joined 1178 * later */ 1179 if (OP(nnext) == EXACTFU) { 1180 break; 1181 } 1182 1183 /* The join is compatible, and the combined node will be 1184 * EXACTF. (These don't care if they begin or end with 's' */ 1185 } 1186 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) { 1187 if ( STRING(scan)[STR_LEN(scan)-1] == 's' 1188 && STRING(n)[0] == 's') 1189 { 1190 /* When combined, we have the sequence 'ss', which means we 1191 * have to remain /di */ 1192 OP(scan) = EXACTF; 1193 } 1194 } 1195 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) { 1196 if (STRING(n)[0] == 's') { 1197 ; /* Here the join is compatible and the combined node 1198 starts with 's', no need to change OP */ 1199 } 1200 else { /* Now the trailing 's' is in the interior */ 1201 OP(scan) = EXACTFU; 1202 } 1203 } 1204 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) { 1205 1206 /* The join is compatible, and the combined node will be 1207 * EXACTF. (These don't care if they begin or end with 's' */ 1208 OP(scan) = EXACTF; 1209 } 1210 else if (OP(scan) != OP(n)) { 1211 1212 /* The only other compatible joinings are the same node type */ 1213 break; 1214 } 1215 1216 DEBUG_PEEP("merg", n, depth, 0); 1217 #ifdef DEBUGGING 1218 merged++; 1219 #endif 1220 1221 next = REGNODE_AFTER_varies(n); 1222 NEXT_OFF(scan) += NEXT_OFF(n); 1223 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 ); 1224 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n))); 1225 /* Now we can overwrite *n : */ 1226 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 1227 #ifdef DEBUGGING 1228 stop = next - 1; 1229 #endif 1230 n = nnext; 1231 if (stopnow) break; 1232 } 1233 1234 #ifdef EXPERIMENTAL_INPLACESCAN 1235 if (flags && !NEXT_OFF(n)) { 1236 DEBUG_PEEP("atch", val, depth, 0); 1237 if (REGNODE_OFF_BY_ARG(OP(n))) { 1238 ARG1u_SET(n, val - n); 1239 } 1240 else { 1241 NEXT_OFF(n) = val - n; 1242 } 1243 stopnow = 1; 1244 } 1245 #endif 1246 } 1247 1248 /* This temporary node can now be turned into EXACTFU, and must, as 1249 * regexec.c doesn't handle it */ 1250 if (OP(scan) == EXACTFU_S_EDGE) { 1251 OP(scan) = EXACTFU; 1252 } 1253 1254 *min_subtract = 0; 1255 *unfolded_multi_char = FALSE; 1256 1257 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We 1258 * can now analyze for sequences of problematic code points. (Prior to 1259 * this final joining, sequences could have been split over boundaries, and 1260 * hence missed). The sequences only happen in folding, hence for any 1261 * non-EXACT EXACTish node */ 1262 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) { 1263 U8* s0 = (U8*) STRING(scan); 1264 U8* s = s0; 1265 U8* s_end = s0 + STR_LEN(scan); 1266 1267 int total_count_delta = 0; /* Total delta number of characters that 1268 multi-char folds expand to */ 1269 1270 /* One pass is made over the node's string looking for all the 1271 * possibilities. To avoid some tests in the loop, there are two main 1272 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and 1273 * non-UTF-8 */ 1274 if (UTF) { 1275 U8* folded = NULL; 1276 1277 if (OP(scan) == EXACTFL) { 1278 U8 *d; 1279 1280 /* An EXACTFL node would already have been changed to another 1281 * node type unless there is at least one character in it that 1282 * is problematic; likely a character whose fold definition 1283 * won't be known until runtime, and so has yet to be folded. 1284 * For all but the UTF-8 locale, folds are 1-1 in length, but 1285 * to handle the UTF-8 case, we need to create a temporary 1286 * folded copy using UTF-8 locale rules in order to analyze it. 1287 * This is because our macros that look to see if a sequence is 1288 * a multi-char fold assume everything is folded (otherwise the 1289 * tests in those macros would be too complicated and slow). 1290 * Note that here, the non-problematic folds will have already 1291 * been done, so we can just copy such characters. We actually 1292 * don't completely fold the EXACTFL string. We skip the 1293 * unfolded multi-char folds, as that would just create work 1294 * below to figure out the size they already are */ 1295 1296 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); 1297 d = folded; 1298 while (s < s_end) { 1299 STRLEN s_len = UTF8SKIP(s); 1300 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { 1301 Copy(s, d, s_len, U8); 1302 d += s_len; 1303 } 1304 else if (is_FOLDS_TO_MULTI_utf8(s)) { 1305 *unfolded_multi_char = TRUE; 1306 Copy(s, d, s_len, U8); 1307 d += s_len; 1308 } 1309 else if (isASCII(*s)) { 1310 *(d++) = toFOLD(*s); 1311 } 1312 else { 1313 STRLEN len; 1314 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL); 1315 d += len; 1316 } 1317 s += s_len; 1318 } 1319 1320 /* Point the remainder of the routine to look at our temporary 1321 * folded copy */ 1322 s = folded; 1323 s_end = d; 1324 } /* End of creating folded copy of EXACTFL string */ 1325 1326 /* Examine the string for a multi-character fold sequence. UTF-8 1327 * patterns have all characters pre-folded by the time this code is 1328 * executed */ 1329 while (s < s_end - 1) /* Can stop 1 before the end, as minimum 1330 length sequence we are looking for is 2 */ 1331 { 1332 int count = 0; /* How many characters in a multi-char fold */ 1333 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); 1334 if (! len) { /* Not a multi-char fold: get next char */ 1335 s += UTF8SKIP(s); 1336 continue; 1337 } 1338 1339 { /* Here is a generic multi-char fold. */ 1340 U8* multi_end = s + len; 1341 1342 /* Count how many characters are in it. In the case of 1343 * /aa, no folds which contain ASCII code points are 1344 * allowed, so check for those, and skip if found. */ 1345 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) { 1346 count = utf8_length(s, multi_end); 1347 s = multi_end; 1348 } 1349 else { 1350 while (s < multi_end) { 1351 if (isASCII(*s)) { 1352 s++; 1353 goto next_iteration; 1354 } 1355 else { 1356 s += UTF8SKIP(s); 1357 } 1358 count++; 1359 } 1360 } 1361 } 1362 1363 /* The delta is how long the sequence is minus 1 (1 is how long 1364 * the character that folds to the sequence is) */ 1365 total_count_delta += count - 1; 1366 next_iteration: ; 1367 } 1368 1369 /* We created a temporary folded copy of the string in EXACTFL 1370 * nodes. Therefore we need to be sure it doesn't go below zero, 1371 * as the real string could be shorter */ 1372 if (OP(scan) == EXACTFL) { 1373 int total_chars = utf8_length((U8*) STRING(scan), 1374 (U8*) STRING(scan) + STR_LEN(scan)); 1375 if (total_count_delta > total_chars) { 1376 total_count_delta = total_chars; 1377 } 1378 } 1379 1380 *min_subtract += total_count_delta; 1381 Safefree(folded); 1382 } 1383 else if (OP(scan) == EXACTFAA) { 1384 1385 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char 1386 * fold to the ASCII range (and there are no existing ones in the 1387 * upper latin1 range). But, as outlined in the comments preceding 1388 * this function, we need to flag any occurrences of the sharp s. 1389 * This character forbids trie formation (because of added 1390 * complexity) */ 1391 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 1392 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 1393 || UNICODE_DOT_DOT_VERSION > 0) 1394 while (s < s_end) { 1395 if (*s == LATIN_SMALL_LETTER_SHARP_S) { 1396 OP(scan) = EXACTFAA_NO_TRIE; 1397 *unfolded_multi_char = TRUE; 1398 break; 1399 } 1400 s++; 1401 } 1402 } 1403 else if (OP(scan) != EXACTFAA_NO_TRIE) { 1404 1405 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char 1406 * folds that are all Latin1. As explained in the comments 1407 * preceding this function, we look also for the sharp s in EXACTF 1408 * and EXACTFL nodes; it can be in the final position. Otherwise 1409 * we can stop looking 1 byte earlier because have to find at least 1410 * two characters for a multi-fold */ 1411 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) 1412 ? s_end 1413 : s_end -1; 1414 1415 while (s < upper) { 1416 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); 1417 if (! len) { /* Not a multi-char fold. */ 1418 if (*s == LATIN_SMALL_LETTER_SHARP_S 1419 && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) 1420 { 1421 *unfolded_multi_char = TRUE; 1422 } 1423 s++; 1424 continue; 1425 } 1426 1427 if (len == 2 1428 && isALPHA_FOLD_EQ(*s, 's') 1429 && isALPHA_FOLD_EQ(*(s+1), 's')) 1430 { 1431 1432 /* EXACTF nodes need to know that the minimum length 1433 * changed so that a sharp s in the string can match this 1434 * ss in the pattern, but they remain EXACTF nodes, as they 1435 * won't match this unless the target string is in UTF-8, 1436 * which we don't know until runtime. EXACTFL nodes can't 1437 * transform into EXACTFU nodes */ 1438 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { 1439 OP(scan) = EXACTFUP; 1440 } 1441 } 1442 1443 *min_subtract += len - 1; 1444 s += len; 1445 } 1446 #endif 1447 } 1448 } 1449 1450 #ifdef DEBUGGING 1451 /* Allow dumping but overwriting the collection of skipped 1452 * ops and/or strings with fake optimized ops */ 1453 n = REGNODE_AFTER_varies(scan); 1454 while (n <= stop) { 1455 OP(n) = OPTIMIZED; 1456 FLAGS(n) = 0; 1457 NEXT_OFF(n) = 0; 1458 n++; 1459 } 1460 #endif 1461 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);}); 1462 return stopnow; 1463 } 1464 1465 /* REx optimizer. Converts nodes into quicker variants "in place". 1466 Finds fixed substrings. */ 1467 1468 1469 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 1470 to the position after last scanned or to NULL. */ 1471 1472 /* the return from this sub is the minimum length that could possibly match */ 1473 SSize_t 1474 Perl_study_chunk(pTHX_ 1475 RExC_state_t *pRExC_state, 1476 regnode **scanp, /* Start here (read-write). */ 1477 SSize_t *minlenp, /* used for the minlen of substrings? */ 1478 SSize_t *deltap, /* Write maxlen-minlen here. */ 1479 regnode *last, /* Stop before this one. */ 1480 scan_data_t *data, /* string data about the pattern */ 1481 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */ 1482 U32 recursed_depth, /* how deep have we recursed via GOSUB */ 1483 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */ 1484 U32 flags, /* flags controlling this call, see SCF_ flags */ 1485 U32 depth, /* how deep have we recursed period */ 1486 bool was_mutate_ok /* TRUE if in-place optimizations are allowed. 1487 FALSE only if the caller (recursively) was 1488 prohibited from modifying the regops, because 1489 a higher caller is holding a ptr to them. */ 1490 ) 1491 { 1492 /* vars about the regnodes we are working with */ 1493 regnode *scan = *scanp; /* the current opcode we are inspecting */ 1494 regnode *next = NULL; /* the next opcode beyond scan, tmp var */ 1495 regnode *first_non_open = scan; /* FIXME: should this init to NULL? 1496 the first non open regop, if the init 1497 val IS an OPEN then we will skip past 1498 it just after the var decls section */ 1499 I32 code = 0; /* temp var used to hold the optype of a regop */ 1500 1501 /* vars about the min and max length of the pattern */ 1502 SSize_t min = 0; /* min length of this part of the pattern */ 1503 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT 1504 this is adjusted down if we find 1505 an ACCEPT */ 1506 SSize_t delta = 0; /* difference between min and max length 1507 (not accounting for stopmin) */ 1508 1509 /* vars about capture buffers in the pattern */ 1510 I32 pars = 0; /* count of OPEN opcodes */ 1511 I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */ 1512 1513 /* vars about whether this pattern contains something that can match 1514 * infinitely long strings, eg, X* or X+ */ 1515 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 1516 int is_inf_internal = 0; /* The studied chunk is infinite */ 1517 1518 /* scan_data_t (struct) is used to hold information about the substrings 1519 * and start class we have extracted from the string */ 1520 scan_data_t data_fake; /* temp var used for recursing in some cases */ 1521 1522 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do 1523 trie optimizations */ 1524 1525 scan_frame *frame = NULL; /* used as part of fake recursion */ 1526 1527 DECLARE_AND_GET_RE_DEBUG_FLAGS; 1528 1529 PERL_ARGS_ASSERT_STUDY_CHUNK; 1530 RExC_study_started= 1; 1531 1532 Zero(&data_fake, 1, scan_data_t); 1533 1534 if ( depth == 0 ) { 1535 while (first_non_open && OP(first_non_open) == OPEN) 1536 first_non_open=regnext(first_non_open); 1537 } 1538 1539 fake_study_recurse: 1540 DEBUG_r( 1541 RExC_study_chunk_recursed_count++; 1542 ); 1543 DEBUG_OPTIMISE_MORE_r( 1544 { 1545 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", 1546 depth, (long)stopparen, 1547 (unsigned long)RExC_study_chunk_recursed_count, 1548 (unsigned long)depth, (unsigned long)recursed_depth, 1549 scan, 1550 last); 1551 if (recursed_depth) { 1552 U32 i; 1553 U32 j; 1554 for ( j = 0 ; j < recursed_depth ; j++ ) { 1555 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) { 1556 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) { 1557 Perl_re_printf( aTHX_ " %d",(int)i); 1558 break; 1559 } 1560 } 1561 if ( j + 1 < recursed_depth ) { 1562 Perl_re_printf( aTHX_ ","); 1563 } 1564 } 1565 } 1566 Perl_re_printf( aTHX_ "\n"); 1567 } 1568 ); 1569 while ( scan && OP(scan) != END && scan < last ){ 1570 UV min_subtract = 0; /* How mmany chars to subtract from the minimum 1571 node length to get a real minimum (because 1572 the folded version may be shorter) */ 1573 bool unfolded_multi_char = FALSE; 1574 /* avoid mutating ops if we are anywhere within the recursed or 1575 * enframed handling for a GOSUB: the outermost level will handle it. 1576 */ 1577 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub); 1578 /* Peephole optimizer: */ 1579 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta); 1580 DEBUG_PEEP("Peep", scan, depth, flags); 1581 1582 1583 /* The reason we do this here is that we need to deal with things like 1584 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT 1585 * parsing code, as each (?:..) is handled by a different invocation of 1586 * reg() -- Yves 1587 */ 1588 if (REGNODE_TYPE(OP(scan)) == EXACT 1589 && OP(scan) != LEXACT 1590 && OP(scan) != LEXACT_REQ8 1591 && mutate_ok 1592 ) { 1593 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char, 1594 0, NULL, depth + 1); 1595 } 1596 1597 /* Follow the next-chain of the current node and optimize 1598 away all the NOTHINGs from it. 1599 */ 1600 rck_elide_nothing(scan); 1601 1602 /* The principal pseudo-switch. Cannot be a switch, since we look into 1603 * several different things. */ 1604 if ( OP(scan) == DEFINEP ) { 1605 SSize_t minlen = 0; 1606 SSize_t deltanext = 0; 1607 SSize_t fake_last_close = 0; 1608 regnode *fake_last_close_op = NULL; 1609 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY); 1610 1611 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 1612 scan = regnext(scan); 1613 assert( OP(scan) == IFTHEN ); 1614 DEBUG_PEEP("expect IFTHEN", scan, depth, flags); 1615 1616 data_fake.last_closep= &fake_last_close; 1617 data_fake.last_close_opp= &fake_last_close_op; 1618 minlen = *minlenp; 1619 next = regnext(scan); 1620 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN); 1621 DEBUG_PEEP("scan", scan, depth, flags); 1622 DEBUG_PEEP("next", next, depth, flags); 1623 1624 /* we suppose the run is continuous, last=next... 1625 * NOTE we dont use the return here! */ 1626 /* DEFINEP study_chunk() recursion */ 1627 (void)study_chunk(pRExC_state, &scan, &minlen, 1628 &deltanext, next, &data_fake, stopparen, 1629 recursed_depth, NULL, f, depth+1, mutate_ok); 1630 1631 scan = next; 1632 } else 1633 if ( 1634 OP(scan) == BRANCH || 1635 OP(scan) == BRANCHJ || 1636 OP(scan) == IFTHEN 1637 ) { 1638 next = regnext(scan); 1639 code = OP(scan); 1640 1641 /* The op(next)==code check below is to see if we 1642 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" 1643 * IFTHEN is special as it might not appear in pairs. 1644 * Not sure whether BRANCH-BRANCHJ is possible, regardless 1645 * we dont handle it cleanly. */ 1646 if (OP(next) == code || code == IFTHEN) { 1647 /* NOTE - There is similar code to this block below for 1648 * handling TRIE nodes on a re-study. If you change stuff here 1649 * check there too. */ 1650 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0; 1651 regnode_ssc accum; 1652 regnode * const startbranch=scan; 1653 1654 if (flags & SCF_DO_SUBSTR) { 1655 /* Cannot merge strings after this. */ 1656 scan_commit(pRExC_state, data, minlenp, is_inf); 1657 } 1658 1659 if (flags & SCF_DO_STCLASS) 1660 ssc_init_zero(pRExC_state, &accum); 1661 1662 while (OP(scan) == code) { 1663 SSize_t deltanext, minnext, fake_last_close = 0; 1664 regnode *fake_last_close_op = NULL; 1665 U32 f = (flags & SCF_TRIE_DOING_RESTUDY); 1666 regnode_ssc this_class; 1667 1668 DEBUG_PEEP("Branch", scan, depth, flags); 1669 1670 num++; 1671 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 1672 if (data) { 1673 data_fake.whilem_c = data->whilem_c; 1674 data_fake.last_closep = data->last_closep; 1675 data_fake.last_close_opp = data->last_close_opp; 1676 } 1677 else { 1678 data_fake.last_closep = &fake_last_close; 1679 data_fake.last_close_opp = &fake_last_close_op; 1680 } 1681 1682 data_fake.pos_delta = delta; 1683 next = regnext(scan); 1684 1685 scan = REGNODE_AFTER_opcode(scan, code); 1686 1687 if (flags & SCF_DO_STCLASS) { 1688 ssc_init(pRExC_state, &this_class); 1689 data_fake.start_class = &this_class; 1690 f |= SCF_DO_STCLASS_AND; 1691 } 1692 if (flags & SCF_WHILEM_VISITED_POS) 1693 f |= SCF_WHILEM_VISITED_POS; 1694 1695 /* we suppose the run is continuous, last=next...*/ 1696 /* recurse study_chunk() for each BRANCH in an alternation */ 1697 minnext = study_chunk(pRExC_state, &scan, minlenp, 1698 &deltanext, next, &data_fake, stopparen, 1699 recursed_depth, NULL, f, depth+1, 1700 mutate_ok); 1701 1702 if (min1 > minnext) 1703 min1 = minnext; 1704 if (deltanext == OPTIMIZE_INFTY) { 1705 is_inf = is_inf_internal = 1; 1706 max1 = OPTIMIZE_INFTY; 1707 } else if (max1 < minnext + deltanext) 1708 max1 = minnext + deltanext; 1709 scan = next; 1710 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 1711 pars++; 1712 if (data_fake.flags & SCF_SEEN_ACCEPT) { 1713 if ( stopmin > minnext) 1714 stopmin = min + min1; 1715 flags &= ~SCF_DO_SUBSTR; 1716 if (data) 1717 data->flags |= SCF_SEEN_ACCEPT; 1718 } 1719 if (data) { 1720 if (data_fake.flags & SF_HAS_EVAL) 1721 data->flags |= SF_HAS_EVAL; 1722 data->whilem_c = data_fake.whilem_c; 1723 } 1724 if (flags & SCF_DO_STCLASS) 1725 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); 1726 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta); 1727 } 1728 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 1729 min1 = 0; 1730 if (flags & SCF_DO_SUBSTR) { 1731 data->pos_min += min1; 1732 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1)) 1733 data->pos_delta = OPTIMIZE_INFTY; 1734 else 1735 data->pos_delta += max1 - min1; 1736 if (max1 != min1 || is_inf) 1737 data->cur_is_floating = 1; 1738 } 1739 min += min1; 1740 if (delta == OPTIMIZE_INFTY 1741 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0) 1742 delta = OPTIMIZE_INFTY; 1743 else 1744 delta += max1 - min1; 1745 if (flags & SCF_DO_STCLASS_OR) { 1746 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); 1747 if (min1) { 1748 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 1749 flags &= ~SCF_DO_STCLASS; 1750 } 1751 } 1752 else if (flags & SCF_DO_STCLASS_AND) { 1753 if (min1) { 1754 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 1755 flags &= ~SCF_DO_STCLASS; 1756 } 1757 else { 1758 /* Switch to OR mode: cache the old value of 1759 * data->start_class */ 1760 INIT_AND_WITHP; 1761 StructCopy(data->start_class, and_withp, regnode_ssc); 1762 flags &= ~SCF_DO_STCLASS_AND; 1763 StructCopy(&accum, data->start_class, regnode_ssc); 1764 flags |= SCF_DO_STCLASS_OR; 1765 } 1766 } 1767 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta); 1768 1769 if (PERL_ENABLE_TRIE_OPTIMISATION 1770 && OP(startbranch) == BRANCH 1771 && mutate_ok 1772 ) { 1773 /* demq. 1774 1775 Assuming this was/is a branch we are dealing with: 'scan' 1776 now points at the item that follows the branch sequence, 1777 whatever it is. We now start at the beginning of the 1778 sequence and look for subsequences of 1779 1780 BRANCH->EXACT=>x1 1781 BRANCH->EXACT=>x2 1782 tail 1783 1784 which would be constructed from a pattern like 1785 /A|LIST|OF|WORDS/ 1786 1787 If we can find such a subsequence we need to turn the first 1788 element into a trie and then add the subsequent branch exact 1789 strings to the trie. 1790 1791 We have two cases 1792 1793 1. patterns where the whole set of branches can be 1794 converted. 1795 1796 2. patterns where only a subset can be converted. 1797 1798 In case 1 we can replace the whole set with a single regop 1799 for the trie. In case 2 we need to keep the start and end 1800 branches so 1801 1802 'BRANCH EXACT; BRANCH EXACT; BRANCH X' 1803 becomes BRANCH TRIE; BRANCH X; 1804 1805 There is an additional case, that being where there is a 1806 common prefix, which gets split out into an EXACT like node 1807 preceding the TRIE node. 1808 1809 If X(1..n)==tail then we can do a simple trie, if not we make 1810 a "jump" trie, such that when we match the appropriate word 1811 we "jump" to the appropriate tail node. Essentially we turn 1812 a nested if into a case structure of sorts. 1813 1814 */ 1815 1816 int made=0; 1817 if (!re_trie_maxbuff) { 1818 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 1819 if (!SvIOK(re_trie_maxbuff)) 1820 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 1821 } 1822 if ( SvIV(re_trie_maxbuff)>=0 ) { 1823 regnode *cur; 1824 regnode *first = (regnode *)NULL; 1825 regnode *prev = (regnode *)NULL; 1826 regnode *tail = scan; 1827 U8 trietype = 0; 1828 U32 count=0; 1829 1830 /* var tail is used because there may be a TAIL 1831 regop in the way. Ie, the exacts will point to the 1832 thing following the TAIL, but the last branch will 1833 point at the TAIL. So we advance tail. If we 1834 have nested (?:) we may have to move through several 1835 tails. 1836 */ 1837 1838 while ( OP( tail ) == TAIL ) { 1839 /* this is the TAIL generated by (?:) */ 1840 tail = regnext( tail ); 1841 } 1842 1843 1844 DEBUG_TRIE_COMPILE_r({ 1845 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); 1846 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n", 1847 depth+1, 1848 "Looking for TRIE'able sequences. Tail node is ", 1849 (UV) REGNODE_OFFSET(tail), 1850 SvPV_nolen_const( RExC_mysv ) 1851 ); 1852 }); 1853 1854 /* 1855 1856 Step through the branches 1857 cur represents each branch, 1858 noper is the first thing to be matched as part 1859 of that branch 1860 noper_next is the regnext() of that node. 1861 1862 We normally handle a case like this 1863 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also 1864 support building with NOJUMPTRIE, which restricts 1865 the trie logic to structures like /FOO|BAR/. 1866 1867 If noper is a trieable nodetype then the branch is 1868 a possible optimization target. If we are building 1869 under NOJUMPTRIE then we require that noper_next is 1870 the same as scan (our current position in the regex 1871 program). 1872 1873 Once we have two or more consecutive such branches 1874 we can create a trie of the EXACT's contents and 1875 stitch it in place into the program. 1876 1877 If the sequence represents all of the branches in 1878 the alternation we replace the entire thing with a 1879 single TRIE node. 1880 1881 Otherwise when it is a subsequence we need to 1882 stitch it in place and replace only the relevant 1883 branches. This means the first branch has to remain 1884 as it is used by the alternation logic, and its 1885 next pointer, and needs to be repointed at the item 1886 on the branch chain following the last branch we 1887 have optimized away. 1888 1889 This could be either a BRANCH, in which case the 1890 subsequence is internal, or it could be the item 1891 following the branch sequence in which case the 1892 subsequence is at the end (which does not 1893 necessarily mean the first node is the start of the 1894 alternation). 1895 1896 TRIE_TYPE(X) is a define which maps the optype to a 1897 trietype. 1898 1899 optype | trietype 1900 ----------------+----------- 1901 NOTHING | NOTHING 1902 EXACT | EXACT 1903 EXACT_REQ8 | EXACT 1904 EXACTFU | EXACTFU 1905 EXACTFU_REQ8 | EXACTFU 1906 EXACTFUP | EXACTFU 1907 EXACTFAA | EXACTFAA 1908 EXACTL | EXACTL 1909 EXACTFLU8 | EXACTFLU8 1910 1911 1912 */ 1913 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ 1914 ? NOTHING \ 1915 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \ 1916 ? EXACT \ 1917 : ( EXACTFU == (X) \ 1918 || EXACTFU_REQ8 == (X) \ 1919 || EXACTFUP == (X) ) \ 1920 ? EXACTFU \ 1921 : ( EXACTFAA == (X) ) \ 1922 ? EXACTFAA \ 1923 : ( EXACTL == (X) ) \ 1924 ? EXACTL \ 1925 : ( EXACTFLU8 == (X) ) \ 1926 ? EXACTFLU8 \ 1927 : 0 ) 1928 1929 /* dont use tail as the end marker for this traverse */ 1930 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { 1931 regnode * const noper = REGNODE_AFTER( cur ); 1932 U8 noper_type = OP( noper ); 1933 U8 noper_trietype = TRIE_TYPE( noper_type ); 1934 #if defined(DEBUGGING) || defined(NOJUMPTRIE) 1935 regnode * const noper_next = regnext( noper ); 1936 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; 1937 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; 1938 #endif 1939 1940 DEBUG_TRIE_COMPILE_r({ 1941 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 1942 Perl_re_indentf( aTHX_ "- %d:%s (%d)", 1943 depth+1, 1944 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); 1945 1946 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); 1947 Perl_re_printf( aTHX_ " -> %d:%s", 1948 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); 1949 1950 if ( noper_next ) { 1951 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); 1952 Perl_re_printf( aTHX_ "\t=> %d:%s\t", 1953 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); 1954 } 1955 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", 1956 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), 1957 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype) 1958 ); 1959 }); 1960 1961 /* Is noper a trieable nodetype that can be merged 1962 * with the current trie (if there is one)? */ 1963 if ( noper_trietype 1964 && 1965 ( 1966 ( noper_trietype == NOTHING ) 1967 || ( trietype == NOTHING ) 1968 || ( trietype == noper_trietype ) 1969 ) 1970 #ifdef NOJUMPTRIE 1971 && noper_next >= tail 1972 #endif 1973 && count < U16_MAX) 1974 { 1975 /* Handle mergable triable node Either we are 1976 * the first node in a new trieable sequence, 1977 * in which case we do some bookkeeping, 1978 * otherwise we update the end pointer. */ 1979 if ( !first ) { 1980 first = cur; 1981 if ( noper_trietype == NOTHING ) { 1982 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) 1983 regnode * const noper_next = regnext( noper ); 1984 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; 1985 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; 1986 #endif 1987 1988 if ( noper_next_trietype ) { 1989 trietype = noper_next_trietype; 1990 } else if (noper_next_type) { 1991 /* a NOTHING regop is 1 regop wide. 1992 * We need at least two for a trie 1993 * so we can't merge this in */ 1994 first = NULL; 1995 } 1996 } else { 1997 trietype = noper_trietype; 1998 } 1999 } else { 2000 if ( trietype == NOTHING ) 2001 trietype = noper_trietype; 2002 prev = cur; 2003 } 2004 if (first) 2005 count++; 2006 } /* end handle mergable triable node */ 2007 else { 2008 /* handle unmergable node - 2009 * noper may either be a triable node which can 2010 * not be tried together with the current trie, 2011 * or a non triable node */ 2012 if ( prev ) { 2013 /* If last is set and trietype is not 2014 * NOTHING then we have found at least two 2015 * triable branch sequences in a row of a 2016 * similar trietype so we can turn them 2017 * into a trie. If/when we allow NOTHING to 2018 * start a trie sequence this condition 2019 * will be required, and it isn't expensive 2020 * so we leave it in for now. */ 2021 if ( trietype && trietype != NOTHING ) 2022 make_trie( pRExC_state, 2023 startbranch, first, cur, tail, 2024 count, trietype, depth+1 ); 2025 prev = NULL; /* note: we clear/update 2026 first, trietype etc below, 2027 so we dont do it here */ 2028 } 2029 if ( noper_trietype 2030 #ifdef NOJUMPTRIE 2031 && noper_next >= tail 2032 #endif 2033 ){ 2034 /* noper is triable, so we can start a new 2035 * trie sequence */ 2036 count = 1; 2037 first = cur; 2038 trietype = noper_trietype; 2039 } else if (first) { 2040 /* if we already saw a first but the 2041 * current node is not triable then we have 2042 * to reset the first information. */ 2043 count = 0; 2044 first = NULL; 2045 trietype = 0; 2046 } 2047 } /* end handle unmergable node */ 2048 } /* loop over branches */ 2049 DEBUG_TRIE_COMPILE_r({ 2050 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 2051 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", 2052 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); 2053 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", 2054 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), 2055 REGNODE_NAME(trietype) 2056 ); 2057 2058 }); 2059 if ( prev && trietype ) { 2060 if ( trietype != NOTHING ) { 2061 /* the last branch of the sequence was part of 2062 * a trie, so we have to construct it here 2063 * outside of the loop */ 2064 made= make_trie( pRExC_state, startbranch, 2065 first, scan, tail, count, 2066 trietype, depth+1 ); 2067 #ifdef TRIE_STUDY_OPT 2068 if ( ((made == MADE_EXACT_TRIE && 2069 startbranch == first) 2070 || ( first_non_open == first )) && 2071 depth==0 ) { 2072 flags |= SCF_TRIE_RESTUDY; 2073 if ( startbranch == first 2074 && scan >= tail ) 2075 { 2076 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; 2077 } 2078 } 2079 #endif 2080 } else { 2081 /* at this point we know whatever we have is a 2082 * NOTHING sequence/branch AND if 'startbranch' 2083 * is 'first' then we can turn the whole thing 2084 * into a NOTHING 2085 */ 2086 if ( startbranch == first ) { 2087 regnode *opt; 2088 /* the entire thing is a NOTHING sequence, 2089 * something like this: (?:|) So we can 2090 * turn it into a plain NOTHING op. */ 2091 DEBUG_TRIE_COMPILE_r({ 2092 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 2093 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", 2094 depth+1, 2095 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); 2096 2097 }); 2098 OP(startbranch)= NOTHING; 2099 NEXT_OFF(startbranch)= tail - startbranch; 2100 for ( opt= startbranch + 1; opt < tail ; opt++ ) 2101 OP(opt)= OPTIMIZED; 2102 } 2103 } 2104 } /* end if ( prev) */ 2105 } /* TRIE_MAXBUF is non zero */ 2106 } /* do trie */ 2107 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta); 2108 } 2109 else 2110 scan = REGNODE_AFTER_opcode(scan,code); 2111 continue; 2112 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { 2113 I32 paren = 0; 2114 regnode *start = NULL; 2115 regnode *end = NULL; 2116 U32 my_recursed_depth= recursed_depth; 2117 2118 if (OP(scan) != SUSPEND) { /* GOSUB */ 2119 /* Do setup, note this code has side effects beyond 2120 * the rest of this block. Specifically setting 2121 * RExC_recurse[] must happen at least once during 2122 * study_chunk(). */ 2123 paren = ARG1u(scan); 2124 RExC_recurse[ARG2i(scan)] = scan; 2125 start = REGNODE_p(RExC_open_parens[paren]); 2126 end = REGNODE_p(RExC_close_parens[paren]); 2127 2128 /* NOTE we MUST always execute the above code, even 2129 * if we do nothing with a GOSUB */ 2130 if ( 2131 ( flags & SCF_IN_DEFINE ) 2132 || 2133 ( 2134 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) 2135 && 2136 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) 2137 ) 2138 ) { 2139 /* no need to do anything here if we are in a define. */ 2140 /* or we are after some kind of infinite construct 2141 * so we can skip recursing into this item. 2142 * Since it is infinite we will not change the maxlen 2143 * or delta, and if we miss something that might raise 2144 * the minlen it will merely pessimise a little. 2145 * 2146 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ 2147 * might result in a minlen of 1 and not of 4, 2148 * but this doesn't make us mismatch, just try a bit 2149 * harder than we should. 2150 * 2151 * However we must assume this GOSUB is infinite, to 2152 * avoid wrongly applying other optimizations in the 2153 * enclosing scope - see GH 18096, for example. 2154 */ 2155 is_inf = is_inf_internal = 1; 2156 scan= regnext(scan); 2157 continue; 2158 } 2159 2160 if ( 2161 !recursed_depth 2162 || !PAREN_TEST(recursed_depth - 1, paren) 2163 ) { 2164 /* it is quite possible that there are more efficient ways 2165 * to do this. We maintain a bitmap per level of recursion 2166 * of which patterns we have entered so we can detect if a 2167 * pattern creates a possible infinite loop. When we 2168 * recurse down a level we copy the previous levels bitmap 2169 * down. When we are at recursion level 0 we zero the top 2170 * level bitmap. It would be nice to implement a different 2171 * more efficient way of doing this. In particular the top 2172 * level bitmap may be unnecessary. 2173 */ 2174 if (!recursed_depth) { 2175 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); 2176 } else { 2177 Copy(PAREN_OFFSET(recursed_depth - 1), 2178 PAREN_OFFSET(recursed_depth), 2179 RExC_study_chunk_recursed_bytes, U8); 2180 } 2181 /* we havent recursed into this paren yet, so recurse into it */ 2182 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta); 2183 PAREN_SET(recursed_depth, paren); 2184 my_recursed_depth= recursed_depth + 1; 2185 } else { 2186 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta); 2187 /* some form of infinite recursion, assume infinite length 2188 * */ 2189 if (flags & SCF_DO_SUBSTR) { 2190 scan_commit(pRExC_state, data, minlenp, is_inf); 2191 data->cur_is_floating = 1; 2192 } 2193 is_inf = is_inf_internal = 1; 2194 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 2195 ssc_anything(data->start_class); 2196 flags &= ~SCF_DO_STCLASS; 2197 2198 start= NULL; /* reset start so we dont recurse later on. */ 2199 } 2200 } else { 2201 paren = stopparen; 2202 start = scan + 2; 2203 end = regnext(scan); 2204 } 2205 if (start) { 2206 scan_frame *newframe; 2207 assert(end); 2208 if (!RExC_frame_last) { 2209 Newxz(newframe, 1, scan_frame); 2210 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); 2211 RExC_frame_head= newframe; 2212 RExC_frame_count++; 2213 } else if (!RExC_frame_last->next_frame) { 2214 Newxz(newframe, 1, scan_frame); 2215 RExC_frame_last->next_frame= newframe; 2216 newframe->prev_frame= RExC_frame_last; 2217 RExC_frame_count++; 2218 } else { 2219 newframe= RExC_frame_last->next_frame; 2220 } 2221 RExC_frame_last= newframe; 2222 2223 newframe->next_regnode = regnext(scan); 2224 newframe->last_regnode = last; 2225 newframe->stopparen = stopparen; 2226 newframe->prev_recursed_depth = recursed_depth; 2227 newframe->this_prev_frame= frame; 2228 newframe->in_gosub = ( 2229 (frame && frame->in_gosub) || OP(scan) == GOSUB 2230 ); 2231 2232 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta); 2233 DEBUG_PEEP("fnew", scan, depth, flags); 2234 2235 frame = newframe; 2236 scan = start; 2237 stopparen = paren; 2238 last = end; 2239 depth = depth + 1; 2240 recursed_depth= my_recursed_depth; 2241 2242 continue; 2243 } 2244 } 2245 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) { 2246 SSize_t bytelen = STR_LEN(scan), charlen; 2247 UV uc; 2248 assert(bytelen); 2249 if (UTF) { 2250 const U8 * const s = (U8*)STRING(scan); 2251 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); 2252 charlen = utf8_length(s, s + bytelen); 2253 } else { 2254 uc = *((U8*)STRING(scan)); 2255 charlen = bytelen; 2256 } 2257 min += charlen; 2258 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 2259 /* The code below prefers earlier match for fixed 2260 offset, later match for variable offset. */ 2261 if (data->last_end == -1) { /* Update the start info. */ 2262 data->last_start_min = data->pos_min; 2263 data->last_start_max = 2264 is_inf ? OPTIMIZE_INFTY 2265 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min) 2266 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta; 2267 } 2268 sv_catpvn(data->last_found, STRING(scan), bytelen); 2269 if (UTF) 2270 SvUTF8_on(data->last_found); 2271 { 2272 SV * const sv = data->last_found; 2273 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 2274 mg_find(sv, PERL_MAGIC_utf8) : NULL; 2275 if (mg && mg->mg_len >= 0) 2276 mg->mg_len += charlen; 2277 } 2278 data->last_end = data->pos_min + charlen; 2279 data->pos_min += charlen; /* As in the first entry. */ 2280 data->flags &= ~SF_BEFORE_EOL; 2281 } 2282 2283 /* ANDing the code point leaves at most it, and not in locale, and 2284 * can't match null string */ 2285 if (flags & SCF_DO_STCLASS_AND) { 2286 ssc_cp_and(data->start_class, uc); 2287 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 2288 ssc_clear_locale(data->start_class); 2289 } 2290 else if (flags & SCF_DO_STCLASS_OR) { 2291 ssc_add_cp(data->start_class, uc); 2292 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 2293 2294 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 2295 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 2296 } 2297 flags &= ~SCF_DO_STCLASS; 2298 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta); 2299 } 2300 else if (REGNODE_TYPE(OP(scan)) == EXACT) { 2301 /* But OP != EXACT!, so is EXACTFish */ 2302 SSize_t bytelen = STR_LEN(scan), charlen; 2303 const U8 * s = (U8*)STRING(scan); 2304 2305 /* Replace a length 1 ASCII fold pair node with an ANYOFM node, 2306 * with the mask set to the complement of the bit that differs 2307 * between upper and lower case, and the lowest code point of the 2308 * pair (which the '&' forces) */ 2309 if ( bytelen == 1 2310 && isALPHA_A(*s) 2311 && ( OP(scan) == EXACTFAA 2312 || ( OP(scan) == EXACTFU 2313 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))) 2314 && mutate_ok 2315 ) { 2316 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ 2317 2318 OP(scan) = ANYOFM; 2319 ARG1u_SET(scan, *s & mask); 2320 FLAGS(scan) = mask; 2321 /* We're not EXACTFish any more, so restudy. 2322 * Search for "restudy" in this file to find 2323 * a comment with details. */ 2324 continue; 2325 } 2326 2327 /* Search for fixed substrings supports EXACT only. */ 2328 if (flags & SCF_DO_SUBSTR) { 2329 assert(data); 2330 scan_commit(pRExC_state, data, minlenp, is_inf); 2331 } 2332 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen; 2333 if (unfolded_multi_char) { 2334 RExC_seen |= REG_UNFOLDED_MULTI_SEEN; 2335 } 2336 min += charlen - min_subtract; 2337 assert (min >= 0); 2338 if ((SSize_t)min_subtract < OPTIMIZE_INFTY 2339 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract 2340 ) { 2341 delta += min_subtract; 2342 } else { 2343 delta = OPTIMIZE_INFTY; 2344 } 2345 if (flags & SCF_DO_SUBSTR) { 2346 data->pos_min += charlen - min_subtract; 2347 if (data->pos_min < 0) { 2348 data->pos_min = 0; 2349 } 2350 if ((SSize_t)min_subtract < OPTIMIZE_INFTY 2351 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract 2352 ) { 2353 data->pos_delta += min_subtract; 2354 } else { 2355 data->pos_delta = OPTIMIZE_INFTY; 2356 } 2357 if (min_subtract) { 2358 data->cur_is_floating = 1; /* float */ 2359 } 2360 } 2361 2362 if (flags & SCF_DO_STCLASS) { 2363 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan); 2364 2365 assert(EXACTF_invlist); 2366 if (flags & SCF_DO_STCLASS_AND) { 2367 if (OP(scan) != EXACTFL) 2368 ssc_clear_locale(data->start_class); 2369 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 2370 ANYOF_POSIXL_ZERO(data->start_class); 2371 ssc_intersection(data->start_class, EXACTF_invlist, FALSE); 2372 } 2373 else { /* SCF_DO_STCLASS_OR */ 2374 ssc_union(data->start_class, EXACTF_invlist, FALSE); 2375 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 2376 2377 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 2378 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 2379 } 2380 flags &= ~SCF_DO_STCLASS; 2381 SvREFCNT_dec(EXACTF_invlist); 2382 } 2383 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta); 2384 } 2385 else if (REGNODE_VARIES(OP(scan))) { 2386 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; 2387 I32 fl = 0; 2388 U32 f = flags; 2389 regnode * const oscan = scan; 2390 regnode_ssc this_class; 2391 regnode_ssc *oclass = NULL; 2392 I32 next_is_eval = 0; 2393 2394 switch (REGNODE_TYPE(OP(scan))) { 2395 case WHILEM: /* End of (?:...)* . */ 2396 scan = REGNODE_AFTER(scan); 2397 goto finish; 2398 case PLUS: 2399 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 2400 next = REGNODE_AFTER(scan); 2401 if ( ( REGNODE_TYPE(OP(next)) == EXACT 2402 && ! isEXACTFish(OP(next))) 2403 || (flags & SCF_DO_STCLASS)) 2404 { 2405 mincount = 1; 2406 maxcount = REG_INFTY; 2407 next = regnext(scan); 2408 scan = REGNODE_AFTER(scan); 2409 goto do_curly; 2410 } 2411 } 2412 if (flags & SCF_DO_SUBSTR) 2413 data->pos_min++; 2414 /* This will bypass the formal 'min += minnext * mincount' 2415 * calculation in the do_curly path, so assumes min width 2416 * of the PLUS payload is exactly one. */ 2417 min++; 2418 /* FALLTHROUGH */ 2419 case STAR: 2420 next = REGNODE_AFTER(scan); 2421 2422 /* This temporary node can now be turned into EXACTFU, and 2423 * must, as regexec.c doesn't handle it */ 2424 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) { 2425 OP(next) = EXACTFU; 2426 } 2427 2428 if ( STR_LEN(next) == 1 2429 && isALPHA_A(* STRING(next)) 2430 && ( OP(next) == EXACTFAA 2431 || ( OP(next) == EXACTFU 2432 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))) 2433 && mutate_ok 2434 ) { 2435 /* These differ in just one bit */ 2436 U8 mask = ~ ('A' ^ 'a'); 2437 2438 assert(isALPHA_A(* STRING(next))); 2439 2440 /* Then replace it by an ANYOFM node, with 2441 * the mask set to the complement of the 2442 * bit that differs between upper and lower 2443 * case, and the lowest code point of the 2444 * pair (which the '&' forces) */ 2445 OP(next) = ANYOFM; 2446 ARG1u_SET(next, *STRING(next) & mask); 2447 FLAGS(next) = mask; 2448 } 2449 2450 if (flags & SCF_DO_STCLASS) { 2451 mincount = 0; 2452 maxcount = REG_INFTY; 2453 next = regnext(scan); 2454 scan = REGNODE_AFTER(scan); 2455 goto do_curly; 2456 } 2457 if (flags & SCF_DO_SUBSTR) { 2458 scan_commit(pRExC_state, data, minlenp, is_inf); 2459 /* Cannot extend fixed substrings */ 2460 data->cur_is_floating = 1; /* float */ 2461 } 2462 is_inf = is_inf_internal = 1; 2463 scan = regnext(scan); 2464 goto optimize_curly_tail; 2465 case CURLY: 2466 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) 2467 && (FLAGS(scan) == stopparen)) 2468 { 2469 mincount = 1; 2470 maxcount = 1; 2471 } else { 2472 mincount = ARG1i(scan); 2473 maxcount = ARG2i(scan); 2474 } 2475 next = regnext(scan); 2476 if (OP(scan) == CURLYX) { 2477 I32 lp = (data ? *(data->last_closep) : 0); 2478 FLAGS(scan) = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); 2479 } 2480 scan = REGNODE_AFTER(scan); 2481 next_is_eval = (OP(scan) == EVAL); 2482 do_curly: 2483 if (flags & SCF_DO_SUBSTR) { 2484 if (mincount == 0) 2485 scan_commit(pRExC_state, data, minlenp, is_inf); 2486 /* Cannot extend fixed substrings */ 2487 pos_before = data->pos_min; 2488 } 2489 if (data) { 2490 fl = data->flags; 2491 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 2492 if (is_inf) 2493 data->flags |= SF_IS_INF; 2494 } 2495 if (flags & SCF_DO_STCLASS) { 2496 ssc_init(pRExC_state, &this_class); 2497 oclass = data->start_class; 2498 data->start_class = &this_class; 2499 f |= SCF_DO_STCLASS_AND; 2500 f &= ~SCF_DO_STCLASS_OR; 2501 } 2502 /* Exclude from super-linear cache processing any {n,m} 2503 regops for which the combination of input pos and regex 2504 pos is not enough information to determine if a match 2505 will be possible. 2506 2507 For example, in the regex /foo(bar\s*){4,8}baz/ with the 2508 regex pos at the \s*, the prospects for a match depend not 2509 only on the input position but also on how many (bar\s*) 2510 repeats into the {4,8} we are. */ 2511 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) 2512 f &= ~SCF_WHILEM_VISITED_POS; 2513 2514 /* This will finish on WHILEM, setting scan, or on NULL: */ 2515 /* recurse study_chunk() on loop bodies */ 2516 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 2517 last, data, stopparen, recursed_depth, NULL, 2518 (mincount == 0 2519 ? (f & ~SCF_DO_SUBSTR) 2520 : f) 2521 , depth+1, mutate_ok); 2522 2523 if (data && data->flags & SCF_SEEN_ACCEPT) { 2524 if (mincount > 1) 2525 mincount = 1; 2526 } 2527 2528 if (flags & SCF_DO_STCLASS) 2529 data->start_class = oclass; 2530 if (mincount == 0 || minnext == 0) { 2531 if (flags & SCF_DO_STCLASS_OR) { 2532 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 2533 } 2534 else if (flags & SCF_DO_STCLASS_AND) { 2535 /* Switch to OR mode: cache the old value of 2536 * data->start_class */ 2537 INIT_AND_WITHP; 2538 StructCopy(data->start_class, and_withp, regnode_ssc); 2539 flags &= ~SCF_DO_STCLASS_AND; 2540 StructCopy(&this_class, data->start_class, regnode_ssc); 2541 flags |= SCF_DO_STCLASS_OR; 2542 ANYOF_FLAGS(data->start_class) 2543 |= SSC_MATCHES_EMPTY_STRING; 2544 } 2545 } else { /* Non-zero len */ 2546 if (flags & SCF_DO_STCLASS_OR) { 2547 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 2548 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 2549 } 2550 else if (flags & SCF_DO_STCLASS_AND) 2551 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 2552 flags &= ~SCF_DO_STCLASS; 2553 } 2554 if (!scan) /* It was not CURLYX, but CURLY. */ 2555 scan = next; 2556 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR) 2557 /* ? quantifier ok, except for (?{ ... }) */ 2558 && (next_is_eval || !(mincount == 0 && maxcount == 1)) 2559 && (minnext == 0) && (deltanext == 0) 2560 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 2561 && maxcount <= REG_INFTY/3) /* Complement check for big 2562 count */ 2563 { 2564 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP), 2565 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), 2566 "Quantifier unexpected on zero-length expression " 2567 "in regex m/%" UTF8f "/", 2568 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, 2569 RExC_precomp))); 2570 } 2571 2572 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext ) 2573 || min >= SSize_t_MAX - minnext * mincount ) 2574 { 2575 FAIL("Regexp out of space"); 2576 } 2577 2578 min += minnext * mincount; 2579 is_inf_internal |= deltanext == OPTIMIZE_INFTY 2580 || (maxcount == REG_INFTY && minnext + deltanext > 0); 2581 is_inf |= is_inf_internal; 2582 if (is_inf) { 2583 delta = OPTIMIZE_INFTY; 2584 } else { 2585 delta += (minnext + deltanext) * maxcount 2586 - minnext * mincount; 2587 } 2588 2589 if (data && data->flags & SCF_SEEN_ACCEPT) { 2590 if (flags & SCF_DO_SUBSTR) { 2591 scan_commit(pRExC_state, data, minlenp, is_inf); 2592 flags &= ~SCF_DO_SUBSTR; 2593 } 2594 if (stopmin > min) 2595 stopmin = min; 2596 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta); 2597 } 2598 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta); 2599 /* Try powerful optimization CURLYX => CURLYN. */ 2600 if ( RE_OPTIMIZE_CURLYX_TO_CURLYN 2601 && OP(oscan) == CURLYX 2602 && data 2603 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a 2604 non optimistic eval is seen 2605 anywhere.*/ 2606 && ( data->flags & SF_IN_PAR ) /* has parens */ 2607 && !deltanext 2608 && minnext == 1 2609 && mutate_ok 2610 ) { 2611 DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta); 2612 /* Try to optimize to CURLYN. */ 2613 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); 2614 regnode * const nxt1 = nxt; 2615 #ifdef DEBUGGING 2616 regnode *nxt2; 2617 #endif 2618 /* Skip open. */ 2619 nxt = regnext(nxt); 2620 if (!REGNODE_SIMPLE(OP(nxt)) 2621 && !(REGNODE_TYPE(OP(nxt)) == EXACT 2622 && STR_LEN(nxt) == 1)) 2623 goto nogo; 2624 #ifdef DEBUGGING 2625 nxt2 = nxt; 2626 #endif 2627 nxt = regnext(nxt); 2628 if (OP(nxt) != CLOSE) 2629 goto nogo; 2630 if (RExC_open_parens) { 2631 2632 /*open->CURLYM*/ 2633 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); 2634 2635 /*close->while*/ 2636 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2; 2637 } 2638 /* Now we know that nxt2 is the only contents: */ 2639 FLAGS(oscan) = (U8)PARNO(nxt); 2640 OP(oscan) = CURLYN; 2641 OP(nxt1) = NOTHING; /* was OPEN. */ 2642 2643 #ifdef DEBUGGING 2644 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 2645 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ 2646 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ 2647 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 2648 OP(nxt + 1) = OPTIMIZED; /* was count. */ 2649 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ 2650 #endif 2651 } 2652 nogo: 2653 2654 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta); 2655 2656 /* Try optimization CURLYX => CURLYM. */ 2657 if ( RE_OPTIMIZE_CURLYX_TO_CURLYM 2658 && OP(oscan) == CURLYX 2659 && data 2660 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a 2661 non optimistic eval is seen 2662 anywhere.*/ 2663 && !(data->flags & SF_HAS_PAR) /* no parens! */ 2664 && !deltanext /* atom is fixed width */ 2665 && minnext != 0 /* CURLYM can't handle zero width */ 2666 /* Nor characters whose fold at run-time may be 2667 * multi-character */ 2668 && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN) 2669 && mutate_ok 2670 ) { 2671 DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta); 2672 /* XXXX How to optimize if data == 0? */ 2673 /* Optimize to a simpler form. */ 2674 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */ 2675 regnode *nxt2; 2676 2677 OP(oscan) = CURLYM; 2678 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 2679 && (OP(nxt2) != WHILEM)) 2680 nxt = nxt2; 2681 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 2682 /* Need to optimize away parenths. */ 2683 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { 2684 /* Set the parenth number. */ 2685 /* note that we have changed the type of oscan to CURLYM here */ 2686 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/ 2687 2688 FLAGS(oscan) = (U8)PARNO(nxt); 2689 if (RExC_open_parens) { 2690 /*open->CURLYM*/ 2691 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); 2692 2693 /*close->NOTHING*/ 2694 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2) 2695 + 1; 2696 } 2697 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 2698 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 2699 2700 #ifdef DEBUGGING 2701 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 2702 OP(nxt + 1) = OPTIMIZED; /* was count. */ 2703 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ 2704 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ 2705 #endif 2706 #if 0 2707 while ( nxt1 && (OP(nxt1) != WHILEM)) { 2708 regnode *nnxt = regnext(nxt1); 2709 if (nnxt == nxt) { 2710 if (REGNODE_OFF_BY_ARG(OP(nxt1))) 2711 ARG1u_SET(nxt1, nxt2 - nxt1); 2712 else if (nxt2 - nxt1 < U16_MAX) 2713 NEXT_OFF(nxt1) = nxt2 - nxt1; 2714 else 2715 OP(nxt) = NOTHING; /* Cannot beautify */ 2716 } 2717 nxt1 = nnxt; 2718 } 2719 #endif 2720 /* Optimize again: */ 2721 /* recurse study_chunk() on optimised CURLYX => CURLYM */ 2722 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, 2723 NULL, stopparen, recursed_depth, NULL, 0, 2724 depth+1, mutate_ok); 2725 } 2726 else 2727 FLAGS(oscan) = 0; 2728 } 2729 else if ((OP(oscan) == CURLYX) 2730 && (flags & SCF_WHILEM_VISITED_POS) 2731 /* See the comment on a similar expression above. 2732 However, this time it's not a subexpression 2733 we care about, but the expression itself. */ 2734 && (maxcount == REG_INFTY) 2735 && data) { 2736 /* This stays as CURLYX, we can put the count/of pair. */ 2737 /* Find WHILEM (as in regexec.c) */ 2738 regnode *nxt = oscan + NEXT_OFF(oscan); 2739 2740 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */ 2741 nxt += ARG1u(nxt); 2742 nxt = REGNODE_BEFORE(nxt); 2743 if (FLAGS(nxt) & 0xf) { 2744 /* we've already set whilem count on this node */ 2745 } else if (++data->whilem_c < 16) { 2746 assert(data->whilem_c <= RExC_whilem_seen); 2747 FLAGS(nxt) = (U8)(data->whilem_c 2748 | (RExC_whilem_seen << 4)); /* On WHILEM */ 2749 } 2750 } 2751 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 2752 pars++; 2753 if (flags & SCF_DO_SUBSTR) { 2754 SV *last_str = NULL; 2755 STRLEN last_chrs = 0; 2756 int counted = mincount != 0; 2757 2758 if (data->last_end > 0 && mincount != 0) { /* Ends with a 2759 string. */ 2760 SSize_t b = pos_before >= data->last_start_min 2761 ? pos_before : data->last_start_min; 2762 STRLEN l; 2763 const char * const s = SvPV_const(data->last_found, l); 2764 SSize_t old = b - data->last_start_min; 2765 assert(old >= 0); 2766 2767 if (UTF) 2768 old = utf8_hop_forward((U8*)s, old, 2769 (U8 *) SvEND(data->last_found)) 2770 - (U8*)s; 2771 l -= old; 2772 /* Get the added string: */ 2773 last_str = newSVpvn_utf8(s + old, l, UTF); 2774 last_chrs = UTF ? utf8_length((U8*)(s + old), 2775 (U8*)(s + old + l)) : l; 2776 if (deltanext == 0 && pos_before == b) { 2777 /* What was added is a constant string */ 2778 if (mincount > 1) { 2779 2780 SvGROW(last_str, (mincount * l) + 1); 2781 repeatcpy(SvPVX(last_str) + l, 2782 SvPVX_const(last_str), l, 2783 mincount - 1); 2784 SvCUR_set(last_str, SvCUR(last_str) * mincount); 2785 /* Add additional parts. */ 2786 SvCUR_set(data->last_found, 2787 SvCUR(data->last_found) - l); 2788 sv_catsv(data->last_found, last_str); 2789 { 2790 SV * sv = data->last_found; 2791 MAGIC *mg = 2792 SvUTF8(sv) && SvMAGICAL(sv) ? 2793 mg_find(sv, PERL_MAGIC_utf8) : NULL; 2794 if (mg && mg->mg_len >= 0) 2795 mg->mg_len += last_chrs * (mincount-1); 2796 } 2797 last_chrs *= mincount; 2798 data->last_end += l * (mincount - 1); 2799 } 2800 } else { 2801 /* start offset must point into the last copy */ 2802 data->last_start_min += minnext * (mincount - 1); 2803 data->last_start_max = 2804 is_inf 2805 ? OPTIMIZE_INFTY 2806 : data->last_start_max + 2807 (maxcount - 1) * (minnext + data->pos_delta); 2808 } 2809 } 2810 /* It is counted once already... */ 2811 data->pos_min += minnext * (mincount - counted); 2812 #if 0 2813 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf 2814 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf 2815 " maxcount=%" UVuf " mincount=%" UVuf 2816 " data->pos_delta=%" UVuf "\n", 2817 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, 2818 (UV)maxcount, (UV)mincount, (UV)data->pos_delta); 2819 if (deltanext != OPTIMIZE_INFTY) 2820 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", 2821 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount 2822 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta)); 2823 #endif 2824 if (deltanext == OPTIMIZE_INFTY 2825 || data->pos_delta == OPTIMIZE_INFTY 2826 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta) 2827 data->pos_delta = OPTIMIZE_INFTY; 2828 else 2829 data->pos_delta += - counted * deltanext + 2830 (minnext + deltanext) * maxcount - minnext * mincount; 2831 if (mincount != maxcount) { 2832 /* Cannot extend fixed substrings found inside 2833 the group. */ 2834 scan_commit(pRExC_state, data, minlenp, is_inf); 2835 if (mincount && last_str) { 2836 SV * const sv = data->last_found; 2837 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 2838 mg_find(sv, PERL_MAGIC_utf8) : NULL; 2839 2840 if (mg) 2841 mg->mg_len = -1; 2842 sv_setsv(sv, last_str); 2843 data->last_end = data->pos_min; 2844 data->last_start_min = data->pos_min - last_chrs; 2845 data->last_start_max = is_inf 2846 ? OPTIMIZE_INFTY 2847 : data->pos_min + data->pos_delta - last_chrs; 2848 } 2849 data->cur_is_floating = 1; /* float */ 2850 } 2851 SvREFCNT_dec(last_str); 2852 } 2853 if (data && (fl & SF_HAS_EVAL)) 2854 data->flags |= SF_HAS_EVAL; 2855 optimize_curly_tail: 2856 rck_elide_nothing(oscan); 2857 continue; 2858 2859 default: 2860 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", 2861 OP(scan)); 2862 case REF: 2863 case CLUMP: 2864 if (flags & SCF_DO_SUBSTR) { 2865 /* Cannot expect anything... */ 2866 scan_commit(pRExC_state, data, minlenp, is_inf); 2867 data->cur_is_floating = 1; /* float */ 2868 } 2869 is_inf = is_inf_internal = 1; 2870 if (flags & SCF_DO_STCLASS_OR) { 2871 if (OP(scan) == CLUMP) { 2872 /* Actually is any start char, but very few code points 2873 * aren't start characters */ 2874 ssc_match_all_cp(data->start_class); 2875 } 2876 else { 2877 ssc_anything(data->start_class); 2878 } 2879 } 2880 flags &= ~SCF_DO_STCLASS; 2881 break; 2882 } 2883 } 2884 else if (OP(scan) == LNBREAK) { 2885 if (flags & SCF_DO_STCLASS) { 2886 if (flags & SCF_DO_STCLASS_AND) { 2887 ssc_intersection(data->start_class, 2888 PL_XPosix_ptrs[CC_VERTSPACE_], FALSE); 2889 ssc_clear_locale(data->start_class); 2890 ANYOF_FLAGS(data->start_class) 2891 &= ~SSC_MATCHES_EMPTY_STRING; 2892 } 2893 else if (flags & SCF_DO_STCLASS_OR) { 2894 ssc_union(data->start_class, 2895 PL_XPosix_ptrs[CC_VERTSPACE_], 2896 FALSE); 2897 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 2898 2899 /* See commit msg for 2900 * 749e076fceedeb708a624933726e7989f2302f6a */ 2901 ANYOF_FLAGS(data->start_class) 2902 &= ~SSC_MATCHES_EMPTY_STRING; 2903 } 2904 flags &= ~SCF_DO_STCLASS; 2905 } 2906 min++; 2907 if (delta != OPTIMIZE_INFTY) 2908 delta++; /* Because of the 2 char string cr-lf */ 2909 if (flags & SCF_DO_SUBSTR) { 2910 /* Cannot expect anything... */ 2911 scan_commit(pRExC_state, data, minlenp, is_inf); 2912 data->pos_min += 1; 2913 if (data->pos_delta != OPTIMIZE_INFTY) { 2914 data->pos_delta += 1; 2915 } 2916 data->cur_is_floating = 1; /* float */ 2917 } 2918 } 2919 else if (REGNODE_SIMPLE(OP(scan))) { 2920 2921 if (flags & SCF_DO_SUBSTR) { 2922 scan_commit(pRExC_state, data, minlenp, is_inf); 2923 data->pos_min++; 2924 } 2925 min++; 2926 if (flags & SCF_DO_STCLASS) { 2927 bool invert = 0; 2928 SV* my_invlist = NULL; 2929 U8 namedclass; 2930 2931 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 2932 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 2933 2934 /* Some of the logic below assumes that switching 2935 locale on will only add false positives. */ 2936 switch (OP(scan)) { 2937 2938 default: 2939 #ifdef DEBUGGING 2940 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", 2941 OP(scan)); 2942 #endif 2943 case SANY: 2944 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 2945 ssc_match_all_cp(data->start_class); 2946 break; 2947 2948 case REG_ANY: 2949 { 2950 SV* REG_ANY_invlist = _new_invlist(2); 2951 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, 2952 '\n'); 2953 if (flags & SCF_DO_STCLASS_OR) { 2954 ssc_union(data->start_class, 2955 REG_ANY_invlist, 2956 TRUE /* TRUE => invert, hence all but \n 2957 */ 2958 ); 2959 } 2960 else if (flags & SCF_DO_STCLASS_AND) { 2961 ssc_intersection(data->start_class, 2962 REG_ANY_invlist, 2963 TRUE /* TRUE => invert */ 2964 ); 2965 ssc_clear_locale(data->start_class); 2966 } 2967 SvREFCNT_dec_NN(REG_ANY_invlist); 2968 } 2969 break; 2970 2971 case ANYOFD: 2972 case ANYOFL: 2973 case ANYOFPOSIXL: 2974 case ANYOFH: 2975 case ANYOFHb: 2976 case ANYOFHr: 2977 case ANYOFHs: 2978 case ANYOF: 2979 if (flags & SCF_DO_STCLASS_AND) 2980 ssc_and(pRExC_state, data->start_class, 2981 (regnode_charclass *) scan); 2982 else 2983 ssc_or(pRExC_state, data->start_class, 2984 (regnode_charclass *) scan); 2985 break; 2986 2987 case ANYOFHbbm: 2988 { 2989 SV* cp_list = get_ANYOFHbbm_contents(scan); 2990 2991 if (flags & SCF_DO_STCLASS_OR) { 2992 ssc_union(data->start_class, cp_list, invert); 2993 } 2994 else if (flags & SCF_DO_STCLASS_AND) { 2995 ssc_intersection(data->start_class, cp_list, invert); 2996 } 2997 2998 SvREFCNT_dec_NN(cp_list); 2999 break; 3000 } 3001 3002 case NANYOFM: /* NANYOFM already contains the inversion of the 3003 input ANYOF data, so, unlike things like 3004 NPOSIXA, don't change 'invert' to TRUE */ 3005 /* FALLTHROUGH */ 3006 case ANYOFM: 3007 { 3008 SV* cp_list = get_ANYOFM_contents(scan); 3009 3010 if (flags & SCF_DO_STCLASS_OR) { 3011 ssc_union(data->start_class, cp_list, invert); 3012 } 3013 else if (flags & SCF_DO_STCLASS_AND) { 3014 ssc_intersection(data->start_class, cp_list, invert); 3015 } 3016 3017 SvREFCNT_dec_NN(cp_list); 3018 break; 3019 } 3020 3021 case ANYOFR: 3022 case ANYOFRb: 3023 { 3024 SV* cp_list = NULL; 3025 3026 cp_list = _add_range_to_invlist(cp_list, 3027 ANYOFRbase(scan), 3028 ANYOFRbase(scan) + ANYOFRdelta(scan)); 3029 3030 if (flags & SCF_DO_STCLASS_OR) { 3031 ssc_union(data->start_class, cp_list, invert); 3032 } 3033 else if (flags & SCF_DO_STCLASS_AND) { 3034 ssc_intersection(data->start_class, cp_list, invert); 3035 } 3036 3037 SvREFCNT_dec_NN(cp_list); 3038 break; 3039 } 3040 3041 case NPOSIXL: 3042 invert = 1; 3043 /* FALLTHROUGH */ 3044 3045 case POSIXL: 3046 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; 3047 if (flags & SCF_DO_STCLASS_AND) { 3048 bool was_there = cBOOL( 3049 ANYOF_POSIXL_TEST(data->start_class, 3050 namedclass)); 3051 ANYOF_POSIXL_ZERO(data->start_class); 3052 if (was_there) { /* Do an AND */ 3053 ANYOF_POSIXL_SET(data->start_class, namedclass); 3054 } 3055 /* No individual code points can now match */ 3056 data->start_class->invlist 3057 = sv_2mortal(_new_invlist(0)); 3058 } 3059 else { 3060 int complement = namedclass + ((invert) ? -1 : 1); 3061 3062 assert(flags & SCF_DO_STCLASS_OR); 3063 3064 /* If the complement of this class was already there, 3065 * the result is that they match all code points, 3066 * (\d + \D == everything). Remove the classes from 3067 * future consideration. Locale is not relevant in 3068 * this case */ 3069 if (ANYOF_POSIXL_TEST(data->start_class, complement)) { 3070 ssc_match_all_cp(data->start_class); 3071 ANYOF_POSIXL_CLEAR(data->start_class, namedclass); 3072 ANYOF_POSIXL_CLEAR(data->start_class, complement); 3073 } 3074 else { /* The usual case; just add this class to the 3075 existing set */ 3076 ANYOF_POSIXL_SET(data->start_class, namedclass); 3077 } 3078 } 3079 break; 3080 3081 case NPOSIXA: /* For these, we always know the exact set of 3082 what's matched */ 3083 invert = 1; 3084 /* FALLTHROUGH */ 3085 case POSIXA: 3086 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); 3087 goto join_posix_and_ascii; 3088 3089 case NPOSIXD: 3090 case NPOSIXU: 3091 invert = 1; 3092 /* FALLTHROUGH */ 3093 case POSIXD: 3094 case POSIXU: 3095 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL); 3096 3097 /* NPOSIXD matches all upper Latin1 code points unless the 3098 * target string being matched is UTF-8, which is 3099 * unknowable until match time. Since we are going to 3100 * invert, we want to get rid of all of them so that the 3101 * inversion will match all */ 3102 if (OP(scan) == NPOSIXD) { 3103 _invlist_subtract(my_invlist, PL_UpperLatin1, 3104 &my_invlist); 3105 } 3106 3107 join_posix_and_ascii: 3108 3109 if (flags & SCF_DO_STCLASS_AND) { 3110 ssc_intersection(data->start_class, my_invlist, invert); 3111 ssc_clear_locale(data->start_class); 3112 } 3113 else { 3114 assert(flags & SCF_DO_STCLASS_OR); 3115 ssc_union(data->start_class, my_invlist, invert); 3116 } 3117 SvREFCNT_dec(my_invlist); 3118 } 3119 if (flags & SCF_DO_STCLASS_OR) 3120 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 3121 flags &= ~SCF_DO_STCLASS; 3122 } 3123 } 3124 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) { 3125 data->flags |= (OP(scan) == MEOL 3126 ? SF_BEFORE_MEOL 3127 : SF_BEFORE_SEOL); 3128 scan_commit(pRExC_state, data, minlenp, is_inf); 3129 3130 } 3131 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ 3132 /* Lookbehind, or need to calculate parens/evals/stclass: */ 3133 && (FLAGS(scan) || data || (flags & SCF_DO_STCLASS)) 3134 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) 3135 { 3136 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3137 || OP(scan) == UNLESSM ) 3138 { 3139 /* Negative Lookahead/lookbehind 3140 In this case we can't do fixed string optimisation. 3141 */ 3142 3143 bool is_positive = OP(scan) == IFMATCH ? 1 : 0; 3144 SSize_t deltanext, minnext; 3145 SSize_t fake_last_close = 0; 3146 regnode *fake_last_close_op = NULL; 3147 regnode *cur_last_close_op; 3148 regnode *nscan; 3149 regnode_ssc intrnl; 3150 U32 f = (flags & SCF_TRIE_DOING_RESTUDY); 3151 3152 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 3153 if (data) { 3154 data_fake.whilem_c = data->whilem_c; 3155 data_fake.last_closep = data->last_closep; 3156 data_fake.last_close_opp = data->last_close_opp; 3157 } 3158 else { 3159 data_fake.last_closep = &fake_last_close; 3160 data_fake.last_close_opp = &fake_last_close_op; 3161 } 3162 3163 /* remember the last_close_op we saw so we can see if 3164 * we are dealing with variable length lookbehind that 3165 * contains capturing buffers, which are considered 3166 * experimental */ 3167 cur_last_close_op= *(data_fake.last_close_opp); 3168 3169 data_fake.pos_delta = delta; 3170 if ( flags & SCF_DO_STCLASS && !FLAGS(scan) 3171 && OP(scan) == IFMATCH ) { /* Lookahead */ 3172 ssc_init(pRExC_state, &intrnl); 3173 data_fake.start_class = &intrnl; 3174 f |= SCF_DO_STCLASS_AND; 3175 } 3176 if (flags & SCF_WHILEM_VISITED_POS) 3177 f |= SCF_WHILEM_VISITED_POS; 3178 next = regnext(scan); 3179 nscan = REGNODE_AFTER(scan); 3180 3181 /* recurse study_chunk() for lookahead body */ 3182 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 3183 last, &data_fake, stopparen, 3184 recursed_depth, NULL, f, depth+1, 3185 mutate_ok); 3186 3187 if (FLAGS(scan)) { 3188 if ( deltanext < 0 3189 || deltanext > (I32) U8_MAX 3190 || minnext > (I32)U8_MAX 3191 || minnext + deltanext > (I32)U8_MAX) 3192 { 3193 FAIL2("Lookbehind longer than %" UVuf " not implemented", 3194 (UV)U8_MAX); 3195 } 3196 3197 /* The 'next_off' field has been repurposed to count the 3198 * additional starting positions to try beyond the initial 3199 * one. (This leaves it at 0 for non-variable length 3200 * matches to avoid breakage for those not using this 3201 * extension) */ 3202 if (deltanext) { 3203 NEXT_OFF(scan) = deltanext; 3204 if ( 3205 /* See a CLOSE op inside this lookbehind? */ 3206 cur_last_close_op != *(data_fake.last_close_opp) 3207 /* and not doing restudy. see: restudied */ 3208 && !(flags & SCF_TRIE_DOING_RESTUDY) 3209 ) { 3210 /* this is positive variable length lookbehind with 3211 * capture buffers inside of it */ 3212 ckWARNexperimental_with_arg(RExC_parse, 3213 WARN_EXPERIMENTAL__VLB, 3214 "Variable length %s lookbehind with capturing is experimental", 3215 is_positive ? "positive" : "negative"); 3216 } 3217 } 3218 FLAGS(scan) = (U8)minnext + deltanext; 3219 } 3220 if (data) { 3221 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3222 pars++; 3223 if (data_fake.flags & SF_HAS_EVAL) 3224 data->flags |= SF_HAS_EVAL; 3225 data->whilem_c = data_fake.whilem_c; 3226 } 3227 if (f & SCF_DO_STCLASS_AND) { 3228 if (flags & SCF_DO_STCLASS_OR) { 3229 /* OR before, AND after: ideally we would recurse with 3230 * data_fake to get the AND applied by study of the 3231 * remainder of the pattern, and then derecurse; 3232 * *** HACK *** for now just treat as "no information". 3233 * See [perl #56690]. 3234 */ 3235 ssc_init(pRExC_state, data->start_class); 3236 } else { 3237 /* AND before and after: combine and continue. These 3238 * assertions are zero-length, so can match an EMPTY 3239 * string */ 3240 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 3241 ANYOF_FLAGS(data->start_class) 3242 |= SSC_MATCHES_EMPTY_STRING; 3243 } 3244 } 3245 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta); 3246 } 3247 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3248 else { 3249 /* Positive Lookahead/lookbehind 3250 In this case we can do fixed string optimisation, 3251 but we must be careful about it. Note in the case of 3252 lookbehind the positions will be offset by the minimum 3253 length of the pattern, something we won't know about 3254 until after the recurse. 3255 */ 3256 SSize_t deltanext, fake_last_close = 0; 3257 regnode *last_close_op = NULL; 3258 regnode *nscan; 3259 regnode_ssc intrnl; 3260 U32 f = (flags & SCF_TRIE_DOING_RESTUDY); 3261 /* We use SAVEFREEPV so that when the full compile 3262 is finished perl will clean up the allocated 3263 minlens when it's all done. This way we don't 3264 have to worry about freeing them when we know 3265 they wont be used, which would be a pain. 3266 */ 3267 SSize_t *minnextp; 3268 Newx( minnextp, 1, SSize_t ); 3269 SAVEFREEPV(minnextp); 3270 3271 if (data) { 3272 StructCopy(data, &data_fake, scan_data_t); 3273 if ((flags & SCF_DO_SUBSTR) && data->last_found) { 3274 f |= SCF_DO_SUBSTR; 3275 if (FLAGS(scan)) 3276 scan_commit(pRExC_state, &data_fake, minlenp, is_inf); 3277 data_fake.last_found=newSVsv(data->last_found); 3278 } 3279 } 3280 else { 3281 data_fake.last_closep = &fake_last_close; 3282 data_fake.last_close_opp = &fake_last_close_opp; 3283 } 3284 data_fake.flags = 0; 3285 data_fake.substrs[0].flags = 0; 3286 data_fake.substrs[1].flags = 0; 3287 data_fake.pos_delta = delta; 3288 if (is_inf) 3289 data_fake.flags |= SF_IS_INF; 3290 if ( flags & SCF_DO_STCLASS && !FLAGS(scan) 3291 && OP(scan) == IFMATCH ) { /* Lookahead */ 3292 ssc_init(pRExC_state, &intrnl); 3293 data_fake.start_class = &intrnl; 3294 f |= SCF_DO_STCLASS_AND; 3295 } 3296 if (flags & SCF_WHILEM_VISITED_POS) 3297 f |= SCF_WHILEM_VISITED_POS; 3298 next = regnext(scan); 3299 nscan = REGNODE_AFTER(scan); 3300 3301 /* positive lookahead study_chunk() recursion */ 3302 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, 3303 &deltanext, last, &data_fake, 3304 stopparen, recursed_depth, NULL, 3305 f, depth+1, mutate_ok); 3306 if (FLAGS(scan)) { 3307 assert(0); /* This code has never been tested since this 3308 is normally not compiled */ 3309 if ( deltanext < 0 3310 || deltanext > (I32) U8_MAX 3311 || *minnextp > (I32)U8_MAX 3312 || *minnextp + deltanext > (I32)U8_MAX) 3313 { 3314 FAIL2("Lookbehind longer than %" UVuf " not implemented", 3315 (UV)U8_MAX); 3316 } 3317 3318 if (deltanext) { 3319 NEXT_OFF(scan) = deltanext; 3320 } 3321 FLAGS(scan) = (U8)*minnextp + deltanext; 3322 } 3323 3324 *minnextp += min; 3325 3326 if (f & SCF_DO_STCLASS_AND) { 3327 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 3328 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; 3329 } 3330 if (data) { 3331 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3332 pars++; 3333 if (data_fake.flags & SF_HAS_EVAL) 3334 data->flags |= SF_HAS_EVAL; 3335 data->whilem_c = data_fake.whilem_c; 3336 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { 3337 int i; 3338 if (RExC_rx->minlen < *minnextp) 3339 RExC_rx->minlen = *minnextp; 3340 scan_commit(pRExC_state, &data_fake, minnextp, is_inf); 3341 SvREFCNT_dec_NN(data_fake.last_found); 3342 3343 for (i = 0; i < 2; i++) { 3344 if (data_fake.substrs[i].minlenp != minlenp) { 3345 data->substrs[i].min_offset = 3346 data_fake.substrs[i].min_offset; 3347 data->substrs[i].max_offset = 3348 data_fake.substrs[i].max_offset; 3349 data->substrs[i].minlenp = 3350 data_fake.substrs[i].minlenp; 3351 data->substrs[i].lookbehind += FLAGS(scan); 3352 } 3353 } 3354 } 3355 } 3356 } 3357 #endif 3358 } 3359 else if (OP(scan) == OPEN) { 3360 if (stopparen != (I32)PARNO(scan)) 3361 pars++; 3362 } 3363 else if (OP(scan) == CLOSE) { 3364 if (stopparen == (I32)PARNO(scan)) { 3365 break; 3366 } 3367 if ((I32)PARNO(scan) == is_par) { 3368 next = regnext(scan); 3369 3370 if ( next && (OP(next) != WHILEM) && next < last) 3371 is_par = 0; /* Disable optimization */ 3372 } 3373 if (data) { 3374 *(data->last_closep) = PARNO(scan); 3375 *(data->last_close_opp) = scan; 3376 } 3377 } 3378 else if (OP(scan) == EVAL) { 3379 if (data && !(FLAGS(scan) & EVAL_OPTIMISTIC_FLAG) ) 3380 data->flags |= SF_HAS_EVAL; 3381 } 3382 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) { 3383 if (flags & SCF_DO_SUBSTR) { 3384 scan_commit(pRExC_state, data, minlenp, is_inf); 3385 flags &= ~SCF_DO_SUBSTR; 3386 } 3387 if (OP(scan)==ACCEPT) { 3388 /* m{(*ACCEPT)x} does not have to start with 'x' */ 3389 flags &= ~SCF_DO_STCLASS; 3390 if (data) 3391 data->flags |= SCF_SEEN_ACCEPT; 3392 if (stopmin > min) 3393 stopmin = min; 3394 } 3395 } 3396 else if (OP(scan) == COMMIT) { 3397 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we 3398 * must not end up with "abcxyz" as a fixed substring else we'll 3399 * skip straight to attempting to match at offset 4. 3400 */ 3401 if (flags & SCF_DO_SUBSTR) { 3402 scan_commit(pRExC_state, data, minlenp, is_inf); 3403 flags &= ~SCF_DO_SUBSTR; 3404 } 3405 } 3406 else if (OP(scan) == LOGICAL && FLAGS(scan) == 2) /* Embedded follows */ 3407 { 3408 if (flags & SCF_DO_SUBSTR) { 3409 scan_commit(pRExC_state, data, minlenp, is_inf); 3410 data->cur_is_floating = 1; /* float */ 3411 } 3412 is_inf = is_inf_internal = 1; 3413 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 3414 ssc_anything(data->start_class); 3415 flags &= ~SCF_DO_STCLASS; 3416 } 3417 else if (OP(scan) == GPOS) { 3418 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && 3419 !(delta || is_inf || (data && data->pos_delta))) 3420 { 3421 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) 3422 RExC_rx->intflags |= PREGf_ANCH_GPOS; 3423 if (RExC_rx->gofs < (STRLEN)min) 3424 RExC_rx->gofs = min; 3425 } else { 3426 RExC_rx->intflags |= PREGf_GPOS_FLOAT; 3427 RExC_rx->gofs = 0; 3428 } 3429 } 3430 #ifdef TRIE_STUDY_OPT 3431 #ifdef FULL_TRIE_STUDY 3432 else if (REGNODE_TYPE(OP(scan)) == TRIE) { 3433 /* NOTE - There is similar code to this block above for handling 3434 BRANCH nodes on the initial study. If you change stuff here 3435 check there too. */ 3436 regnode *trie_node= scan; 3437 regnode *tail= regnext(scan); 3438 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ]; 3439 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY; 3440 regnode_ssc accum; 3441 3442 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ 3443 /* Cannot merge strings after this. */ 3444 scan_commit(pRExC_state, data, minlenp, is_inf); 3445 } 3446 if (flags & SCF_DO_STCLASS) 3447 ssc_init_zero(pRExC_state, &accum); 3448 3449 if (!trie->jump) { 3450 min1= trie->minlen; 3451 max1= trie->maxlen; 3452 } else { 3453 const regnode *nextbranch= NULL; 3454 U32 word; 3455 3456 for ( word=1 ; word <= trie->wordcount ; word++) 3457 { 3458 SSize_t deltanext = 0, minnext = 0; 3459 U32 f = (flags & SCF_TRIE_DOING_RESTUDY); 3460 SSize_t fake_last_close = 0; 3461 regnode *fake_last_close_op = NULL; 3462 regnode_ssc this_class; 3463 3464 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 3465 if (data) { 3466 data_fake.whilem_c = data->whilem_c; 3467 data_fake.last_closep = data->last_closep; 3468 data_fake.last_close_opp = data->last_close_opp; 3469 } 3470 else { 3471 data_fake.last_closep = &fake_last_close; 3472 data_fake.last_close_opp = &fake_last_close_op; 3473 } 3474 data_fake.pos_delta = delta; 3475 if (flags & SCF_DO_STCLASS) { 3476 ssc_init(pRExC_state, &this_class); 3477 data_fake.start_class = &this_class; 3478 f |= SCF_DO_STCLASS_AND; 3479 } 3480 if (flags & SCF_WHILEM_VISITED_POS) 3481 f |= SCF_WHILEM_VISITED_POS; 3482 3483 if (trie->jump[word]) { 3484 if (!nextbranch) 3485 nextbranch = trie_node + trie->jump[0]; 3486 scan= trie_node + trie->jump[word]; 3487 /* We go from the jump point to the branch that follows 3488 it. Note this means we need the vestigal unused 3489 branches even though they arent otherwise used. */ 3490 /* optimise study_chunk() for TRIE */ 3491 minnext = study_chunk(pRExC_state, &scan, minlenp, 3492 &deltanext, (regnode *)nextbranch, &data_fake, 3493 stopparen, recursed_depth, NULL, f, depth+1, 3494 mutate_ok); 3495 } 3496 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) 3497 nextbranch= regnext((regnode*)nextbranch); 3498 3499 if (min1 > (SSize_t)(minnext + trie->minlen)) 3500 min1 = minnext + trie->minlen; 3501 if (deltanext == OPTIMIZE_INFTY) { 3502 is_inf = is_inf_internal = 1; 3503 max1 = OPTIMIZE_INFTY; 3504 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) 3505 max1 = minnext + deltanext + trie->maxlen; 3506 3507 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3508 pars++; 3509 if (data_fake.flags & SCF_SEEN_ACCEPT) { 3510 if ( stopmin > min + min1) 3511 stopmin = min + min1; 3512 flags &= ~SCF_DO_SUBSTR; 3513 if (data) 3514 data->flags |= SCF_SEEN_ACCEPT; 3515 } 3516 if (data) { 3517 if (data_fake.flags & SF_HAS_EVAL) 3518 data->flags |= SF_HAS_EVAL; 3519 data->whilem_c = data_fake.whilem_c; 3520 } 3521 if (flags & SCF_DO_STCLASS) 3522 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); 3523 } 3524 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta); 3525 } 3526 if (flags & SCF_DO_SUBSTR) { 3527 data->pos_min += min1; 3528 data->pos_delta += max1 - min1; 3529 if (max1 != min1 || is_inf) 3530 data->cur_is_floating = 1; /* float */ 3531 } 3532 min += min1; 3533 if (delta != OPTIMIZE_INFTY) { 3534 if (OPTIMIZE_INFTY - (max1 - min1) >= delta) 3535 delta += max1 - min1; 3536 else 3537 delta = OPTIMIZE_INFTY; 3538 } 3539 if (flags & SCF_DO_STCLASS_OR) { 3540 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); 3541 if (min1) { 3542 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 3543 flags &= ~SCF_DO_STCLASS; 3544 } 3545 } 3546 else if (flags & SCF_DO_STCLASS_AND) { 3547 if (min1) { 3548 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 3549 flags &= ~SCF_DO_STCLASS; 3550 } 3551 else { 3552 /* Switch to OR mode: cache the old value of 3553 * data->start_class */ 3554 INIT_AND_WITHP; 3555 StructCopy(data->start_class, and_withp, regnode_ssc); 3556 flags &= ~SCF_DO_STCLASS_AND; 3557 StructCopy(&accum, data->start_class, regnode_ssc); 3558 flags |= SCF_DO_STCLASS_OR; 3559 } 3560 } 3561 scan= tail; 3562 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta); 3563 continue; 3564 } 3565 #else 3566 else if (REGNODE_TYPE(OP(scan)) == TRIE) { 3567 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ]; 3568 U8*bang=NULL; 3569 3570 min += trie->minlen; 3571 delta += (trie->maxlen - trie->minlen); 3572 flags &= ~SCF_DO_STCLASS; /* xxx */ 3573 if (flags & SCF_DO_SUBSTR) { 3574 /* Cannot expect anything... */ 3575 scan_commit(pRExC_state, data, minlenp, is_inf); 3576 data->pos_min += trie->minlen; 3577 data->pos_delta += (trie->maxlen - trie->minlen); 3578 if (trie->maxlen != trie->minlen) 3579 data->cur_is_floating = 1; /* float */ 3580 } 3581 if (trie->jump) /* no more substrings -- for now /grr*/ 3582 flags &= ~SCF_DO_SUBSTR; 3583 } 3584 3585 #endif /* old or new */ 3586 #endif /* TRIE_STUDY_OPT */ 3587 3588 else if (OP(scan) == REGEX_SET) { 3589 Perl_croak(aTHX_ "panic: %s regnode should be resolved" 3590 " before optimization", REGNODE_NAME(REGEX_SET)); 3591 } 3592 3593 /* Else: zero-length, ignore. */ 3594 scan = regnext(scan); 3595 } 3596 3597 finish: 3598 if (frame) { 3599 /* we need to unwind recursion. */ 3600 depth = depth - 1; 3601 3602 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta); 3603 DEBUG_PEEP("fend", scan, depth, flags); 3604 3605 /* restore previous context */ 3606 last = frame->last_regnode; 3607 scan = frame->next_regnode; 3608 stopparen = frame->stopparen; 3609 recursed_depth = frame->prev_recursed_depth; 3610 3611 RExC_frame_last = frame->prev_frame; 3612 frame = frame->this_prev_frame; 3613 goto fake_study_recurse; 3614 } 3615 3616 assert(!frame); 3617 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta); 3618 3619 /* is this pattern infinite? Eg, consider /(a|b+)/ */ 3620 if (is_inf_internal) 3621 delta = OPTIMIZE_INFTY; 3622 3623 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */ 3624 if (min > stopmin) { 3625 /* 3626 At this point 'min' represents the minimum length string we can 3627 match while *ignoring* the implication of ACCEPT, and 'delta' 3628 represents the difference between the minimum length and maximum 3629 length, and if the pattern matches an infinitely long string 3630 (consider the + and * quantifiers) then we use the special delta 3631 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the 3632 minimum length that can be matched *and* accepted. 3633 3634 A pattern is accepted when matching was successful *and* 3635 complete, and thus there is no further matching needing to be 3636 done, no backtracking to occur, etc. Prior to the introduction 3637 of ACCEPT the only opcode that signaled acceptance was the END 3638 opcode, which is always the very last opcode in a regex program. 3639 ACCEPT is thus conceptually an early successful return out of 3640 the matching process. stopmin starts out as OPTIMIZE_INFTY to 3641 represent "the entire pattern", and is ratched down to the 3642 "current min" if necessary when an ACCEPT opcode is encountered. 3643 3644 Thus stopmin might be smaller than min if we saw an (*ACCEPT), 3645 and we now need to account for it in both min and delta. 3646 Consider that in a pattern /AB/ normally the min length it can 3647 match can be computed as min(A)+min(B). But (*ACCEPT) means 3648 that it might be something else, not even neccesarily min(A) at 3649 all. Consider 3650 3651 A = /(foo(*ACCEPT)|x+)/ 3652 B = /whop/ 3653 AB = /(foo(*ACCEPT)|x+)whop/ 3654 3655 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY 3656 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for 3657 "whop", and the delta of 0 as the pattern is of fixed length, the 3658 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT. 3659 When handling AB we expect to see a min of 5 for "xwhop", and a 3660 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3 3661 for "foo". This should result in a final min of 3 for "foo", and 3662 a final delta of OPTIMIZE_INFTY for "xxxxx...whop". 3663 3664 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a 3665 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the 3666 stop min would be 4 for "dude". This should result in a final 3667 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx". 3668 3669 When min is smaller than stopmin then we can ignore it. In the 3670 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2, 3671 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously 3672 the ACCEPT doesn't reduce the minimum length of the string that 3673 might be matched, nor affect the maximum length. 3674 3675 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5 3676 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for 3677 "foo". We currently turn this into a min of 3 for "foo" and a 3678 delta of 3 for "foobar" even though technically "foobar" isn't 3679 possible. ACCEPT affects some aspects of the optimizer, like 3680 length computations and mandatory substring optimizations, but 3681 there are other optimzations this routine perfoms that are not 3682 affected and this compromise simplifies implementation. 3683 3684 It might be helpful to consider that this C function is called 3685 recursively on the pattern in a bottom up fashion, and that the 3686 min returned by a nested call may be marked as coming from an 3687 ACCEPT, causing its callers to treat the returned min as a 3688 stopmin as the recursion unwinds. Thus a single ACCEPT can affect 3689 multiple calls into this function in different ways. 3690 */ 3691 3692 if (OPTIMIZE_INFTY - delta >= min - stopmin) 3693 delta += min - stopmin; 3694 else 3695 delta = OPTIMIZE_INFTY; 3696 min = stopmin; 3697 } 3698 3699 *scanp = scan; 3700 *deltap = delta; 3701 3702 if (flags & SCF_DO_SUBSTR && is_inf) 3703 data->pos_delta = OPTIMIZE_INFTY - data->pos_min; 3704 if (is_par > (I32)U8_MAX) 3705 is_par = 0; 3706 if (is_par && pars==1 && data) { 3707 data->flags |= SF_IN_PAR; 3708 data->flags &= ~SF_HAS_PAR; 3709 } 3710 else if (pars && data) { 3711 data->flags |= SF_HAS_PAR; 3712 data->flags &= ~SF_IN_PAR; 3713 } 3714 if (flags & SCF_DO_STCLASS_OR) 3715 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 3716 if (flags & SCF_TRIE_RESTUDY) 3717 data->flags |= SCF_TRIE_RESTUDY; 3718 3719 3720 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { 3721 if (min > OPTIMIZE_INFTY - delta) 3722 RExC_maxlen = OPTIMIZE_INFTY; 3723 else if (RExC_maxlen < min + delta) 3724 RExC_maxlen = min + delta; 3725 } 3726 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta); 3727 return min; 3728 } 3729