1 /* regexec.c 2 */ 3 4 /* 5 * One Ring to rule them all, One Ring to find them 6 & 7 * [p.v of _The Lord of the Rings_, opening poem] 8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] 9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] 10 */ 11 12 /* This file contains functions for executing a regular expression. See 13 * also regcomp.c which funnily enough, contains functions for compiling 14 * a regular expression. 15 * 16 * This file is also copied at build time to ext/re/re_exec.c, where 17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 18 * This causes the main functions to be compiled under new names and with 19 * debugging support added, which makes "use re 'debug'" work. 20 */ 21 22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 23 * confused with the original package (see point 3 below). Thanks, Henry! 24 */ 25 26 /* Additional note: this code is very heavily munged from Henry's version 27 * in places. In some spots I've traded clarity for efficiency, so don't 28 * blame Henry for some of the lack of readability. 29 */ 30 31 /* The names of the functions have been changed from regcomp and 32 * regexec to pregcomp and pregexec in order to avoid conflicts 33 * with the POSIX routines of the same names. 34 */ 35 36 #ifdef PERL_EXT_RE_BUILD 37 #include "re_top.h" 38 #endif 39 40 /* 41 * pregcomp and pregexec -- regsub and regerror are not used in perl 42 * 43 * Copyright (c) 1986 by University of Toronto. 44 * Written by Henry Spencer. Not derived from licensed software. 45 * 46 * Permission is granted to anyone to use this software for any 47 * purpose on any computer system, and to redistribute it freely, 48 * subject to the following restrictions: 49 * 50 * 1. The author is not responsible for the consequences of use of 51 * this software, no matter how awful, even if they arise 52 * from defects in it. 53 * 54 * 2. The origin of this software must not be misrepresented, either 55 * by explicit claim or by omission. 56 * 57 * 3. Altered versions must be plainly marked as such, and must not 58 * be misrepresented as being the original software. 59 * 60 **** Alterations to Henry's code are... 61 **** 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 64 **** by Larry Wall and others 65 **** 66 **** You may distribute under the terms of either the GNU General Public 67 **** License or the Artistic License, as specified in the README file. 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGEXEC_C 75 #include "perl.h" 76 77 #ifdef PERL_IN_XSUB_RE 78 # include "re_comp.h" 79 #else 80 # include "regcomp.h" 81 #endif 82 83 #include "inline_invlist.c" 84 #include "unicode_constants.h" 85 86 #ifdef DEBUGGING 87 /* At least one required character in the target string is expressible only in 88 * UTF-8. */ 89 static const char* const non_utf8_target_but_utf8_required 90 = "Can't match, because target string needs to be in UTF-8\n"; 91 #endif 92 93 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ 94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ 95 goto target; \ 96 } STMT_END 97 98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) 99 100 #ifndef STATIC 101 #define STATIC static 102 #endif 103 104 /* Valid only for non-utf8 strings: avoids the reginclass 105 * call if there are no complications: i.e., if everything matchable is 106 * straight forward in the bitmap */ 107 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ 108 : ANYOF_BITMAP_TEST(p,*(c))) 109 110 /* 111 * Forwards. 112 */ 113 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) 115 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) 116 117 #define HOPc(pos,off) \ 118 (char *)(reginfo->is_utf8_target \ 119 ? reghop3((U8*)pos, off, \ 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ 121 : (U8*)(pos + off)) 122 123 #define HOPBACKc(pos, off) \ 124 (char*)(reginfo->is_utf8_target \ 125 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ 126 : (pos - off >= reginfo->strbeg) \ 127 ? (U8*)pos - off \ 128 : NULL) 129 130 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) 131 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 132 133 /* lim must be +ve. Returns NULL on overshoot */ 134 #define HOPMAYBE3(pos,off,lim) \ 135 (reginfo->is_utf8_target \ 136 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ 137 : ((U8*)pos + off <= lim) \ 138 ? (U8*)pos + off \ 139 : NULL) 140 141 /* like HOP3, but limits the result to <= lim even for the non-utf8 case. 142 * off must be >=0; args should be vars rather than expressions */ 143 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ 144 ? reghop3((U8*)(pos), off, (U8*)(lim)) \ 145 : (U8*)((pos + off) > lim ? lim : (pos + off))) 146 147 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ 148 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ 149 : (U8*)(pos + off)) 150 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) 151 152 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ 153 #define NEXTCHR_IS_EOS (nextchr < 0) 154 155 #define SET_nextchr \ 156 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) 157 158 #define SET_locinput(p) \ 159 locinput = (p); \ 160 SET_nextchr 161 162 163 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ 164 if (!swash_ptr) { \ 165 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ 166 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ 167 1, 0, invlist, &flags); \ 168 assert(swash_ptr); \ 169 } \ 170 } STMT_END 171 172 /* If in debug mode, we test that a known character properly matches */ 173 #ifdef DEBUGGING 174 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ 175 property_name, \ 176 invlist, \ 177 utf8_char_in_property) \ 178 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ 179 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); 180 #else 181 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ 182 property_name, \ 183 invlist, \ 184 utf8_char_in_property) \ 185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) 186 #endif 187 188 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ 189 PL_utf8_swash_ptrs[_CC_WORDCHAR], \ 190 "", \ 191 PL_XPosix_ptrs[_CC_WORDCHAR], \ 192 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); 193 194 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ 195 STMT_START { \ 196 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ 197 "_X_regular_begin", \ 198 NULL, \ 199 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ 200 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ 201 "_X_extend", \ 202 NULL, \ 203 COMBINING_GRAVE_ACCENT_UTF8); \ 204 } STMT_END 205 206 #define PLACEHOLDER /* Something for the preprocessor to grab onto */ 207 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ 208 209 /* for use after a quantifier and before an EXACT-like node -- japhy */ 210 /* it would be nice to rework regcomp.sym to generate this stuff. sigh 211 * 212 * NOTE that *nothing* that affects backtracking should be in here, specifically 213 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a 214 * node that is in between two EXACT like nodes when ascertaining what the required 215 * "follow" character is. This should probably be moved to regex compile time 216 * although it may be done at run time beause of the REF possibility - more 217 * investigation required. -- demerphq 218 */ 219 #define JUMPABLE(rn) ( \ 220 OP(rn) == OPEN || \ 221 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ 222 OP(rn) == EVAL || \ 223 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 224 OP(rn) == PLUS || OP(rn) == MINMOD || \ 225 OP(rn) == KEEPS || \ 226 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ 227 ) 228 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) 229 230 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) 231 232 #if 0 233 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so 234 we don't need this definition. */ 235 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) 236 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) 237 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) 238 239 #else 240 /* ... so we use this as its faster. */ 241 #define IS_TEXT(rn) ( OP(rn)==EXACT ) 242 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) 243 #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) 244 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) 245 246 #endif 247 248 /* 249 Search for mandatory following text node; for lookahead, the text must 250 follow but for lookbehind (rn->flags != 0) we skip to the next step. 251 */ 252 #define FIND_NEXT_IMPT(rn) STMT_START { \ 253 while (JUMPABLE(rn)) { \ 254 const OPCODE type = OP(rn); \ 255 if (type == SUSPEND || PL_regkind[type] == CURLY) \ 256 rn = NEXTOPER(NEXTOPER(rn)); \ 257 else if (type == PLUS) \ 258 rn = NEXTOPER(rn); \ 259 else if (type == IFMATCH) \ 260 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ 261 else rn += NEXT_OFF(rn); \ 262 } \ 263 } STMT_END 264 265 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. 266 * These are for the pre-composed Hangul syllables, which are all in a 267 * contiguous block and arranged there in such a way so as to facilitate 268 * alorithmic determination of their characteristics. As such, they don't need 269 * a swash, but can be determined by simple arithmetic. Almost all are 270 * GCB=LVT, but every 28th one is a GCB=LV */ 271 #define SBASE 0xAC00 /* Start of block */ 272 #define SCount 11172 /* Length of block */ 273 #define TCount 28 274 275 #define SLAB_FIRST(s) (&(s)->states[0]) 276 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) 277 278 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); 279 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); 280 static regmatch_state * S_push_slab(pTHX); 281 282 #define REGCP_PAREN_ELEMS 3 283 #define REGCP_OTHER_ELEMS 3 284 #define REGCP_FRAME_ELEMS 1 285 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and 286 * are needed for the regexp context stack bookkeeping. */ 287 288 STATIC CHECKPOINT 289 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) 290 { 291 dVAR; 292 const int retval = PL_savestack_ix; 293 const int paren_elems_to_push = 294 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; 295 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; 296 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; 297 I32 p; 298 GET_RE_DEBUG_FLAGS_DECL; 299 300 PERL_ARGS_ASSERT_REGCPPUSH; 301 302 if (paren_elems_to_push < 0) 303 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i", 304 paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS); 305 306 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) 307 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf 308 " out of range (%lu-%ld)", 309 total_elems, 310 (unsigned long)maxopenparen, 311 (long)parenfloor); 312 313 SSGROW(total_elems + REGCP_FRAME_ELEMS); 314 315 DEBUG_BUFFERS_r( 316 if ((int)maxopenparen > (int)parenfloor) 317 PerlIO_printf(Perl_debug_log, 318 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", 319 PTR2UV(rex), 320 PTR2UV(rex->offs) 321 ); 322 ); 323 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { 324 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ 325 SSPUSHIV(rex->offs[p].end); 326 SSPUSHIV(rex->offs[p].start); 327 SSPUSHINT(rex->offs[p].start_tmp); 328 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, 329 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", 330 (UV)p, 331 (IV)rex->offs[p].start, 332 (IV)rex->offs[p].start_tmp, 333 (IV)rex->offs[p].end 334 )); 335 } 336 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 337 SSPUSHINT(maxopenparen); 338 SSPUSHINT(rex->lastparen); 339 SSPUSHINT(rex->lastcloseparen); 340 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ 341 342 return retval; 343 } 344 345 /* These are needed since we do not localize EVAL nodes: */ 346 #define REGCP_SET(cp) \ 347 DEBUG_STATE_r( \ 348 PerlIO_printf(Perl_debug_log, \ 349 " Setting an EVAL scope, savestack=%"IVdf"\n", \ 350 (IV)PL_savestack_ix)); \ 351 cp = PL_savestack_ix 352 353 #define REGCP_UNWIND(cp) \ 354 DEBUG_STATE_r( \ 355 if (cp != PL_savestack_ix) \ 356 PerlIO_printf(Perl_debug_log, \ 357 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ 358 (IV)(cp), (IV)PL_savestack_ix)); \ 359 regcpblow(cp) 360 361 #define UNWIND_PAREN(lp, lcp) \ 362 for (n = rex->lastparen; n > lp; n--) \ 363 rex->offs[n].end = -1; \ 364 rex->lastparen = n; \ 365 rex->lastcloseparen = lcp; 366 367 368 STATIC void 369 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) 370 { 371 dVAR; 372 UV i; 373 U32 paren; 374 GET_RE_DEBUG_FLAGS_DECL; 375 376 PERL_ARGS_ASSERT_REGCPPOP; 377 378 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 379 i = SSPOPUV; 380 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 381 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ 382 rex->lastcloseparen = SSPOPINT; 383 rex->lastparen = SSPOPINT; 384 *maxopenparen_p = SSPOPINT; 385 386 i -= REGCP_OTHER_ELEMS; 387 /* Now restore the parentheses context. */ 388 DEBUG_BUFFERS_r( 389 if (i || rex->lastparen + 1 <= rex->nparens) 390 PerlIO_printf(Perl_debug_log, 391 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", 392 PTR2UV(rex), 393 PTR2UV(rex->offs) 394 ); 395 ); 396 paren = *maxopenparen_p; 397 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { 398 SSize_t tmps; 399 rex->offs[paren].start_tmp = SSPOPINT; 400 rex->offs[paren].start = SSPOPIV; 401 tmps = SSPOPIV; 402 if (paren <= rex->lastparen) 403 rex->offs[paren].end = tmps; 404 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, 405 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", 406 (UV)paren, 407 (IV)rex->offs[paren].start, 408 (IV)rex->offs[paren].start_tmp, 409 (IV)rex->offs[paren].end, 410 (paren > rex->lastparen ? "(skipped)" : "")); 411 ); 412 paren--; 413 } 414 #if 1 415 /* It would seem that the similar code in regtry() 416 * already takes care of this, and in fact it is in 417 * a better location to since this code can #if 0-ed out 418 * but the code in regtry() is needed or otherwise tests 419 * requiring null fields (pat.t#187 and split.t#{13,14} 420 * (as of patchlevel 7877) will fail. Then again, 421 * this code seems to be necessary or otherwise 422 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 423 * --jhi updated by dapm */ 424 for (i = rex->lastparen + 1; i <= rex->nparens; i++) { 425 if (i > *maxopenparen_p) 426 rex->offs[i].start = -1; 427 rex->offs[i].end = -1; 428 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, 429 " \\%"UVuf": %s ..-1 undeffing\n", 430 (UV)i, 431 (i > *maxopenparen_p) ? "-1" : " " 432 )); 433 } 434 #endif 435 } 436 437 /* restore the parens and associated vars at savestack position ix, 438 * but without popping the stack */ 439 440 STATIC void 441 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p) 442 { 443 I32 tmpix = PL_savestack_ix; 444 PL_savestack_ix = ix; 445 regcppop(rex, maxopenparen_p); 446 PL_savestack_ix = tmpix; 447 } 448 449 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 450 451 STATIC bool 452 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) 453 { 454 /* Returns a boolean as to whether or not 'character' is a member of the 455 * Posix character class given by 'classnum' that should be equivalent to a 456 * value in the typedef '_char_class_number'. 457 * 458 * Ideally this could be replaced by a just an array of function pointers 459 * to the C library functions that implement the macros this calls. 460 * However, to compile, the precise function signatures are required, and 461 * these may vary from platform to to platform. To avoid having to figure 462 * out what those all are on each platform, I (khw) am using this method, 463 * which adds an extra layer of function call overhead (unless the C 464 * optimizer strips it away). But we don't particularly care about 465 * performance with locales anyway. */ 466 467 switch ((_char_class_number) classnum) { 468 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); 469 case _CC_ENUM_ALPHA: return isALPHA_LC(character); 470 case _CC_ENUM_ASCII: return isASCII_LC(character); 471 case _CC_ENUM_BLANK: return isBLANK_LC(character); 472 case _CC_ENUM_CASED: return isLOWER_LC(character) 473 || isUPPER_LC(character); 474 case _CC_ENUM_CNTRL: return isCNTRL_LC(character); 475 case _CC_ENUM_DIGIT: return isDIGIT_LC(character); 476 case _CC_ENUM_GRAPH: return isGRAPH_LC(character); 477 case _CC_ENUM_LOWER: return isLOWER_LC(character); 478 case _CC_ENUM_PRINT: return isPRINT_LC(character); 479 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character); 480 case _CC_ENUM_PUNCT: return isPUNCT_LC(character); 481 case _CC_ENUM_SPACE: return isSPACE_LC(character); 482 case _CC_ENUM_UPPER: return isUPPER_LC(character); 483 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); 484 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); 485 default: /* VERTSPACE should never occur in locales */ 486 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); 487 } 488 489 assert(0); /* NOTREACHED */ 490 return FALSE; 491 } 492 493 STATIC bool 494 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) 495 { 496 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded 497 * 'character' is a member of the Posix character class given by 'classnum' 498 * that should be equivalent to a value in the typedef 499 * '_char_class_number'. 500 * 501 * This just calls isFOO_lc on the code point for the character if it is in 502 * the range 0-255. Outside that range, all characters avoid Unicode 503 * rules, ignoring any locale. So use the Unicode function if this class 504 * requires a swash, and use the Unicode macro otherwise. */ 505 506 PERL_ARGS_ASSERT_ISFOO_UTF8_LC; 507 508 if (UTF8_IS_INVARIANT(*character)) { 509 return isFOO_lc(classnum, *character); 510 } 511 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { 512 return isFOO_lc(classnum, 513 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); 514 } 515 516 if (classnum < _FIRST_NON_SWASH_CC) { 517 518 /* Initialize the swash unless done already */ 519 if (! PL_utf8_swash_ptrs[classnum]) { 520 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; 521 PL_utf8_swash_ptrs[classnum] = 522 _core_swash_init("utf8", 523 "", 524 &PL_sv_undef, 1, 0, 525 PL_XPosix_ptrs[classnum], &flags); 526 } 527 528 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) 529 character, 530 TRUE /* is UTF */ )); 531 } 532 533 switch ((_char_class_number) classnum) { 534 case _CC_ENUM_SPACE: 535 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character); 536 537 case _CC_ENUM_BLANK: return is_HORIZWS_high(character); 538 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); 539 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); 540 default: return 0; /* Things like CNTRL are always 541 below 256 */ 542 } 543 544 assert(0); /* NOTREACHED */ 545 return FALSE; 546 } 547 548 /* 549 * pregexec and friends 550 */ 551 552 #ifndef PERL_IN_XSUB_RE 553 /* 554 - pregexec - match a regexp against a string 555 */ 556 I32 557 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, 558 char *strbeg, SSize_t minend, SV *screamer, U32 nosave) 559 /* stringarg: the point in the string at which to begin matching */ 560 /* strend: pointer to null at end of string */ 561 /* strbeg: real beginning of string */ 562 /* minend: end of match must be >= minend bytes after stringarg. */ 563 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string 564 * itself is accessed via the pointers above */ 565 /* nosave: For optimizations. */ 566 { 567 PERL_ARGS_ASSERT_PREGEXEC; 568 569 return 570 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 571 nosave ? 0 : REXEC_COPY_STR); 572 } 573 #endif 574 575 576 577 /* re_intuit_start(): 578 * 579 * Based on some optimiser hints, try to find the earliest position in the 580 * string where the regex could match. 581 * 582 * rx: the regex to match against 583 * sv: the SV being matched: only used for utf8 flag; the string 584 * itself is accessed via the pointers below. Note that on 585 * something like an overloaded SV, SvPOK(sv) may be false 586 * and the string pointers may point to something unrelated to 587 * the SV itself. 588 * strbeg: real beginning of string 589 * strpos: the point in the string at which to begin matching 590 * strend: pointer to the byte following the last char of the string 591 * flags currently unused; set to 0 592 * data: currently unused; set to NULL 593 * 594 * The basic idea of re_intuit_start() is to use some known information 595 * about the pattern, namely: 596 * 597 * a) the longest known anchored substring (i.e. one that's at a 598 * constant offset from the beginning of the pattern; but not 599 * necessarily at a fixed offset from the beginning of the 600 * string); 601 * b) the longest floating substring (i.e. one that's not at a constant 602 * offset from the beginning of the pattern); 603 * c) Whether the pattern is anchored to the string; either 604 * an absolute anchor: /^../, or anchored to \n: /^.../m, 605 * or anchored to pos(): /\G/; 606 * d) A start class: a real or synthetic character class which 607 * represents which characters are legal at the start of the pattern; 608 * 609 * to either quickly reject the match, or to find the earliest position 610 * within the string at which the pattern might match, thus avoiding 611 * running the full NFA engine at those earlier locations, only to 612 * eventually fail and retry further along. 613 * 614 * Returns NULL if the pattern can't match, or returns the address within 615 * the string which is the earliest place the match could occur. 616 * 617 * The longest of the anchored and floating substrings is called 'check' 618 * and is checked first. The other is called 'other' and is checked 619 * second. The 'other' substring may not be present. For example, 620 * 621 * /(abc|xyz)ABC\d{0,3}DEFG/ 622 * 623 * will have 624 * 625 * check substr (float) = "DEFG", offset 6..9 chars 626 * other substr (anchored) = "ABC", offset 3..3 chars 627 * stclass = [ax] 628 * 629 * Be aware that during the course of this function, sometimes 'anchored' 630 * refers to a substring being anchored relative to the start of the 631 * pattern, and sometimes to the pattern itself being anchored relative to 632 * the string. For example: 633 * 634 * /\dabc/: "abc" is anchored to the pattern; 635 * /^\dabc/: "abc" is anchored to the pattern and the string; 636 * /\d+abc/: "abc" is anchored to neither the pattern nor the string; 637 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, 638 * but the pattern is anchored to the string. 639 */ 640 641 char * 642 Perl_re_intuit_start(pTHX_ 643 REGEXP * const rx, 644 SV *sv, 645 const char * const strbeg, 646 char *strpos, 647 char *strend, 648 const U32 flags, 649 re_scream_pos_data *data) 650 { 651 dVAR; 652 struct regexp *const prog = ReANY(rx); 653 SSize_t start_shift = prog->check_offset_min; 654 /* Should be nonnegative! */ 655 SSize_t end_shift = 0; 656 /* current lowest pos in string where the regex can start matching */ 657 char *rx_origin = strpos; 658 SV *check; 659 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ 660 U8 other_ix = 1 - prog->substrs->check_ix; 661 bool ml_anch = 0; 662 char *other_last = strpos;/* latest pos 'other' substr already checked to */ 663 char *check_at = NULL; /* check substr found at this pos */ 664 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; 665 RXi_GET_DECL(prog,progi); 666 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ 667 regmatch_info *const reginfo = ®info_buf; 668 GET_RE_DEBUG_FLAGS_DECL; 669 670 PERL_ARGS_ASSERT_RE_INTUIT_START; 671 PERL_UNUSED_ARG(flags); 672 PERL_UNUSED_ARG(data); 673 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 675 "Intuit: trying to determine minimum start position...\n")); 676 677 /* for now, assume that all substr offsets are positive. If at some point 678 * in the future someone wants to do clever things with look-behind and 679 * -ve offsets, they'll need to fix up any code in this function 680 * which uses these offsets. See the thread beginning 681 * <20140113145929.GF27210@iabyn.com> 682 */ 683 assert(prog->substrs->data[0].min_offset >= 0); 684 assert(prog->substrs->data[0].max_offset >= 0); 685 assert(prog->substrs->data[1].min_offset >= 0); 686 assert(prog->substrs->data[1].max_offset >= 0); 687 assert(prog->substrs->data[2].min_offset >= 0); 688 assert(prog->substrs->data[2].max_offset >= 0); 689 690 /* for now, assume that if both present, that the floating substring 691 * doesn't start before the anchored substring. 692 * If you break this assumption (e.g. doing better optimisations 693 * with lookahead/behind), then you'll need to audit the code in this 694 * function carefully first 695 */ 696 assert( 697 ! ( (prog->anchored_utf8 || prog->anchored_substr) 698 && (prog->float_utf8 || prog->float_substr)) 699 || (prog->float_min_offset >= prog->anchored_offset)); 700 701 /* byte rather than char calculation for efficiency. It fails 702 * to quickly reject some cases that can't match, but will reject 703 * them later after doing full char arithmetic */ 704 if (prog->minlen > strend - strpos) { 705 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 706 " String too short...\n")); 707 goto fail; 708 } 709 710 RX_MATCH_UTF8_set(rx,utf8_target); 711 reginfo->is_utf8_target = cBOOL(utf8_target); 712 reginfo->info_aux = NULL; 713 reginfo->strbeg = strbeg; 714 reginfo->strend = strend; 715 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 716 reginfo->intuit = 1; 717 /* not actually used within intuit, but zero for safety anyway */ 718 reginfo->poscache_maxiter = 0; 719 720 if (utf8_target) { 721 if (!prog->check_utf8 && prog->check_substr) 722 to_utf8_substr(prog); 723 check = prog->check_utf8; 724 } else { 725 if (!prog->check_substr && prog->check_utf8) { 726 if (! to_byte_substr(prog)) { 727 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 728 } 729 } 730 check = prog->check_substr; 731 } 732 733 /* dump the various substring data */ 734 DEBUG_OPTIMISE_MORE_r({ 735 int i; 736 for (i=0; i<=2; i++) { 737 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr 738 : prog->substrs->data[i].substr); 739 if (!sv) 740 continue; 741 742 PerlIO_printf(Perl_debug_log, 743 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf 744 " useful=%"IVdf" utf8=%d [%s]\n", 745 i, 746 (IV)prog->substrs->data[i].min_offset, 747 (IV)prog->substrs->data[i].max_offset, 748 (IV)prog->substrs->data[i].end_shift, 749 BmUSEFUL(sv), 750 utf8_target ? 1 : 0, 751 SvPEEK(sv)); 752 } 753 }); 754 755 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ 756 757 /* ml_anch: check after \n? 758 * 759 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning 760 * with /.*.../, these flags will have been added by the 761 * compiler: 762 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL 763 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL 764 */ 765 ml_anch = (prog->intflags & PREGf_ANCH_MBOL) 766 && !(prog->intflags & PREGf_IMPLICIT); 767 768 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { 769 /* we are only allowed to match at BOS or \G */ 770 771 /* trivially reject if there's a BOS anchor and we're not at BOS. 772 * 773 * Note that we don't try to do a similar quick reject for 774 * \G, since generally the caller will have calculated strpos 775 * based on pos() and gofs, so the string is already correctly 776 * anchored by definition; and handling the exceptions would 777 * be too fiddly (e.g. REXEC_IGNOREPOS). 778 */ 779 if ( strpos != strbeg 780 && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) 781 { 782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 783 " Not at start...\n")); 784 goto fail; 785 } 786 787 /* in the presence of an anchor, the anchored (relative to the 788 * start of the regex) substr must also be anchored relative 789 * to strpos. So quickly reject if substr isn't found there. 790 * This works for \G too, because the caller will already have 791 * subtracted gofs from pos, and gofs is the offset from the 792 * \G to the start of the regex. For example, in /.abc\Gdef/, 793 * where substr="abcdef", pos()=3, gofs=4, offset_min=1: 794 * caller will have set strpos=pos()-4; we look for the substr 795 * at position pos()-4+1, which lines up with the "a" */ 796 797 if (prog->check_offset_min == prog->check_offset_max 798 && !(prog->intflags & PREGf_CANY_SEEN)) 799 { 800 /* Substring at constant offset from beg-of-str... */ 801 SSize_t slen = SvCUR(check); 802 char *s = HOP3c(strpos, prog->check_offset_min, strend); 803 804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 805 " Looking for check substr at fixed offset %"IVdf"...\n", 806 (IV)prog->check_offset_min)); 807 808 if (SvTAIL(check)) { 809 /* In this case, the regex is anchored at the end too. 810 * Unless it's a multiline match, the lengths must match 811 * exactly, give or take a \n. NB: slen >= 1 since 812 * the last char of check is \n */ 813 if (!multiline 814 && ( strend - s > slen 815 || strend - s < slen - 1 816 || (strend - s == slen && strend[-1] != '\n'))) 817 { 818 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 819 " String too long...\n")); 820 goto fail_finish; 821 } 822 /* Now should match s[0..slen-2] */ 823 slen--; 824 } 825 if (slen && (*SvPVX_const(check) != *s 826 || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) 827 { 828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 829 " String not equal...\n")); 830 goto fail_finish; 831 } 832 833 check_at = s; 834 goto success_at_start; 835 } 836 } 837 } 838 839 end_shift = prog->check_end_shift; 840 841 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ 842 if (end_shift < 0) 843 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", 844 (IV)end_shift, RX_PRECOMP(prog)); 845 #endif 846 847 restart: 848 849 /* This is the (re)entry point of the main loop in this function. 850 * The goal of this loop is to: 851 * 1) find the "check" substring in the region rx_origin..strend 852 * (adjusted by start_shift / end_shift). If not found, reject 853 * immediately. 854 * 2) If it exists, look for the "other" substr too if defined; for 855 * example, if the check substr maps to the anchored substr, then 856 * check the floating substr, and vice-versa. If not found, go 857 * back to (1) with rx_origin suitably incremented. 858 * 3) If we find an rx_origin position that doesn't contradict 859 * either of the substrings, then check the possible additional 860 * constraints on rx_origin of /^.../m or a known start class. 861 * If these fail, then depending on which constraints fail, jump 862 * back to here, or to various other re-entry points further along 863 * that skip some of the first steps. 864 * 4) If we pass all those tests, update the BmUSEFUL() count on the 865 * substring. If the start position was determined to be at the 866 * beginning of the string - so, not rejected, but not optimised, 867 * since we have to run regmatch from position 0 - decrement the 868 * BmUSEFUL() count. Otherwise increment it. 869 */ 870 871 872 /* first, look for the 'check' substring */ 873 874 { 875 U8* start_point; 876 U8* end_point; 877 878 DEBUG_OPTIMISE_MORE_r({ 879 PerlIO_printf(Perl_debug_log, 880 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf 881 " Start shift: %"IVdf" End shift %"IVdf 882 " Real end Shift: %"IVdf"\n", 883 (IV)(rx_origin - strpos), 884 (IV)prog->check_offset_min, 885 (IV)start_shift, 886 (IV)end_shift, 887 (IV)prog->check_end_shift); 888 }); 889 890 if (prog->intflags & PREGf_CANY_SEEN) { 891 start_point= (U8*)(rx_origin + start_shift); 892 end_point= (U8*)(strend - end_shift); 893 if (start_point > end_point) 894 goto fail_finish; 895 } else { 896 end_point = HOP3(strend, -end_shift, strbeg); 897 start_point = HOPMAYBE3(rx_origin, start_shift, end_point); 898 if (!start_point) 899 goto fail_finish; 900 } 901 902 903 /* If the regex is absolutely anchored to either the start of the 904 * string (BOL,SBOL) or to pos() (ANCH_GPOS), then 905 * check_offset_max represents an upper bound on the string where 906 * the substr could start. For the ANCH_GPOS case, we assume that 907 * the caller of intuit will have already set strpos to 908 * pos()-gofs, so in this case strpos + offset_max will still be 909 * an upper bound on the substr. 910 */ 911 if (!ml_anch 912 && prog->intflags & PREGf_ANCH 913 && prog->check_offset_max != SSize_t_MAX) 914 { 915 SSize_t len = SvCUR(check) - !!SvTAIL(check); 916 const char * const anchor = 917 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); 918 919 /* do a bytes rather than chars comparison. It's conservative; 920 * so it skips doing the HOP if the result can't possibly end 921 * up earlier than the old value of end_point. 922 */ 923 if ((char*)end_point - anchor > prog->check_offset_max) { 924 end_point = HOP3lim((U8*)anchor, 925 prog->check_offset_max, 926 end_point -len) 927 + len; 928 } 929 } 930 931 DEBUG_OPTIMISE_MORE_r({ 932 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", 933 (int)(end_point - start_point), 934 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), 935 start_point); 936 }); 937 938 check_at = fbm_instr( start_point, end_point, 939 check, multiline ? FBMrf_MULTILINE : 0); 940 941 /* Update the count-of-usability, remove useless subpatterns, 942 unshift s. */ 943 944 DEBUG_EXECUTE_r({ 945 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 946 SvPVX_const(check), RE_SV_DUMPLEN(check), 30); 947 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", 948 (check_at ? "Found" : "Did not find"), 949 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) 950 ? "anchored" : "floating"), 951 quoted, 952 RE_SV_TAIL(check), 953 (check_at ? " at offset " : "...\n") ); 954 }); 955 956 if (!check_at) 957 goto fail_finish; 958 /* Finish the diagnostic message */ 959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); 960 961 /* set rx_origin to the minimum position where the regex could start 962 * matching, given the constraint of the just-matched check substring. 963 * But don't set it lower than previously. 964 */ 965 966 if (check_at - rx_origin > prog->check_offset_max) 967 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); 968 } 969 970 971 /* now look for the 'other' substring if defined */ 972 973 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr 974 : prog->substrs->data[other_ix].substr) 975 { 976 /* Take into account the "other" substring. */ 977 char *last, *last1; 978 char *s; 979 SV* must; 980 struct reg_substr_datum *other; 981 982 do_other_substr: 983 other = &prog->substrs->data[other_ix]; 984 985 /* if "other" is anchored: 986 * we've previously found a floating substr starting at check_at. 987 * This means that the regex origin must lie somewhere 988 * between min (rx_origin): HOP3(check_at, -check_offset_max) 989 * and max: HOP3(check_at, -check_offset_min) 990 * (except that min will be >= strpos) 991 * So the fixed substr must lie somewhere between 992 * HOP3(min, anchored_offset) 993 * HOP3(max, anchored_offset) + SvCUR(substr) 994 */ 995 996 /* if "other" is floating 997 * Calculate last1, the absolute latest point where the 998 * floating substr could start in the string, ignoring any 999 * constraints from the earlier fixed match. It is calculated 1000 * as follows: 1001 * 1002 * strend - prog->minlen (in chars) is the absolute latest 1003 * position within the string where the origin of the regex 1004 * could appear. The latest start point for the floating 1005 * substr is float_min_offset(*) on from the start of the 1006 * regex. last1 simply combines thee two offsets. 1007 * 1008 * (*) You might think the latest start point should be 1009 * float_max_offset from the regex origin, and technically 1010 * you'd be correct. However, consider 1011 * /a\d{2,4}bcd\w/ 1012 * Here, float min, max are 3,5 and minlen is 7. 1013 * This can match either 1014 * /a\d\dbcd\w/ 1015 * /a\d\d\dbcd\w/ 1016 * /a\d\d\d\dbcd\w/ 1017 * In the first case, the regex matches minlen chars; in the 1018 * second, minlen+1, in the third, minlen+2. 1019 * In the first case, the floating offset is 3 (which equals 1020 * float_min), in the second, 4, and in the third, 5 (which 1021 * equals float_max). In all cases, the floating string bcd 1022 * can never start more than 4 chars from the end of the 1023 * string, which equals minlen - float_min. As the substring 1024 * starts to match more than float_min from the start of the 1025 * regex, it makes the regex match more than minlen chars, 1026 * and the two cancel each other out. So we can always use 1027 * float_min - minlen, rather than float_max - minlen for the 1028 * latest position in the string. 1029 * 1030 * Note that -minlen + float_min_offset is equivalent (AFAIKT) 1031 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift 1032 */ 1033 1034 assert(prog->minlen >= other->min_offset); 1035 last1 = HOP3c(strend, 1036 other->min_offset - prog->minlen, strbeg); 1037 1038 if (other_ix) {/* i.e. if (other-is-float) */ 1039 /* last is the latest point where the floating substr could 1040 * start, *given* any constraints from the earlier fixed 1041 * match. This constraint is that the floating string starts 1042 * <= float_max_offset chars from the regex origin (rx_origin). 1043 * If this value is less than last1, use it instead. 1044 */ 1045 assert(rx_origin <= last1); 1046 last = 1047 /* this condition handles the offset==infinity case, and 1048 * is a short-cut otherwise. Although it's comparing a 1049 * byte offset to a char length, it does so in a safe way, 1050 * since 1 char always occupies 1 or more bytes, 1051 * so if a string range is (last1 - rx_origin) bytes, 1052 * it will be less than or equal to (last1 - rx_origin) 1053 * chars; meaning it errs towards doing the accurate HOP3 1054 * rather than just using last1 as a short-cut */ 1055 (last1 - rx_origin) < other->max_offset 1056 ? last1 1057 : (char*)HOP3lim(rx_origin, other->max_offset, last1); 1058 } 1059 else { 1060 assert(strpos + start_shift <= check_at); 1061 last = HOP4c(check_at, other->min_offset - start_shift, 1062 strbeg, strend); 1063 } 1064 1065 s = HOP3c(rx_origin, other->min_offset, strend); 1066 if (s < other_last) /* These positions already checked */ 1067 s = other_last; 1068 1069 must = utf8_target ? other->utf8_substr : other->substr; 1070 assert(SvPOK(must)); 1071 s = fbm_instr( 1072 (unsigned char*)s, 1073 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), 1074 must, 1075 multiline ? FBMrf_MULTILINE : 0 1076 ); 1077 DEBUG_EXECUTE_r({ 1078 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1079 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 1080 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", 1081 s ? "Found" : "Contradicts", 1082 other_ix ? "floating" : "anchored", 1083 quoted, RE_SV_TAIL(must)); 1084 }); 1085 1086 1087 if (!s) { 1088 /* last1 is latest possible substr location. If we didn't 1089 * find it before there, we never will */ 1090 if (last >= last1) { 1091 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1092 ", giving up...\n")); 1093 goto fail_finish; 1094 } 1095 1096 /* try to find the check substr again at a later 1097 * position. Maybe next time we'll find the "other" substr 1098 * in range too */ 1099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1100 ", trying %s at offset %ld...\n", 1101 (other_ix ? "floating" : "anchored"), 1102 (long)(HOP3c(check_at, 1, strend) - strpos))); 1103 1104 other_last = HOP3c(last, 1, strend) /* highest failure */; 1105 rx_origin = 1106 other_ix /* i.e. if other-is-float */ 1107 ? HOP3c(rx_origin, 1, strend) 1108 : HOP4c(last, 1 - other->min_offset, strbeg, strend); 1109 goto restart; 1110 } 1111 else { 1112 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 1113 (long)(s - strpos))); 1114 1115 if (other_ix) { /* if (other-is-float) */ 1116 /* other_last is set to s, not s+1, since its possible for 1117 * a floating substr to fail first time, then succeed 1118 * second time at the same floating position; e.g.: 1119 * "-AB--AABZ" =~ /\wAB\d*Z/ 1120 * The first time round, anchored and float match at 1121 * "-(AB)--AAB(Z)" then fail on the initial \w character 1122 * class. Second time round, they match at "-AB--A(AB)(Z)". 1123 */ 1124 other_last = s; 1125 } 1126 else { 1127 rx_origin = HOP3c(s, -other->min_offset, strbeg); 1128 other_last = HOP3c(s, 1, strend); 1129 } 1130 } 1131 } 1132 else { 1133 DEBUG_OPTIMISE_MORE_r( 1134 PerlIO_printf(Perl_debug_log, 1135 " Check-only match: offset min:%"IVdf" max:%"IVdf 1136 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf 1137 " strend-strpos:%"IVdf"\n", 1138 (IV)prog->check_offset_min, 1139 (IV)prog->check_offset_max, 1140 (IV)(check_at-strpos), 1141 (IV)(rx_origin-strpos), 1142 (IV)(rx_origin-check_at), 1143 (IV)(strend-strpos) 1144 ) 1145 ); 1146 } 1147 1148 postprocess_substr_matches: 1149 1150 /* handle the extra constraint of /^.../m if present */ 1151 1152 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { 1153 char *s; 1154 1155 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1156 " looking for /^/m anchor")); 1157 1158 /* we have failed the constraint of a \n before rx_origin. 1159 * Find the next \n, if any, even if it's beyond the current 1160 * anchored and/or floating substrings. Whether we should be 1161 * scanning ahead for the next \n or the next substr is debatable. 1162 * On the one hand you'd expect rare substrings to appear less 1163 * often than \n's. On the other hand, searching for \n means 1164 * we're effectively flipping been check_substr and "\n" on each 1165 * iteration as the current "rarest" string candidate, which 1166 * means for example that we'll quickly reject the whole string if 1167 * hasn't got a \n, rather than trying every substr position 1168 * first 1169 */ 1170 1171 s = HOP3c(strend, - prog->minlen, strpos); 1172 if (s <= rx_origin || 1173 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) 1174 { 1175 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1176 " Did not find /%s^%s/m...\n", 1177 PL_colors[0], PL_colors[1])); 1178 goto fail_finish; 1179 } 1180 1181 /* earliest possible origin is 1 char after the \n. 1182 * (since *rx_origin == '\n', it's safe to ++ here rather than 1183 * HOP(rx_origin, 1)) */ 1184 rx_origin++; 1185 1186 if (prog->substrs->check_ix == 0 /* check is anchored */ 1187 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) 1188 { 1189 /* Position contradicts check-string; either because 1190 * check was anchored (and thus has no wiggle room), 1191 * or check was float and rx_origin is above the float range */ 1192 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1193 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", 1194 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); 1195 goto restart; 1196 } 1197 1198 /* if we get here, the check substr must have been float, 1199 * is in range, and we may or may not have had an anchored 1200 * "other" substr which still contradicts */ 1201 assert(prog->substrs->check_ix); /* check is float */ 1202 1203 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { 1204 /* whoops, the anchored "other" substr exists, so we still 1205 * contradict. On the other hand, the float "check" substr 1206 * didn't contradict, so just retry the anchored "other" 1207 * substr */ 1208 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1209 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", 1210 PL_colors[0], PL_colors[1], 1211 (long)(rx_origin - strpos), 1212 (long)(rx_origin - strpos + prog->anchored_offset))); 1213 goto do_other_substr; 1214 } 1215 1216 /* success: we don't contradict the found floating substring 1217 * (and there's no anchored substr). */ 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1219 " Found /%s^%s/m at offset %ld...\n", 1220 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); 1221 } 1222 else { 1223 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1224 " (multiline anchor test skipped)\n")); 1225 } 1226 1227 success_at_start: 1228 1229 1230 /* if we have a starting character class, then test that extra constraint. 1231 * (trie stclasses are too expensive to use here, we are better off to 1232 * leave it to regmatch itself) */ 1233 1234 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { 1235 const U8* const str = (U8*)STRING(progi->regstclass); 1236 1237 /* XXX this value could be pre-computed */ 1238 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT 1239 ? (reginfo->is_utf8_pat 1240 ? utf8_distance(str + STR_LEN(progi->regstclass), str) 1241 : STR_LEN(progi->regstclass)) 1242 : 1); 1243 char * endpos; 1244 char *s; 1245 /* latest pos that a matching float substr constrains rx start to */ 1246 char *rx_max_float = NULL; 1247 1248 /* if the current rx_origin is anchored, either by satisfying an 1249 * anchored substring constraint, or a /^.../m constraint, then we 1250 * can reject the current origin if the start class isn't found 1251 * at the current position. If we have a float-only match, then 1252 * rx_origin is constrained to a range; so look for the start class 1253 * in that range. if neither, then look for the start class in the 1254 * whole rest of the string */ 1255 1256 /* XXX DAPM it's not clear what the minlen test is for, and why 1257 * it's not used in the floating case. Nothing in the test suite 1258 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. 1259 * Here are some old comments, which may or may not be correct: 1260 * 1261 * minlen == 0 is possible if regstclass is \b or \B, 1262 * and the fixed substr is ''$. 1263 * Since minlen is already taken into account, rx_origin+1 is 1264 * before strend; accidentally, minlen >= 1 guaranties no false 1265 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : 1266 * 0) below assumes that regstclass does not come from lookahead... 1267 * If regstclass takes bytelength more than 1: If charlength==1, OK. 1268 * This leaves EXACTF-ish only, which are dealt with in 1269 * find_byclass(). 1270 */ 1271 1272 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 1273 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); 1274 else if (prog->float_substr || prog->float_utf8) { 1275 rx_max_float = HOP3c(check_at, -start_shift, strbeg); 1276 endpos= HOP3c(rx_max_float, cl_l, strend); 1277 } 1278 else 1279 endpos= strend; 1280 1281 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1282 " looking for class: start_shift: %"IVdf" check_at: %"IVdf 1283 " rx_origin: %"IVdf" endpos: %"IVdf"\n", 1284 (IV)start_shift, (IV)(check_at - strbeg), 1285 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); 1286 1287 s = find_byclass(prog, progi->regstclass, rx_origin, endpos, 1288 reginfo); 1289 if (!s) { 1290 if (endpos == strend) { 1291 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1292 " Could not match STCLASS...\n") ); 1293 goto fail; 1294 } 1295 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1296 " This position contradicts STCLASS...\n") ); 1297 if ((prog->intflags & PREGf_ANCH) && !ml_anch 1298 && !(prog->intflags & PREGf_IMPLICIT)) 1299 goto fail; 1300 1301 /* Contradict one of substrings */ 1302 if (prog->anchored_substr || prog->anchored_utf8) { 1303 if (prog->substrs->check_ix == 1) { /* check is float */ 1304 /* Have both, check_string is floating */ 1305 assert(rx_origin + start_shift <= check_at); 1306 if (rx_origin + start_shift != check_at) { 1307 /* not at latest position float substr could match: 1308 * Recheck anchored substring, but not floating. 1309 * The condition above is in bytes rather than 1310 * chars for efficiency. It's conservative, in 1311 * that it errs on the side of doing 'goto 1312 * do_other_substr', where a more accurate 1313 * char-based calculation will be done */ 1314 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1315 " Looking for anchored substr starting at offset %ld...\n", 1316 (long)(other_last - strpos)) ); 1317 goto do_other_substr; 1318 } 1319 } 1320 } 1321 else { 1322 /* float-only */ 1323 1324 if (ml_anch) { 1325 /* In the presence of ml_anch, we might be able to 1326 * find another \n without breaking the current float 1327 * constraint. */ 1328 1329 /* strictly speaking this should be HOP3c(..., 1, ...), 1330 * but since we goto a block of code that's going to 1331 * search for the next \n if any, its safe here */ 1332 rx_origin++; 1333 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1334 " Looking for /%s^%s/m starting at offset %ld...\n", 1335 PL_colors[0], PL_colors[1], 1336 (long)(rx_origin - strpos)) ); 1337 goto postprocess_substr_matches; 1338 } 1339 1340 /* strictly speaking this can never be true; but might 1341 * be if we ever allow intuit without substrings */ 1342 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) 1343 goto fail; 1344 1345 rx_origin = rx_max_float; 1346 } 1347 1348 /* at this point, any matching substrings have been 1349 * contradicted. Start again... */ 1350 1351 rx_origin = HOP3c(rx_origin, 1, strend); 1352 1353 /* uses bytes rather than char calculations for efficiency. 1354 * It's conservative: it errs on the side of doing 'goto restart', 1355 * where there is code that does a proper char-based test */ 1356 if (rx_origin + start_shift + end_shift > strend) { 1357 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1358 " Could not match STCLASS...\n") ); 1359 goto fail; 1360 } 1361 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1362 " Looking for %s substr starting at offset %ld...\n", 1363 (prog->substrs->check_ix ? "floating" : "anchored"), 1364 (long)(rx_origin + start_shift - strpos)) ); 1365 goto restart; 1366 } 1367 1368 /* Success !!! */ 1369 1370 if (rx_origin != s) { 1371 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1372 " By STCLASS: moving %ld --> %ld\n", 1373 (long)(rx_origin - strpos), (long)(s - strpos)) 1374 ); 1375 } 1376 else { 1377 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1378 " Does not contradict STCLASS...\n"); 1379 ); 1380 } 1381 } 1382 1383 /* Decide whether using the substrings helped */ 1384 1385 if (rx_origin != strpos) { 1386 /* Fixed substring is found far enough so that the match 1387 cannot start at strpos. */ 1388 1389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); 1390 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 1391 } 1392 else { 1393 /* The found rx_origin position does not prohibit matching at 1394 * strpos, so calling intuit didn't gain us anything. Decrement 1395 * the BmUSEFUL() count on the check substring, and if we reach 1396 * zero, free it. */ 1397 if (!(prog->intflags & PREGf_NAUGHTY) 1398 && (utf8_target ? ( 1399 prog->check_utf8 /* Could be deleted already */ 1400 && --BmUSEFUL(prog->check_utf8) < 0 1401 && (prog->check_utf8 == prog->float_utf8) 1402 ) : ( 1403 prog->check_substr /* Could be deleted already */ 1404 && --BmUSEFUL(prog->check_substr) < 0 1405 && (prog->check_substr == prog->float_substr) 1406 ))) 1407 { 1408 /* If flags & SOMETHING - do not do it many times on the same match */ 1409 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); 1410 /* XXX Does the destruction order has to change with utf8_target? */ 1411 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); 1412 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); 1413 prog->check_substr = prog->check_utf8 = NULL; /* disable */ 1414 prog->float_substr = prog->float_utf8 = NULL; /* clear */ 1415 check = NULL; /* abort */ 1416 /* XXXX This is a remnant of the old implementation. It 1417 looks wasteful, since now INTUIT can use many 1418 other heuristics. */ 1419 prog->extflags &= ~RXf_USE_INTUIT; 1420 } 1421 } 1422 1423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1424 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", 1425 PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); 1426 1427 return rx_origin; 1428 1429 fail_finish: /* Substring not found */ 1430 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 1431 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 1432 fail: 1433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", 1434 PL_colors[4], PL_colors[5])); 1435 return NULL; 1436 } 1437 1438 1439 #define DECL_TRIE_TYPE(scan) \ 1440 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ 1441 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ 1442 trie_type = ((scan->flags == EXACT) \ 1443 ? (utf8_target ? trie_utf8 : trie_plain) \ 1444 : (scan->flags == EXACTFA) \ 1445 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ 1446 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) 1447 1448 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ 1449 STMT_START { \ 1450 STRLEN skiplen; \ 1451 U8 flags = FOLD_FLAGS_FULL; \ 1452 switch (trie_type) { \ 1453 case trie_utf8_exactfa_fold: \ 1454 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1455 /* FALL THROUGH */ \ 1456 case trie_utf8_fold: \ 1457 if ( foldlen>0 ) { \ 1458 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ 1459 foldlen -= len; \ 1460 uscan += len; \ 1461 len=0; \ 1462 } else { \ 1463 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ 1464 len = UTF8SKIP(uc); \ 1465 skiplen = UNISKIP( uvc ); \ 1466 foldlen -= skiplen; \ 1467 uscan = foldbuf + skiplen; \ 1468 } \ 1469 break; \ 1470 case trie_latin_utf8_exactfa_fold: \ 1471 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1472 /* FALL THROUGH */ \ 1473 case trie_latin_utf8_fold: \ 1474 if ( foldlen>0 ) { \ 1475 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ 1476 foldlen -= len; \ 1477 uscan += len; \ 1478 len=0; \ 1479 } else { \ 1480 len = 1; \ 1481 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ 1482 skiplen = UNISKIP( uvc ); \ 1483 foldlen -= skiplen; \ 1484 uscan = foldbuf + skiplen; \ 1485 } \ 1486 break; \ 1487 case trie_utf8: \ 1488 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ 1489 break; \ 1490 case trie_plain: \ 1491 uvc = (UV)*uc; \ 1492 len = 1; \ 1493 } \ 1494 if (uvc < 256) { \ 1495 charid = trie->charmap[ uvc ]; \ 1496 } \ 1497 else { \ 1498 charid = 0; \ 1499 if (widecharmap) { \ 1500 SV** const svpp = hv_fetch(widecharmap, \ 1501 (char*)&uvc, sizeof(UV), 0); \ 1502 if (svpp) \ 1503 charid = (U16)SvIV(*svpp); \ 1504 } \ 1505 } \ 1506 } STMT_END 1507 1508 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ 1509 STMT_START { \ 1510 while (s <= e) { \ 1511 if ( (CoNd) \ 1512 && (ln == 1 || folder(s, pat_string, ln)) \ 1513 && (reginfo->intuit || regtry(reginfo, &s)) )\ 1514 goto got_it; \ 1515 s++; \ 1516 } \ 1517 } STMT_END 1518 1519 #define REXEC_FBC_UTF8_SCAN(CoDe) \ 1520 STMT_START { \ 1521 while (s < strend) { \ 1522 CoDe \ 1523 s += UTF8SKIP(s); \ 1524 } \ 1525 } STMT_END 1526 1527 #define REXEC_FBC_SCAN(CoDe) \ 1528 STMT_START { \ 1529 while (s < strend) { \ 1530 CoDe \ 1531 s++; \ 1532 } \ 1533 } STMT_END 1534 1535 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \ 1536 REXEC_FBC_UTF8_SCAN( \ 1537 if (CoNd) { \ 1538 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ 1539 goto got_it; \ 1540 else \ 1541 tmp = doevery; \ 1542 } \ 1543 else \ 1544 tmp = 1; \ 1545 ) 1546 1547 #define REXEC_FBC_CLASS_SCAN(CoNd) \ 1548 REXEC_FBC_SCAN( \ 1549 if (CoNd) { \ 1550 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ 1551 goto got_it; \ 1552 else \ 1553 tmp = doevery; \ 1554 } \ 1555 else \ 1556 tmp = 1; \ 1557 ) 1558 1559 #define REXEC_FBC_TRYIT \ 1560 if ((reginfo->intuit || regtry(reginfo, &s))) \ 1561 goto got_it 1562 1563 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ 1564 if (utf8_target) { \ 1565 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ 1566 } \ 1567 else { \ 1568 REXEC_FBC_CLASS_SCAN(CoNd); \ 1569 } 1570 1571 #define DUMP_EXEC_POS(li,s,doutf8) \ 1572 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ 1573 startpos, doutf8) 1574 1575 1576 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 1577 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 1578 tmp = TEST_NON_UTF8(tmp); \ 1579 REXEC_FBC_UTF8_SCAN( \ 1580 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ 1581 tmp = !tmp; \ 1582 IF_SUCCESS; \ 1583 } \ 1584 else { \ 1585 IF_FAIL; \ 1586 } \ 1587 ); \ 1588 1589 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ 1590 if (s == reginfo->strbeg) { \ 1591 tmp = '\n'; \ 1592 } \ 1593 else { \ 1594 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ 1595 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ 1596 0, UTF8_ALLOW_DEFAULT); \ 1597 } \ 1598 tmp = TeSt1_UtF8; \ 1599 LOAD_UTF8_CHARCLASS_ALNUM(); \ 1600 REXEC_FBC_UTF8_SCAN( \ 1601 if (tmp == ! (TeSt2_UtF8)) { \ 1602 tmp = !tmp; \ 1603 IF_SUCCESS; \ 1604 } \ 1605 else { \ 1606 IF_FAIL; \ 1607 } \ 1608 ); \ 1609 1610 /* The only difference between the BOUND and NBOUND cases is that 1611 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in 1612 * NBOUND. This is accomplished by passing it in either the if or else clause, 1613 * with the other one being empty */ 1614 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ 1615 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 1616 1617 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ 1618 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 1619 1620 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ 1621 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 1622 1623 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ 1624 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 1625 1626 1627 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to 1628 * be passed in completely with the variable name being tested, which isn't 1629 * such a clean interface, but this is easier to read than it was before. We 1630 * are looking for the boundary (or non-boundary between a word and non-word 1631 * character. The utf8 and non-utf8 cases have the same logic, but the details 1632 * must be different. Find the "wordness" of the character just prior to this 1633 * one, and compare it with the wordness of this one. If they differ, we have 1634 * a boundary. At the beginning of the string, pretend that the previous 1635 * character was a new-line */ 1636 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 1637 if (utf8_target) { \ 1638 UTF8_CODE \ 1639 } \ 1640 else { /* Not utf8 */ \ 1641 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 1642 tmp = TEST_NON_UTF8(tmp); \ 1643 REXEC_FBC_SCAN( \ 1644 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ 1645 tmp = !tmp; \ 1646 IF_SUCCESS; \ 1647 } \ 1648 else { \ 1649 IF_FAIL; \ 1650 } \ 1651 ); \ 1652 } \ 1653 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ 1654 goto got_it; 1655 1656 /* We know what class REx starts with. Try to find this position... */ 1657 /* if reginfo->intuit, its a dryrun */ 1658 /* annoyingly all the vars in this routine have different names from their counterparts 1659 in regmatch. /grrr */ 1660 1661 STATIC char * 1662 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 1663 const char *strend, regmatch_info *reginfo) 1664 { 1665 dVAR; 1666 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; 1667 char *pat_string; /* The pattern's exactish string */ 1668 char *pat_end; /* ptr to end char of pat_string */ 1669 re_fold_t folder; /* Function for computing non-utf8 folds */ 1670 const U8 *fold_array; /* array for folding ords < 256 */ 1671 STRLEN ln; 1672 STRLEN lnc; 1673 U8 c1; 1674 U8 c2; 1675 char *e; 1676 I32 tmp = 1; /* Scratch variable? */ 1677 const bool utf8_target = reginfo->is_utf8_target; 1678 UV utf8_fold_flags = 0; 1679 const bool is_utf8_pat = reginfo->is_utf8_pat; 1680 bool to_complement = FALSE; /* Invert the result? Taking the xor of this 1681 with a result inverts that result, as 0^1 = 1682 1 and 1^1 = 0 */ 1683 _char_class_number classnum; 1684 1685 RXi_GET_DECL(prog,progi); 1686 1687 PERL_ARGS_ASSERT_FIND_BYCLASS; 1688 1689 /* We know what class it must start with. */ 1690 switch (OP(c)) { 1691 case ANYOF: 1692 if (utf8_target) { 1693 REXEC_FBC_UTF8_CLASS_SCAN( 1694 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); 1695 } 1696 else { 1697 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); 1698 } 1699 break; 1700 case CANY: 1701 REXEC_FBC_SCAN( 1702 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) 1703 goto got_it; 1704 else 1705 tmp = doevery; 1706 ); 1707 break; 1708 1709 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ 1710 assert(! is_utf8_pat); 1711 /* FALL THROUGH */ 1712 case EXACTFA: 1713 if (is_utf8_pat || utf8_target) { 1714 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 1715 goto do_exactf_utf8; 1716 } 1717 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */ 1718 folder = foldEQ_latin1; /* /a, except the sharp s one which */ 1719 goto do_exactf_non_utf8; /* isn't dealt with by these */ 1720 1721 case EXACTF: /* This node only generated for non-utf8 patterns */ 1722 assert(! is_utf8_pat); 1723 if (utf8_target) { 1724 utf8_fold_flags = 0; 1725 goto do_exactf_utf8; 1726 } 1727 fold_array = PL_fold; 1728 folder = foldEQ; 1729 goto do_exactf_non_utf8; 1730 1731 case EXACTFL: 1732 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { 1733 utf8_fold_flags = FOLDEQ_LOCALE; 1734 goto do_exactf_utf8; 1735 } 1736 fold_array = PL_fold_locale; 1737 folder = foldEQ_locale; 1738 goto do_exactf_non_utf8; 1739 1740 case EXACTFU_SS: 1741 if (is_utf8_pat) { 1742 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 1743 } 1744 goto do_exactf_utf8; 1745 1746 case EXACTFU: 1747 if (is_utf8_pat || utf8_target) { 1748 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; 1749 goto do_exactf_utf8; 1750 } 1751 1752 /* Any 'ss' in the pattern should have been replaced by regcomp, 1753 * so we don't have to worry here about this single special case 1754 * in the Latin1 range */ 1755 fold_array = PL_fold_latin1; 1756 folder = foldEQ_latin1; 1757 1758 /* FALL THROUGH */ 1759 1760 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there 1761 are no glitches with fold-length differences 1762 between the target string and pattern */ 1763 1764 /* The idea in the non-utf8 EXACTF* cases is to first find the 1765 * first character of the EXACTF* node and then, if necessary, 1766 * case-insensitively compare the full text of the node. c1 is the 1767 * first character. c2 is its fold. This logic will not work for 1768 * Unicode semantics and the german sharp ss, which hence should 1769 * not be compiled into a node that gets here. */ 1770 pat_string = STRING(c); 1771 ln = STR_LEN(c); /* length to match in octets/bytes */ 1772 1773 /* We know that we have to match at least 'ln' bytes (which is the 1774 * same as characters, since not utf8). If we have to match 3 1775 * characters, and there are only 2 availabe, we know without 1776 * trying that it will fail; so don't start a match past the 1777 * required minimum number from the far end */ 1778 e = HOP3c(strend, -((SSize_t)ln), s); 1779 1780 if (reginfo->intuit && e < s) { 1781 e = s; /* Due to minlen logic of intuit() */ 1782 } 1783 1784 c1 = *pat_string; 1785 c2 = fold_array[c1]; 1786 if (c1 == c2) { /* If char and fold are the same */ 1787 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); 1788 } 1789 else { 1790 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); 1791 } 1792 break; 1793 1794 do_exactf_utf8: 1795 { 1796 unsigned expansion; 1797 1798 /* If one of the operands is in utf8, we can't use the simpler folding 1799 * above, due to the fact that many different characters can have the 1800 * same fold, or portion of a fold, or different- length fold */ 1801 pat_string = STRING(c); 1802 ln = STR_LEN(c); /* length to match in octets/bytes */ 1803 pat_end = pat_string + ln; 1804 lnc = is_utf8_pat /* length to match in characters */ 1805 ? utf8_length((U8 *) pat_string, (U8 *) pat_end) 1806 : ln; 1807 1808 /* We have 'lnc' characters to match in the pattern, but because of 1809 * multi-character folding, each character in the target can match 1810 * up to 3 characters (Unicode guarantees it will never exceed 1811 * this) if it is utf8-encoded; and up to 2 if not (based on the 1812 * fact that the Latin 1 folds are already determined, and the 1813 * only multi-char fold in that range is the sharp-s folding to 1814 * 'ss'. Thus, a pattern character can match as little as 1/3 of a 1815 * string character. Adjust lnc accordingly, rounding up, so that 1816 * if we need to match at least 4+1/3 chars, that really is 5. */ 1817 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; 1818 lnc = (lnc + expansion - 1) / expansion; 1819 1820 /* As in the non-UTF8 case, if we have to match 3 characters, and 1821 * only 2 are left, it's guaranteed to fail, so don't start a 1822 * match that would require us to go beyond the end of the string 1823 */ 1824 e = HOP3c(strend, -((SSize_t)lnc), s); 1825 1826 if (reginfo->intuit && e < s) { 1827 e = s; /* Due to minlen logic of intuit() */ 1828 } 1829 1830 /* XXX Note that we could recalculate e to stop the loop earlier, 1831 * as the worst case expansion above will rarely be met, and as we 1832 * go along we would usually find that e moves further to the left. 1833 * This would happen only after we reached the point in the loop 1834 * where if there were no expansion we should fail. Unclear if 1835 * worth the expense */ 1836 1837 while (s <= e) { 1838 char *my_strend= (char *)strend; 1839 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, 1840 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) 1841 && (reginfo->intuit || regtry(reginfo, &s)) ) 1842 { 1843 goto got_it; 1844 } 1845 s += (utf8_target) ? UTF8SKIP(s) : 1; 1846 } 1847 break; 1848 } 1849 case BOUNDL: 1850 FBC_BOUND(isWORDCHAR_LC, 1851 isWORDCHAR_LC_uvchr(tmp), 1852 isWORDCHAR_LC_utf8((U8*)s)); 1853 break; 1854 case NBOUNDL: 1855 FBC_NBOUND(isWORDCHAR_LC, 1856 isWORDCHAR_LC_uvchr(tmp), 1857 isWORDCHAR_LC_utf8((U8*)s)); 1858 break; 1859 case BOUND: 1860 FBC_BOUND(isWORDCHAR, 1861 isWORDCHAR_uni(tmp), 1862 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); 1863 break; 1864 case BOUNDA: 1865 FBC_BOUND_NOLOAD(isWORDCHAR_A, 1866 isWORDCHAR_A(tmp), 1867 isWORDCHAR_A((U8*)s)); 1868 break; 1869 case NBOUND: 1870 FBC_NBOUND(isWORDCHAR, 1871 isWORDCHAR_uni(tmp), 1872 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); 1873 break; 1874 case NBOUNDA: 1875 FBC_NBOUND_NOLOAD(isWORDCHAR_A, 1876 isWORDCHAR_A(tmp), 1877 isWORDCHAR_A((U8*)s)); 1878 break; 1879 case BOUNDU: 1880 FBC_BOUND(isWORDCHAR_L1, 1881 isWORDCHAR_uni(tmp), 1882 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); 1883 break; 1884 case NBOUNDU: 1885 FBC_NBOUND(isWORDCHAR_L1, 1886 isWORDCHAR_uni(tmp), 1887 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); 1888 break; 1889 case LNBREAK: 1890 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), 1891 is_LNBREAK_latin1_safe(s, strend) 1892 ); 1893 break; 1894 1895 /* The argument to all the POSIX node types is the class number to pass to 1896 * _generic_isCC() to build a mask for searching in PL_charclass[] */ 1897 1898 case NPOSIXL: 1899 to_complement = 1; 1900 /* FALLTHROUGH */ 1901 1902 case POSIXL: 1903 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), 1904 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); 1905 break; 1906 1907 case NPOSIXD: 1908 to_complement = 1; 1909 /* FALLTHROUGH */ 1910 1911 case POSIXD: 1912 if (utf8_target) { 1913 goto posix_utf8; 1914 } 1915 goto posixa; 1916 1917 case NPOSIXA: 1918 if (utf8_target) { 1919 /* The complement of something that matches only ASCII matches all 1920 * UTF-8 variant code points, plus everything in ASCII that isn't 1921 * in the class */ 1922 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s) 1923 || ! _generic_isCC_A(*s, FLAGS(c))); 1924 break; 1925 } 1926 1927 to_complement = 1; 1928 /* FALLTHROUGH */ 1929 1930 case POSIXA: 1931 posixa: 1932 /* Don't need to worry about utf8, as it can match only a single 1933 * byte invariant character. */ 1934 REXEC_FBC_CLASS_SCAN( 1935 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); 1936 break; 1937 1938 case NPOSIXU: 1939 to_complement = 1; 1940 /* FALLTHROUGH */ 1941 1942 case POSIXU: 1943 if (! utf8_target) { 1944 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, 1945 FLAGS(c)))); 1946 } 1947 else { 1948 1949 posix_utf8: 1950 classnum = (_char_class_number) FLAGS(c); 1951 if (classnum < _FIRST_NON_SWASH_CC) { 1952 while (s < strend) { 1953 1954 /* We avoid loading in the swash as long as possible, but 1955 * should we have to, we jump to a separate loop. This 1956 * extra 'if' statement is what keeps this code from being 1957 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ 1958 if (UTF8_IS_ABOVE_LATIN1(*s)) { 1959 goto found_above_latin1; 1960 } 1961 if ((UTF8_IS_INVARIANT(*s) 1962 && to_complement ^ cBOOL(_generic_isCC((U8) *s, 1963 classnum))) 1964 || (UTF8_IS_DOWNGRADEABLE_START(*s) 1965 && to_complement ^ cBOOL( 1966 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, 1967 *(s + 1)), 1968 classnum)))) 1969 { 1970 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) 1971 goto got_it; 1972 else { 1973 tmp = doevery; 1974 } 1975 } 1976 else { 1977 tmp = 1; 1978 } 1979 s += UTF8SKIP(s); 1980 } 1981 } 1982 else switch (classnum) { /* These classes are implemented as 1983 macros */ 1984 case _CC_ENUM_SPACE: /* XXX would require separate code if we 1985 revert the change of \v matching this */ 1986 /* FALL THROUGH */ 1987 1988 case _CC_ENUM_PSXSPC: 1989 REXEC_FBC_UTF8_CLASS_SCAN( 1990 to_complement ^ cBOOL(isSPACE_utf8(s))); 1991 break; 1992 1993 case _CC_ENUM_BLANK: 1994 REXEC_FBC_UTF8_CLASS_SCAN( 1995 to_complement ^ cBOOL(isBLANK_utf8(s))); 1996 break; 1997 1998 case _CC_ENUM_XDIGIT: 1999 REXEC_FBC_UTF8_CLASS_SCAN( 2000 to_complement ^ cBOOL(isXDIGIT_utf8(s))); 2001 break; 2002 2003 case _CC_ENUM_VERTSPACE: 2004 REXEC_FBC_UTF8_CLASS_SCAN( 2005 to_complement ^ cBOOL(isVERTWS_utf8(s))); 2006 break; 2007 2008 case _CC_ENUM_CNTRL: 2009 REXEC_FBC_UTF8_CLASS_SCAN( 2010 to_complement ^ cBOOL(isCNTRL_utf8(s))); 2011 break; 2012 2013 default: 2014 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); 2015 assert(0); /* NOTREACHED */ 2016 } 2017 } 2018 break; 2019 2020 found_above_latin1: /* Here we have to load a swash to get the result 2021 for the current code point */ 2022 if (! PL_utf8_swash_ptrs[classnum]) { 2023 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; 2024 PL_utf8_swash_ptrs[classnum] = 2025 _core_swash_init("utf8", 2026 "", 2027 &PL_sv_undef, 1, 0, 2028 PL_XPosix_ptrs[classnum], &flags); 2029 } 2030 2031 /* This is a copy of the loop above for swash classes, though using the 2032 * FBC macro instead of being expanded out. Since we've loaded the 2033 * swash, we don't have to check for that each time through the loop */ 2034 REXEC_FBC_UTF8_CLASS_SCAN( 2035 to_complement ^ cBOOL(_generic_utf8( 2036 classnum, 2037 s, 2038 swash_fetch(PL_utf8_swash_ptrs[classnum], 2039 (U8 *) s, TRUE)))); 2040 break; 2041 2042 case AHOCORASICKC: 2043 case AHOCORASICK: 2044 { 2045 DECL_TRIE_TYPE(c); 2046 /* what trie are we using right now */ 2047 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; 2048 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; 2049 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); 2050 2051 const char *last_start = strend - trie->minlen; 2052 #ifdef DEBUGGING 2053 const char *real_start = s; 2054 #endif 2055 STRLEN maxlen = trie->maxlen; 2056 SV *sv_points; 2057 U8 **points; /* map of where we were in the input string 2058 when reading a given char. For ASCII this 2059 is unnecessary overhead as the relationship 2060 is always 1:1, but for Unicode, especially 2061 case folded Unicode this is not true. */ 2062 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 2063 U8 *bitmap=NULL; 2064 2065 2066 GET_RE_DEBUG_FLAGS_DECL; 2067 2068 /* We can't just allocate points here. We need to wrap it in 2069 * an SV so it gets freed properly if there is a croak while 2070 * running the match */ 2071 ENTER; 2072 SAVETMPS; 2073 sv_points=newSV(maxlen * sizeof(U8 *)); 2074 SvCUR_set(sv_points, 2075 maxlen * sizeof(U8 *)); 2076 SvPOK_on(sv_points); 2077 sv_2mortal(sv_points); 2078 points=(U8**)SvPV_nolen(sv_points ); 2079 if ( trie_type != trie_utf8_fold 2080 && (trie->bitmap || OP(c)==AHOCORASICKC) ) 2081 { 2082 if (trie->bitmap) 2083 bitmap=(U8*)trie->bitmap; 2084 else 2085 bitmap=(U8*)ANYOF_BITMAP(c); 2086 } 2087 /* this is the Aho-Corasick algorithm modified a touch 2088 to include special handling for long "unknown char" sequences. 2089 The basic idea being that we use AC as long as we are dealing 2090 with a possible matching char, when we encounter an unknown char 2091 (and we have not encountered an accepting state) we scan forward 2092 until we find a legal starting char. 2093 AC matching is basically that of trie matching, except that when 2094 we encounter a failing transition, we fall back to the current 2095 states "fail state", and try the current char again, a process 2096 we repeat until we reach the root state, state 1, or a legal 2097 transition. If we fail on the root state then we can either 2098 terminate if we have reached an accepting state previously, or 2099 restart the entire process from the beginning if we have not. 2100 2101 */ 2102 while (s <= last_start) { 2103 const U32 uniflags = UTF8_ALLOW_DEFAULT; 2104 U8 *uc = (U8*)s; 2105 U16 charid = 0; 2106 U32 base = 1; 2107 U32 state = 1; 2108 UV uvc = 0; 2109 STRLEN len = 0; 2110 STRLEN foldlen = 0; 2111 U8 *uscan = (U8*)NULL; 2112 U8 *leftmost = NULL; 2113 #ifdef DEBUGGING 2114 U32 accepted_word= 0; 2115 #endif 2116 U32 pointpos = 0; 2117 2118 while ( state && uc <= (U8*)strend ) { 2119 int failed=0; 2120 U32 word = aho->states[ state ].wordnum; 2121 2122 if( state==1 ) { 2123 if ( bitmap ) { 2124 DEBUG_TRIE_EXECUTE_r( 2125 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 2126 dump_exec_pos( (char *)uc, c, strend, real_start, 2127 (char *)uc, utf8_target ); 2128 PerlIO_printf( Perl_debug_log, 2129 " Scanning for legal start char...\n"); 2130 } 2131 ); 2132 if (utf8_target) { 2133 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 2134 uc += UTF8SKIP(uc); 2135 } 2136 } else { 2137 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 2138 uc++; 2139 } 2140 } 2141 s= (char *)uc; 2142 } 2143 if (uc >(U8*)last_start) break; 2144 } 2145 2146 if ( word ) { 2147 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; 2148 if (!leftmost || lpos < leftmost) { 2149 DEBUG_r(accepted_word=word); 2150 leftmost= lpos; 2151 } 2152 if (base==0) break; 2153 2154 } 2155 points[pointpos++ % maxlen]= uc; 2156 if (foldlen || uc < (U8*)strend) { 2157 REXEC_TRIE_READ_CHAR(trie_type, trie, 2158 widecharmap, uc, 2159 uscan, len, uvc, charid, foldlen, 2160 foldbuf, uniflags); 2161 DEBUG_TRIE_EXECUTE_r({ 2162 dump_exec_pos( (char *)uc, c, strend, 2163 real_start, s, utf8_target); 2164 PerlIO_printf(Perl_debug_log, 2165 " Charid:%3u CP:%4"UVxf" ", 2166 charid, uvc); 2167 }); 2168 } 2169 else { 2170 len = 0; 2171 charid = 0; 2172 } 2173 2174 2175 do { 2176 #ifdef DEBUGGING 2177 word = aho->states[ state ].wordnum; 2178 #endif 2179 base = aho->states[ state ].trans.base; 2180 2181 DEBUG_TRIE_EXECUTE_r({ 2182 if (failed) 2183 dump_exec_pos( (char *)uc, c, strend, real_start, 2184 s, utf8_target ); 2185 PerlIO_printf( Perl_debug_log, 2186 "%sState: %4"UVxf", word=%"UVxf, 2187 failed ? " Fail transition to " : "", 2188 (UV)state, (UV)word); 2189 }); 2190 if ( base ) { 2191 U32 tmp; 2192 I32 offset; 2193 if (charid && 2194 ( ((offset = base + charid 2195 - 1 - trie->uniquecharcount)) >= 0) 2196 && ((U32)offset < trie->lasttrans) 2197 && trie->trans[offset].check == state 2198 && (tmp=trie->trans[offset].next)) 2199 { 2200 DEBUG_TRIE_EXECUTE_r( 2201 PerlIO_printf( Perl_debug_log," - legal\n")); 2202 state = tmp; 2203 break; 2204 } 2205 else { 2206 DEBUG_TRIE_EXECUTE_r( 2207 PerlIO_printf( Perl_debug_log," - fail\n")); 2208 failed = 1; 2209 state = aho->fail[state]; 2210 } 2211 } 2212 else { 2213 /* we must be accepting here */ 2214 DEBUG_TRIE_EXECUTE_r( 2215 PerlIO_printf( Perl_debug_log," - accepting\n")); 2216 failed = 1; 2217 break; 2218 } 2219 } while(state); 2220 uc += len; 2221 if (failed) { 2222 if (leftmost) 2223 break; 2224 if (!state) state = 1; 2225 } 2226 } 2227 if ( aho->states[ state ].wordnum ) { 2228 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; 2229 if (!leftmost || lpos < leftmost) { 2230 DEBUG_r(accepted_word=aho->states[ state ].wordnum); 2231 leftmost = lpos; 2232 } 2233 } 2234 if (leftmost) { 2235 s = (char*)leftmost; 2236 DEBUG_TRIE_EXECUTE_r({ 2237 PerlIO_printf( 2238 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", 2239 (UV)accepted_word, (IV)(s - real_start) 2240 ); 2241 }); 2242 if (reginfo->intuit || regtry(reginfo, &s)) { 2243 FREETMPS; 2244 LEAVE; 2245 goto got_it; 2246 } 2247 s = HOPc(s,1); 2248 DEBUG_TRIE_EXECUTE_r({ 2249 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); 2250 }); 2251 } else { 2252 DEBUG_TRIE_EXECUTE_r( 2253 PerlIO_printf( Perl_debug_log,"No match.\n")); 2254 break; 2255 } 2256 } 2257 FREETMPS; 2258 LEAVE; 2259 } 2260 break; 2261 default: 2262 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 2263 break; 2264 } 2265 return 0; 2266 got_it: 2267 return s; 2268 } 2269 2270 /* set RX_SAVED_COPY, RX_SUBBEG etc. 2271 * flags have same meanings as with regexec_flags() */ 2272 2273 static void 2274 S_reg_set_capture_string(pTHX_ REGEXP * const rx, 2275 char *strbeg, 2276 char *strend, 2277 SV *sv, 2278 U32 flags, 2279 bool utf8_target) 2280 { 2281 struct regexp *const prog = ReANY(rx); 2282 2283 if (flags & REXEC_COPY_STR) { 2284 #ifdef PERL_ANY_COW 2285 if (SvCANCOW(sv)) { 2286 if (DEBUG_C_TEST) { 2287 PerlIO_printf(Perl_debug_log, 2288 "Copy on write: regexp capture, type %d\n", 2289 (int) SvTYPE(sv)); 2290 } 2291 /* Create a new COW SV to share the match string and store 2292 * in saved_copy, unless the current COW SV in saved_copy 2293 * is valid and suitable for our purpose */ 2294 if (( prog->saved_copy 2295 && SvIsCOW(prog->saved_copy) 2296 && SvPOKp(prog->saved_copy) 2297 && SvIsCOW(sv) 2298 && SvPOKp(sv) 2299 && SvPVX(sv) == SvPVX(prog->saved_copy))) 2300 { 2301 /* just reuse saved_copy SV */ 2302 if (RXp_MATCH_COPIED(prog)) { 2303 Safefree(prog->subbeg); 2304 RXp_MATCH_COPIED_off(prog); 2305 } 2306 } 2307 else { 2308 /* create new COW SV to share string */ 2309 RX_MATCH_COPY_FREE(rx); 2310 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); 2311 } 2312 prog->subbeg = (char *)SvPVX_const(prog->saved_copy); 2313 assert (SvPOKp(prog->saved_copy)); 2314 prog->sublen = strend - strbeg; 2315 prog->suboffset = 0; 2316 prog->subcoffset = 0; 2317 } else 2318 #endif 2319 { 2320 SSize_t min = 0; 2321 SSize_t max = strend - strbeg; 2322 SSize_t sublen; 2323 2324 if ( (flags & REXEC_COPY_SKIP_POST) 2325 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 2326 && !(PL_sawampersand & SAWAMPERSAND_RIGHT) 2327 ) { /* don't copy $' part of string */ 2328 U32 n = 0; 2329 max = -1; 2330 /* calculate the right-most part of the string covered 2331 * by a capture. Due to look-ahead, this may be to 2332 * the right of $&, so we have to scan all captures */ 2333 while (n <= prog->lastparen) { 2334 if (prog->offs[n].end > max) 2335 max = prog->offs[n].end; 2336 n++; 2337 } 2338 if (max == -1) 2339 max = (PL_sawampersand & SAWAMPERSAND_LEFT) 2340 ? prog->offs[0].start 2341 : 0; 2342 assert(max >= 0 && max <= strend - strbeg); 2343 } 2344 2345 if ( (flags & REXEC_COPY_SKIP_PRE) 2346 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 2347 && !(PL_sawampersand & SAWAMPERSAND_LEFT) 2348 ) { /* don't copy $` part of string */ 2349 U32 n = 0; 2350 min = max; 2351 /* calculate the left-most part of the string covered 2352 * by a capture. Due to look-behind, this may be to 2353 * the left of $&, so we have to scan all captures */ 2354 while (min && n <= prog->lastparen) { 2355 if ( prog->offs[n].start != -1 2356 && prog->offs[n].start < min) 2357 { 2358 min = prog->offs[n].start; 2359 } 2360 n++; 2361 } 2362 if ((PL_sawampersand & SAWAMPERSAND_RIGHT) 2363 && min > prog->offs[0].end 2364 ) 2365 min = prog->offs[0].end; 2366 2367 } 2368 2369 assert(min >= 0 && min <= max && min <= strend - strbeg); 2370 sublen = max - min; 2371 2372 if (RX_MATCH_COPIED(rx)) { 2373 if (sublen > prog->sublen) 2374 prog->subbeg = 2375 (char*)saferealloc(prog->subbeg, sublen+1); 2376 } 2377 else 2378 prog->subbeg = (char*)safemalloc(sublen+1); 2379 Copy(strbeg + min, prog->subbeg, sublen, char); 2380 prog->subbeg[sublen] = '\0'; 2381 prog->suboffset = min; 2382 prog->sublen = sublen; 2383 RX_MATCH_COPIED_on(rx); 2384 } 2385 prog->subcoffset = prog->suboffset; 2386 if (prog->suboffset && utf8_target) { 2387 /* Convert byte offset to chars. 2388 * XXX ideally should only compute this if @-/@+ 2389 * has been seen, a la PL_sawampersand ??? */ 2390 2391 /* If there's a direct correspondence between the 2392 * string which we're matching and the original SV, 2393 * then we can use the utf8 len cache associated with 2394 * the SV. In particular, it means that under //g, 2395 * sv_pos_b2u() will use the previously cached 2396 * position to speed up working out the new length of 2397 * subcoffset, rather than counting from the start of 2398 * the string each time. This stops 2399 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; 2400 * from going quadratic */ 2401 if (SvPOKp(sv) && SvPVX(sv) == strbeg) 2402 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, 2403 SV_GMAGIC|SV_CONST_RETURN); 2404 else 2405 prog->subcoffset = utf8_length((U8*)strbeg, 2406 (U8*)(strbeg+prog->suboffset)); 2407 } 2408 } 2409 else { 2410 RX_MATCH_COPY_FREE(rx); 2411 prog->subbeg = strbeg; 2412 prog->suboffset = 0; 2413 prog->subcoffset = 0; 2414 prog->sublen = strend - strbeg; 2415 } 2416 } 2417 2418 2419 2420 2421 /* 2422 - regexec_flags - match a regexp against a string 2423 */ 2424 I32 2425 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, 2426 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) 2427 /* stringarg: the point in the string at which to begin matching */ 2428 /* strend: pointer to null at end of string */ 2429 /* strbeg: real beginning of string */ 2430 /* minend: end of match must be >= minend bytes after stringarg. */ 2431 /* sv: SV being matched: only used for utf8 flag, pos() etc; string 2432 * itself is accessed via the pointers above */ 2433 /* data: May be used for some additional optimizations. 2434 Currently unused. */ 2435 /* flags: For optimizations. See REXEC_* in regexp.h */ 2436 2437 { 2438 dVAR; 2439 struct regexp *const prog = ReANY(rx); 2440 char *s; 2441 regnode *c; 2442 char *startpos; 2443 SSize_t minlen; /* must match at least this many chars */ 2444 SSize_t dontbother = 0; /* how many characters not to try at end */ 2445 const bool utf8_target = cBOOL(DO_UTF8(sv)); 2446 I32 multiline; 2447 RXi_GET_DECL(prog,progi); 2448 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ 2449 regmatch_info *const reginfo = ®info_buf; 2450 regexp_paren_pair *swap = NULL; 2451 I32 oldsave; 2452 GET_RE_DEBUG_FLAGS_DECL; 2453 2454 PERL_ARGS_ASSERT_REGEXEC_FLAGS; 2455 PERL_UNUSED_ARG(data); 2456 2457 /* Be paranoid... */ 2458 if (prog == NULL || stringarg == NULL) { 2459 Perl_croak(aTHX_ "NULL regexp parameter"); 2460 return 0; 2461 } 2462 2463 DEBUG_EXECUTE_r( 2464 debug_start_match(rx, utf8_target, stringarg, strend, 2465 "Matching"); 2466 ); 2467 2468 startpos = stringarg; 2469 2470 if (prog->intflags & PREGf_GPOS_SEEN) { 2471 MAGIC *mg; 2472 2473 /* set reginfo->ganch, the position where \G can match */ 2474 2475 reginfo->ganch = 2476 (flags & REXEC_IGNOREPOS) 2477 ? stringarg /* use start pos rather than pos() */ 2478 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) 2479 /* Defined pos(): */ 2480 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) 2481 : strbeg; /* pos() not defined; use start of string */ 2482 2483 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, 2484 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); 2485 2486 /* in the presence of \G, we may need to start looking earlier in 2487 * the string than the suggested start point of stringarg: 2488 * if prog->gofs is set, then that's a known, fixed minimum 2489 * offset, such as 2490 * /..\G/: gofs = 2 2491 * /ab|c\G/: gofs = 1 2492 * or if the minimum offset isn't known, then we have to go back 2493 * to the start of the string, e.g. /w+\G/ 2494 */ 2495 2496 if (prog->intflags & PREGf_ANCH_GPOS) { 2497 startpos = reginfo->ganch - prog->gofs; 2498 if (startpos < 2499 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) 2500 { 2501 DEBUG_r(PerlIO_printf(Perl_debug_log, 2502 "fail: ganch-gofs before earliest possible start\n")); 2503 return 0; 2504 } 2505 } 2506 else if (prog->gofs) { 2507 if (startpos - prog->gofs < strbeg) 2508 startpos = strbeg; 2509 else 2510 startpos -= prog->gofs; 2511 } 2512 else if (prog->intflags & PREGf_GPOS_FLOAT) 2513 startpos = strbeg; 2514 } 2515 2516 minlen = prog->minlen; 2517 if ((startpos + minlen) > strend || startpos < strbeg) { 2518 DEBUG_r(PerlIO_printf(Perl_debug_log, 2519 "Regex match can't succeed, so not even tried\n")); 2520 return 0; 2521 } 2522 2523 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), 2524 * which will call destuctors to reset PL_regmatch_state, free higher 2525 * PL_regmatch_slabs, and clean up regmatch_info_aux and 2526 * regmatch_info_aux_eval */ 2527 2528 oldsave = PL_savestack_ix; 2529 2530 s = startpos; 2531 2532 if ((prog->extflags & RXf_USE_INTUIT) 2533 && !(flags & REXEC_CHECKED)) 2534 { 2535 s = re_intuit_start(rx, sv, strbeg, startpos, strend, 2536 flags, NULL); 2537 if (!s) 2538 return 0; 2539 2540 if (prog->extflags & RXf_CHECK_ALL) { 2541 /* we can match based purely on the result of INTUIT. 2542 * Set up captures etc just for $& and $-[0] 2543 * (an intuit-only match wont have $1,$2,..) */ 2544 assert(!prog->nparens); 2545 2546 /* s/// doesn't like it if $& is earlier than where we asked it to 2547 * start searching (which can happen on something like /.\G/) */ 2548 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 2549 && (s < stringarg)) 2550 { 2551 /* this should only be possible under \G */ 2552 assert(prog->intflags & PREGf_GPOS_SEEN); 2553 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 2554 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 2555 goto phooey; 2556 } 2557 2558 /* match via INTUIT shouldn't have any captures. 2559 * Let @-, @+, $^N know */ 2560 prog->lastparen = prog->lastcloseparen = 0; 2561 RX_MATCH_UTF8_set(rx, utf8_target); 2562 prog->offs[0].start = s - strbeg; 2563 prog->offs[0].end = utf8_target 2564 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg 2565 : s - strbeg + prog->minlenret; 2566 if ( !(flags & REXEC_NOT_FIRST) ) 2567 S_reg_set_capture_string(aTHX_ rx, 2568 strbeg, strend, 2569 sv, flags, utf8_target); 2570 2571 return 1; 2572 } 2573 } 2574 2575 multiline = prog->extflags & RXf_PMf_MULTILINE; 2576 2577 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { 2578 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 2579 "String too short [regexec_flags]...\n")); 2580 goto phooey; 2581 } 2582 2583 /* Check validity of program. */ 2584 if (UCHARAT(progi->program) != REG_MAGIC) { 2585 Perl_croak(aTHX_ "corrupted regexp program"); 2586 } 2587 2588 RX_MATCH_TAINTED_off(rx); 2589 RX_MATCH_UTF8_set(rx, utf8_target); 2590 2591 reginfo->prog = rx; /* Yes, sorry that this is confusing. */ 2592 reginfo->intuit = 0; 2593 reginfo->is_utf8_target = cBOOL(utf8_target); 2594 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 2595 reginfo->warned = FALSE; 2596 reginfo->strbeg = strbeg; 2597 reginfo->sv = sv; 2598 reginfo->poscache_maxiter = 0; /* not yet started a countdown */ 2599 reginfo->strend = strend; 2600 /* see how far we have to get to not match where we matched before */ 2601 reginfo->till = stringarg + minend; 2602 2603 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { 2604 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after 2605 S_cleanup_regmatch_info_aux has executed (registered by 2606 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies 2607 magic belonging to this SV. 2608 Not newSVsv, either, as it does not COW. 2609 */ 2610 assert(!IS_PADGV(sv)); 2611 reginfo->sv = newSV(0); 2612 SvSetSV_nosteal(reginfo->sv, sv); 2613 SAVEFREESV(reginfo->sv); 2614 } 2615 2616 /* reserve next 2 or 3 slots in PL_regmatch_state: 2617 * slot N+0: may currently be in use: skip it 2618 * slot N+1: use for regmatch_info_aux struct 2619 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s 2620 * slot N+3: ready for use by regmatch() 2621 */ 2622 2623 { 2624 regmatch_state *old_regmatch_state; 2625 regmatch_slab *old_regmatch_slab; 2626 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; 2627 2628 /* on first ever match, allocate first slab */ 2629 if (!PL_regmatch_slab) { 2630 Newx(PL_regmatch_slab, 1, regmatch_slab); 2631 PL_regmatch_slab->prev = NULL; 2632 PL_regmatch_slab->next = NULL; 2633 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); 2634 } 2635 2636 old_regmatch_state = PL_regmatch_state; 2637 old_regmatch_slab = PL_regmatch_slab; 2638 2639 for (i=0; i <= max; i++) { 2640 if (i == 1) 2641 reginfo->info_aux = &(PL_regmatch_state->u.info_aux); 2642 else if (i ==2) 2643 reginfo->info_aux_eval = 2644 reginfo->info_aux->info_aux_eval = 2645 &(PL_regmatch_state->u.info_aux_eval); 2646 2647 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) 2648 PL_regmatch_state = S_push_slab(aTHX); 2649 } 2650 2651 /* note initial PL_regmatch_state position; at end of match we'll 2652 * pop back to there and free any higher slabs */ 2653 2654 reginfo->info_aux->old_regmatch_state = old_regmatch_state; 2655 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; 2656 reginfo->info_aux->poscache = NULL; 2657 2658 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); 2659 2660 if ((prog->extflags & RXf_EVAL_SEEN)) 2661 S_setup_eval_state(aTHX_ reginfo); 2662 else 2663 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; 2664 } 2665 2666 /* If there is a "must appear" string, look for it. */ 2667 2668 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { 2669 /* We have to be careful. If the previous successful match 2670 was from this regex we don't want a subsequent partially 2671 successful match to clobber the old results. 2672 So when we detect this possibility we add a swap buffer 2673 to the re, and switch the buffer each match. If we fail, 2674 we switch it back; otherwise we leave it swapped. 2675 */ 2676 swap = prog->offs; 2677 /* do we need a save destructor here for eval dies? */ 2678 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); 2679 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, 2680 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", 2681 PTR2UV(prog), 2682 PTR2UV(swap), 2683 PTR2UV(prog->offs) 2684 )); 2685 } 2686 2687 /* Simplest case: anchored match need be tried only once, or with 2688 * MBOL, only at the beginning of each line. 2689 * 2690 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets 2691 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't 2692 * match at the start of the string then it won't match anywhere else 2693 * either; while with /.*.../, if it doesn't match at the beginning, 2694 * the earliest it could match is at the start of the next line */ 2695 2696 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { 2697 char *end; 2698 2699 if (regtry(reginfo, &s)) 2700 goto got_it; 2701 2702 if (!(prog->intflags & PREGf_ANCH_MBOL)) 2703 goto phooey; 2704 2705 /* didn't match at start, try at other newline positions */ 2706 2707 if (minlen) 2708 dontbother = minlen - 1; 2709 end = HOP3c(strend, -dontbother, strbeg) - 1; 2710 2711 /* skip to next newline */ 2712 2713 while (s <= end) { /* note it could be possible to match at the end of the string */ 2714 /* NB: newlines are the same in unicode as they are in latin */ 2715 if (*s++ != '\n') 2716 continue; 2717 if (prog->check_substr || prog->check_utf8) { 2718 /* note that with PREGf_IMPLICIT, intuit can only fail 2719 * or return the start position, so it's of limited utility. 2720 * Nevertheless, I made the decision that the potential for 2721 * quick fail was still worth it - DAPM */ 2722 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL); 2723 if (!s) 2724 goto phooey; 2725 } 2726 if (regtry(reginfo, &s)) 2727 goto got_it; 2728 } 2729 goto phooey; 2730 } /* end anchored search */ 2731 2732 if (prog->intflags & PREGf_ANCH_GPOS) 2733 { 2734 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ 2735 assert(prog->intflags & PREGf_GPOS_SEEN); 2736 /* For anchored \G, the only position it can match from is 2737 * (ganch-gofs); we already set startpos to this above; if intuit 2738 * moved us on from there, we can't possibly succeed */ 2739 assert(startpos == reginfo->ganch - prog->gofs); 2740 if (s == startpos && regtry(reginfo, &s)) 2741 goto got_it; 2742 goto phooey; 2743 } 2744 2745 /* Messy cases: unanchored match. */ 2746 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { 2747 /* we have /x+whatever/ */ 2748 /* it must be a one character string (XXXX Except is_utf8_pat?) */ 2749 char ch; 2750 #ifdef DEBUGGING 2751 int did_match = 0; 2752 #endif 2753 if (utf8_target) { 2754 if (! prog->anchored_utf8) { 2755 to_utf8_substr(prog); 2756 } 2757 ch = SvPVX_const(prog->anchored_utf8)[0]; 2758 REXEC_FBC_SCAN( 2759 if (*s == ch) { 2760 DEBUG_EXECUTE_r( did_match = 1 ); 2761 if (regtry(reginfo, &s)) goto got_it; 2762 s += UTF8SKIP(s); 2763 while (s < strend && *s == ch) 2764 s += UTF8SKIP(s); 2765 } 2766 ); 2767 2768 } 2769 else { 2770 if (! prog->anchored_substr) { 2771 if (! to_byte_substr(prog)) { 2772 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 2773 } 2774 } 2775 ch = SvPVX_const(prog->anchored_substr)[0]; 2776 REXEC_FBC_SCAN( 2777 if (*s == ch) { 2778 DEBUG_EXECUTE_r( did_match = 1 ); 2779 if (regtry(reginfo, &s)) goto got_it; 2780 s++; 2781 while (s < strend && *s == ch) 2782 s++; 2783 } 2784 ); 2785 } 2786 DEBUG_EXECUTE_r(if (!did_match) 2787 PerlIO_printf(Perl_debug_log, 2788 "Did not find anchored character...\n") 2789 ); 2790 } 2791 else if (prog->anchored_substr != NULL 2792 || prog->anchored_utf8 != NULL 2793 || ((prog->float_substr != NULL || prog->float_utf8 != NULL) 2794 && prog->float_max_offset < strend - s)) { 2795 SV *must; 2796 SSize_t back_max; 2797 SSize_t back_min; 2798 char *last; 2799 char *last1; /* Last position checked before */ 2800 #ifdef DEBUGGING 2801 int did_match = 0; 2802 #endif 2803 if (prog->anchored_substr || prog->anchored_utf8) { 2804 if (utf8_target) { 2805 if (! prog->anchored_utf8) { 2806 to_utf8_substr(prog); 2807 } 2808 must = prog->anchored_utf8; 2809 } 2810 else { 2811 if (! prog->anchored_substr) { 2812 if (! to_byte_substr(prog)) { 2813 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 2814 } 2815 } 2816 must = prog->anchored_substr; 2817 } 2818 back_max = back_min = prog->anchored_offset; 2819 } else { 2820 if (utf8_target) { 2821 if (! prog->float_utf8) { 2822 to_utf8_substr(prog); 2823 } 2824 must = prog->float_utf8; 2825 } 2826 else { 2827 if (! prog->float_substr) { 2828 if (! to_byte_substr(prog)) { 2829 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 2830 } 2831 } 2832 must = prog->float_substr; 2833 } 2834 back_max = prog->float_max_offset; 2835 back_min = prog->float_min_offset; 2836 } 2837 2838 if (back_min<0) { 2839 last = strend; 2840 } else { 2841 last = HOP3c(strend, /* Cannot start after this */ 2842 -(SSize_t)(CHR_SVLEN(must) 2843 - (SvTAIL(must) != 0) + back_min), strbeg); 2844 } 2845 if (s > reginfo->strbeg) 2846 last1 = HOPc(s, -1); 2847 else 2848 last1 = s - 1; /* bogus */ 2849 2850 /* XXXX check_substr already used to find "s", can optimize if 2851 check_substr==must. */ 2852 dontbother = 0; 2853 strend = HOPc(strend, -dontbother); 2854 while ( (s <= last) && 2855 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), 2856 (unsigned char*)strend, must, 2857 multiline ? FBMrf_MULTILINE : 0)) ) { 2858 DEBUG_EXECUTE_r( did_match = 1 ); 2859 if (HOPc(s, -back_max) > last1) { 2860 last1 = HOPc(s, -back_min); 2861 s = HOPc(s, -back_max); 2862 } 2863 else { 2864 char * const t = (last1 >= reginfo->strbeg) 2865 ? HOPc(last1, 1) : last1 + 1; 2866 2867 last1 = HOPc(s, -back_min); 2868 s = t; 2869 } 2870 if (utf8_target) { 2871 while (s <= last1) { 2872 if (regtry(reginfo, &s)) 2873 goto got_it; 2874 if (s >= last1) { 2875 s++; /* to break out of outer loop */ 2876 break; 2877 } 2878 s += UTF8SKIP(s); 2879 } 2880 } 2881 else { 2882 while (s <= last1) { 2883 if (regtry(reginfo, &s)) 2884 goto got_it; 2885 s++; 2886 } 2887 } 2888 } 2889 DEBUG_EXECUTE_r(if (!did_match) { 2890 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 2891 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 2892 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", 2893 ((must == prog->anchored_substr || must == prog->anchored_utf8) 2894 ? "anchored" : "floating"), 2895 quoted, RE_SV_TAIL(must)); 2896 }); 2897 goto phooey; 2898 } 2899 else if ( (c = progi->regstclass) ) { 2900 if (minlen) { 2901 const OPCODE op = OP(progi->regstclass); 2902 /* don't bother with what can't match */ 2903 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) 2904 strend = HOPc(strend, -(minlen - 1)); 2905 } 2906 DEBUG_EXECUTE_r({ 2907 SV * const prop = sv_newmortal(); 2908 regprop(prog, prop, c, reginfo); 2909 { 2910 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), 2911 s,strend-s,60); 2912 PerlIO_printf(Perl_debug_log, 2913 "Matching stclass %.*s against %s (%d bytes)\n", 2914 (int)SvCUR(prop), SvPVX_const(prop), 2915 quoted, (int)(strend - s)); 2916 } 2917 }); 2918 if (find_byclass(prog, c, s, strend, reginfo)) 2919 goto got_it; 2920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); 2921 } 2922 else { 2923 dontbother = 0; 2924 if (prog->float_substr != NULL || prog->float_utf8 != NULL) { 2925 /* Trim the end. */ 2926 char *last= NULL; 2927 SV* float_real; 2928 STRLEN len; 2929 const char *little; 2930 2931 if (utf8_target) { 2932 if (! prog->float_utf8) { 2933 to_utf8_substr(prog); 2934 } 2935 float_real = prog->float_utf8; 2936 } 2937 else { 2938 if (! prog->float_substr) { 2939 if (! to_byte_substr(prog)) { 2940 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 2941 } 2942 } 2943 float_real = prog->float_substr; 2944 } 2945 2946 little = SvPV_const(float_real, len); 2947 if (SvTAIL(float_real)) { 2948 /* This means that float_real contains an artificial \n on 2949 * the end due to the presence of something like this: 2950 * /foo$/ where we can match both "foo" and "foo\n" at the 2951 * end of the string. So we have to compare the end of the 2952 * string first against the float_real without the \n and 2953 * then against the full float_real with the string. We 2954 * have to watch out for cases where the string might be 2955 * smaller than the float_real or the float_real without 2956 * the \n. */ 2957 char *checkpos= strend - len; 2958 DEBUG_OPTIMISE_r( 2959 PerlIO_printf(Perl_debug_log, 2960 "%sChecking for float_real.%s\n", 2961 PL_colors[4], PL_colors[5])); 2962 if (checkpos + 1 < strbeg) { 2963 /* can't match, even if we remove the trailing \n 2964 * string is too short to match */ 2965 DEBUG_EXECUTE_r( 2966 PerlIO_printf(Perl_debug_log, 2967 "%sString shorter than required trailing substring, cannot match.%s\n", 2968 PL_colors[4], PL_colors[5])); 2969 goto phooey; 2970 } else if (memEQ(checkpos + 1, little, len - 1)) { 2971 /* can match, the end of the string matches without the 2972 * "\n" */ 2973 last = checkpos + 1; 2974 } else if (checkpos < strbeg) { 2975 /* cant match, string is too short when the "\n" is 2976 * included */ 2977 DEBUG_EXECUTE_r( 2978 PerlIO_printf(Perl_debug_log, 2979 "%sString does not contain required trailing substring, cannot match.%s\n", 2980 PL_colors[4], PL_colors[5])); 2981 goto phooey; 2982 } else if (!multiline) { 2983 /* non multiline match, so compare with the "\n" at the 2984 * end of the string */ 2985 if (memEQ(checkpos, little, len)) { 2986 last= checkpos; 2987 } else { 2988 DEBUG_EXECUTE_r( 2989 PerlIO_printf(Perl_debug_log, 2990 "%sString does not contain required trailing substring, cannot match.%s\n", 2991 PL_colors[4], PL_colors[5])); 2992 goto phooey; 2993 } 2994 } else { 2995 /* multiline match, so we have to search for a place 2996 * where the full string is located */ 2997 goto find_last; 2998 } 2999 } else { 3000 find_last: 3001 if (len) 3002 last = rninstr(s, strend, little, little + len); 3003 else 3004 last = strend; /* matching "$" */ 3005 } 3006 if (!last) { 3007 /* at one point this block contained a comment which was 3008 * probably incorrect, which said that this was a "should not 3009 * happen" case. Even if it was true when it was written I am 3010 * pretty sure it is not anymore, so I have removed the comment 3011 * and replaced it with this one. Yves */ 3012 DEBUG_EXECUTE_r( 3013 PerlIO_printf(Perl_debug_log, 3014 "String does not contain required substring, cannot match.\n" 3015 )); 3016 goto phooey; 3017 } 3018 dontbother = strend - last + prog->float_min_offset; 3019 } 3020 if (minlen && (dontbother < minlen)) 3021 dontbother = minlen - 1; 3022 strend -= dontbother; /* this one's always in bytes! */ 3023 /* We don't know much -- general case. */ 3024 if (utf8_target) { 3025 for (;;) { 3026 if (regtry(reginfo, &s)) 3027 goto got_it; 3028 if (s >= strend) 3029 break; 3030 s += UTF8SKIP(s); 3031 }; 3032 } 3033 else { 3034 do { 3035 if (regtry(reginfo, &s)) 3036 goto got_it; 3037 } while (s++ < strend); 3038 } 3039 } 3040 3041 /* Failure. */ 3042 goto phooey; 3043 3044 got_it: 3045 /* s/// doesn't like it if $& is earlier than where we asked it to 3046 * start searching (which can happen on something like /.\G/) */ 3047 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 3048 && (prog->offs[0].start < stringarg - strbeg)) 3049 { 3050 /* this should only be possible under \G */ 3051 assert(prog->intflags & PREGf_GPOS_SEEN); 3052 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 3053 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 3054 goto phooey; 3055 } 3056 3057 DEBUG_BUFFERS_r( 3058 if (swap) 3059 PerlIO_printf(Perl_debug_log, 3060 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", 3061 PTR2UV(prog), 3062 PTR2UV(swap) 3063 ); 3064 ); 3065 Safefree(swap); 3066 3067 /* clean up; this will trigger destructors that will free all slabs 3068 * above the current one, and cleanup the regmatch_info_aux 3069 * and regmatch_info_aux_eval sructs */ 3070 3071 LEAVE_SCOPE(oldsave); 3072 3073 if (RXp_PAREN_NAMES(prog)) 3074 (void)hv_iterinit(RXp_PAREN_NAMES(prog)); 3075 3076 /* make sure $`, $&, $', and $digit will work later */ 3077 if ( !(flags & REXEC_NOT_FIRST) ) 3078 S_reg_set_capture_string(aTHX_ rx, 3079 strbeg, reginfo->strend, 3080 sv, flags, utf8_target); 3081 3082 return 1; 3083 3084 phooey: 3085 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", 3086 PL_colors[4], PL_colors[5])); 3087 3088 /* clean up; this will trigger destructors that will free all slabs 3089 * above the current one, and cleanup the regmatch_info_aux 3090 * and regmatch_info_aux_eval sructs */ 3091 3092 LEAVE_SCOPE(oldsave); 3093 3094 if (swap) { 3095 /* we failed :-( roll it back */ 3096 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, 3097 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", 3098 PTR2UV(prog), 3099 PTR2UV(prog->offs), 3100 PTR2UV(swap) 3101 )); 3102 Safefree(prog->offs); 3103 prog->offs = swap; 3104 } 3105 return 0; 3106 } 3107 3108 3109 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. 3110 * Do inc before dec, in case old and new rex are the same */ 3111 #define SET_reg_curpm(Re2) \ 3112 if (reginfo->info_aux_eval) { \ 3113 (void)ReREFCNT_inc(Re2); \ 3114 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ 3115 PM_SETRE((PL_reg_curpm), (Re2)); \ 3116 } 3117 3118 3119 /* 3120 - regtry - try match at specific point 3121 */ 3122 STATIC I32 /* 0 failure, 1 success */ 3123 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) 3124 { 3125 dVAR; 3126 CHECKPOINT lastcp; 3127 REGEXP *const rx = reginfo->prog; 3128 regexp *const prog = ReANY(rx); 3129 SSize_t result; 3130 RXi_GET_DECL(prog,progi); 3131 GET_RE_DEBUG_FLAGS_DECL; 3132 3133 PERL_ARGS_ASSERT_REGTRY; 3134 3135 reginfo->cutpoint=NULL; 3136 3137 prog->offs[0].start = *startposp - reginfo->strbeg; 3138 prog->lastparen = 0; 3139 prog->lastcloseparen = 0; 3140 3141 /* XXXX What this code is doing here?!!! There should be no need 3142 to do this again and again, prog->lastparen should take care of 3143 this! --ilya*/ 3144 3145 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 3146 * Actually, the code in regcppop() (which Ilya may be meaning by 3147 * prog->lastparen), is not needed at all by the test suite 3148 * (op/regexp, op/pat, op/split), but that code is needed otherwise 3149 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 3150 * Meanwhile, this code *is* needed for the 3151 * above-mentioned test suite tests to succeed. The common theme 3152 * on those tests seems to be returning null fields from matches. 3153 * --jhi updated by dapm */ 3154 #if 1 3155 if (prog->nparens) { 3156 regexp_paren_pair *pp = prog->offs; 3157 I32 i; 3158 for (i = prog->nparens; i > (I32)prog->lastparen; i--) { 3159 ++pp; 3160 pp->start = -1; 3161 pp->end = -1; 3162 } 3163 } 3164 #endif 3165 REGCP_SET(lastcp); 3166 result = regmatch(reginfo, *startposp, progi->program + 1); 3167 if (result != -1) { 3168 prog->offs[0].end = result; 3169 return 1; 3170 } 3171 if (reginfo->cutpoint) 3172 *startposp= reginfo->cutpoint; 3173 REGCP_UNWIND(lastcp); 3174 return 0; 3175 } 3176 3177 3178 #define sayYES goto yes 3179 #define sayNO goto no 3180 #define sayNO_SILENT goto no_silent 3181 3182 /* we dont use STMT_START/END here because it leads to 3183 "unreachable code" warnings, which are bogus, but distracting. */ 3184 #define CACHEsayNO \ 3185 if (ST.cache_mask) \ 3186 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ 3187 sayNO 3188 3189 /* this is used to determine how far from the left messages like 3190 'failed...' are printed. It should be set such that messages 3191 are inline with the regop output that created them. 3192 */ 3193 #define REPORT_CODE_OFF 32 3194 3195 3196 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ 3197 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ 3198 #define CHRTEST_NOT_A_CP_1 -999 3199 #define CHRTEST_NOT_A_CP_2 -998 3200 3201 /* grab a new slab and return the first slot in it */ 3202 3203 STATIC regmatch_state * 3204 S_push_slab(pTHX) 3205 { 3206 #if PERL_VERSION < 9 && !defined(PERL_CORE) 3207 dMY_CXT; 3208 #endif 3209 regmatch_slab *s = PL_regmatch_slab->next; 3210 if (!s) { 3211 Newx(s, 1, regmatch_slab); 3212 s->prev = PL_regmatch_slab; 3213 s->next = NULL; 3214 PL_regmatch_slab->next = s; 3215 } 3216 PL_regmatch_slab = s; 3217 return SLAB_FIRST(s); 3218 } 3219 3220 3221 /* push a new state then goto it */ 3222 3223 #define PUSH_STATE_GOTO(state, node, input) \ 3224 pushinput = input; \ 3225 scan = node; \ 3226 st->resume_state = state; \ 3227 goto push_state; 3228 3229 /* push a new state with success backtracking, then goto it */ 3230 3231 #define PUSH_YES_STATE_GOTO(state, node, input) \ 3232 pushinput = input; \ 3233 scan = node; \ 3234 st->resume_state = state; \ 3235 goto push_yes_state; 3236 3237 3238 3239 3240 /* 3241 3242 regmatch() - main matching routine 3243 3244 This is basically one big switch statement in a loop. We execute an op, 3245 set 'next' to point the next op, and continue. If we come to a point which 3246 we may need to backtrack to on failure such as (A|B|C), we push a 3247 backtrack state onto the backtrack stack. On failure, we pop the top 3248 state, and re-enter the loop at the state indicated. If there are no more 3249 states to pop, we return failure. 3250 3251 Sometimes we also need to backtrack on success; for example /A+/, where 3252 after successfully matching one A, we need to go back and try to 3253 match another one; similarly for lookahead assertions: if the assertion 3254 completes successfully, we backtrack to the state just before the assertion 3255 and then carry on. In these cases, the pushed state is marked as 3256 'backtrack on success too'. This marking is in fact done by a chain of 3257 pointers, each pointing to the previous 'yes' state. On success, we pop to 3258 the nearest yes state, discarding any intermediate failure-only states. 3259 Sometimes a yes state is pushed just to force some cleanup code to be 3260 called at the end of a successful match or submatch; e.g. (??{$re}) uses 3261 it to free the inner regex. 3262 3263 Note that failure backtracking rewinds the cursor position, while 3264 success backtracking leaves it alone. 3265 3266 A pattern is complete when the END op is executed, while a subpattern 3267 such as (?=foo) is complete when the SUCCESS op is executed. Both of these 3268 ops trigger the "pop to last yes state if any, otherwise return true" 3269 behaviour. 3270 3271 A common convention in this function is to use A and B to refer to the two 3272 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is 3273 the subpattern to be matched possibly multiple times, while B is the entire 3274 rest of the pattern. Variable and state names reflect this convention. 3275 3276 The states in the main switch are the union of ops and failure/success of 3277 substates associated with with that op. For example, IFMATCH is the op 3278 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 3279 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just 3280 successfully matched A and IFMATCH_A_fail is a state saying that we have 3281 just failed to match A. Resume states always come in pairs. The backtrack 3282 state we push is marked as 'IFMATCH_A', but when that is popped, we resume 3283 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking 3284 on success or failure. 3285 3286 The struct that holds a backtracking state is actually a big union, with 3287 one variant for each major type of op. The variable st points to the 3288 top-most backtrack struct. To make the code clearer, within each 3289 block of code we #define ST to alias the relevant union. 3290 3291 Here's a concrete example of a (vastly oversimplified) IFMATCH 3292 implementation: 3293 3294 switch (state) { 3295 .... 3296 3297 #define ST st->u.ifmatch 3298 3299 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 3300 ST.foo = ...; // some state we wish to save 3301 ... 3302 // push a yes backtrack state with a resume value of 3303 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the 3304 // first node of A: 3305 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); 3306 // NOTREACHED 3307 3308 case IFMATCH_A: // we have successfully executed A; now continue with B 3309 next = B; 3310 bar = ST.foo; // do something with the preserved value 3311 break; 3312 3313 case IFMATCH_A_fail: // A failed, so the assertion failed 3314 ...; // do some housekeeping, then ... 3315 sayNO; // propagate the failure 3316 3317 #undef ST 3318 3319 ... 3320 } 3321 3322 For any old-timers reading this who are familiar with the old recursive 3323 approach, the code above is equivalent to: 3324 3325 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 3326 { 3327 int foo = ... 3328 ... 3329 if (regmatch(A)) { 3330 next = B; 3331 bar = foo; 3332 break; 3333 } 3334 ...; // do some housekeeping, then ... 3335 sayNO; // propagate the failure 3336 } 3337 3338 The topmost backtrack state, pointed to by st, is usually free. If you 3339 want to claim it, populate any ST.foo fields in it with values you wish to 3340 save, then do one of 3341 3342 PUSH_STATE_GOTO(resume_state, node, newinput); 3343 PUSH_YES_STATE_GOTO(resume_state, node, newinput); 3344 3345 which sets that backtrack state's resume value to 'resume_state', pushes a 3346 new free entry to the top of the backtrack stack, then goes to 'node'. 3347 On backtracking, the free slot is popped, and the saved state becomes the 3348 new free state. An ST.foo field in this new top state can be temporarily 3349 accessed to retrieve values, but once the main loop is re-entered, it 3350 becomes available for reuse. 3351 3352 Note that the depth of the backtrack stack constantly increases during the 3353 left-to-right execution of the pattern, rather than going up and down with 3354 the pattern nesting. For example the stack is at its maximum at Z at the 3355 end of the pattern, rather than at X in the following: 3356 3357 /(((X)+)+)+....(Y)+....Z/ 3358 3359 The only exceptions to this are lookahead/behind assertions and the cut, 3360 (?>A), which pop all the backtrack states associated with A before 3361 continuing. 3362 3363 Backtrack state structs are allocated in slabs of about 4K in size. 3364 PL_regmatch_state and st always point to the currently active state, 3365 and PL_regmatch_slab points to the slab currently containing 3366 PL_regmatch_state. The first time regmatch() is called, the first slab is 3367 allocated, and is never freed until interpreter destruction. When the slab 3368 is full, a new one is allocated and chained to the end. At exit from 3369 regmatch(), slabs allocated since entry are freed. 3370 3371 */ 3372 3373 3374 #define DEBUG_STATE_pp(pp) \ 3375 DEBUG_STATE_r({ \ 3376 DUMP_EXEC_POS(locinput, scan, utf8_target); \ 3377 PerlIO_printf(Perl_debug_log, \ 3378 " %*s"pp" %s%s%s%s%s\n", \ 3379 depth*2, "", \ 3380 PL_reg_name[st->resume_state], \ 3381 ((st==yes_state||st==mark_state) ? "[" : ""), \ 3382 ((st==yes_state) ? "Y" : ""), \ 3383 ((st==mark_state) ? "M" : ""), \ 3384 ((st==yes_state||st==mark_state) ? "]" : "") \ 3385 ); \ 3386 }); 3387 3388 3389 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) 3390 3391 #ifdef DEBUGGING 3392 3393 STATIC void 3394 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, 3395 const char *start, const char *end, const char *blurb) 3396 { 3397 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; 3398 3399 PERL_ARGS_ASSERT_DEBUG_START_MATCH; 3400 3401 if (!PL_colorset) 3402 reginitcolors(); 3403 { 3404 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 3405 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); 3406 3407 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), 3408 start, end - start, 60); 3409 3410 PerlIO_printf(Perl_debug_log, 3411 "%s%s REx%s %s against %s\n", 3412 PL_colors[4], blurb, PL_colors[5], s0, s1); 3413 3414 if (utf8_target||utf8_pat) 3415 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", 3416 utf8_pat ? "pattern" : "", 3417 utf8_pat && utf8_target ? " and " : "", 3418 utf8_target ? "string" : "" 3419 ); 3420 } 3421 } 3422 3423 STATIC void 3424 S_dump_exec_pos(pTHX_ const char *locinput, 3425 const regnode *scan, 3426 const char *loc_regeol, 3427 const char *loc_bostr, 3428 const char *loc_reg_starttry, 3429 const bool utf8_target) 3430 { 3431 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; 3432 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 3433 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); 3434 /* The part of the string before starttry has one color 3435 (pref0_len chars), between starttry and current 3436 position another one (pref_len - pref0_len chars), 3437 after the current position the third one. 3438 We assume that pref0_len <= pref_len, otherwise we 3439 decrease pref0_len. */ 3440 int pref_len = (locinput - loc_bostr) > (5 + taill) - l 3441 ? (5 + taill) - l : locinput - loc_bostr; 3442 int pref0_len; 3443 3444 PERL_ARGS_ASSERT_DUMP_EXEC_POS; 3445 3446 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) 3447 pref_len++; 3448 pref0_len = pref_len - (locinput - loc_reg_starttry); 3449 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) 3450 l = ( loc_regeol - locinput > (5 + taill) - pref_len 3451 ? (5 + taill) - pref_len : loc_regeol - locinput); 3452 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) 3453 l--; 3454 if (pref0_len < 0) 3455 pref0_len = 0; 3456 if (pref0_len > pref_len) 3457 pref0_len = pref_len; 3458 { 3459 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; 3460 3461 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), 3462 (locinput - pref_len),pref0_len, 60, 4, 5); 3463 3464 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), 3465 (locinput - pref_len + pref0_len), 3466 pref_len - pref0_len, 60, 2, 3); 3467 3468 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), 3469 locinput, loc_regeol - locinput, 10, 0, 1); 3470 3471 const STRLEN tlen=len0+len1+len2; 3472 PerlIO_printf(Perl_debug_log, 3473 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", 3474 (IV)(locinput - loc_bostr), 3475 len0, s0, 3476 len1, s1, 3477 (docolor ? "" : "> <"), 3478 len2, s2, 3479 (int)(tlen > 19 ? 0 : 19 - tlen), 3480 ""); 3481 } 3482 } 3483 3484 #endif 3485 3486 /* reg_check_named_buff_matched() 3487 * Checks to see if a named buffer has matched. The data array of 3488 * buffer numbers corresponding to the buffer is expected to reside 3489 * in the regexp->data->data array in the slot stored in the ARG() of 3490 * node involved. Note that this routine doesn't actually care about the 3491 * name, that information is not preserved from compilation to execution. 3492 * Returns the index of the leftmost defined buffer with the given name 3493 * or 0 if non of the buffers matched. 3494 */ 3495 STATIC I32 3496 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) 3497 { 3498 I32 n; 3499 RXi_GET_DECL(rex,rexi); 3500 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 3501 I32 *nums=(I32*)SvPVX(sv_dat); 3502 3503 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; 3504 3505 for ( n=0; n<SvIVX(sv_dat); n++ ) { 3506 if ((I32)rex->lastparen >= nums[n] && 3507 rex->offs[nums[n]].end != -1) 3508 { 3509 return nums[n]; 3510 } 3511 } 3512 return 0; 3513 } 3514 3515 3516 static bool 3517 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, 3518 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) 3519 { 3520 /* This function determines if there are one or two characters that match 3521 * the first character of the passed-in EXACTish node <text_node>, and if 3522 * so, returns them in the passed-in pointers. 3523 * 3524 * If it determines that no possible character in the target string can 3525 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if 3526 * the first character in <text_node> requires UTF-8 to represent, and the 3527 * target string isn't in UTF-8.) 3528 * 3529 * If there are more than two characters that could match the beginning of 3530 * <text_node>, or if more context is required to determine a match or not, 3531 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID. 3532 * 3533 * The motiviation behind this function is to allow the caller to set up 3534 * tight loops for matching. If <text_node> is of type EXACT, there is 3535 * only one possible character that can match its first character, and so 3536 * the situation is quite simple. But things get much more complicated if 3537 * folding is involved. It may be that the first character of an EXACTFish 3538 * node doesn't participate in any possible fold, e.g., punctuation, so it 3539 * can be matched only by itself. The vast majority of characters that are 3540 * in folds match just two things, their lower and upper-case equivalents. 3541 * But not all are like that; some have multiple possible matches, or match 3542 * sequences of more than one character. This function sorts all that out. 3543 * 3544 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a 3545 * loop of trying to match A*, we know we can't exit where the thing 3546 * following it isn't a B. And something can't be a B unless it is the 3547 * beginning of B. By putting a quick test for that beginning in a tight 3548 * loop, we can rule out things that can't possibly be B without having to 3549 * break out of the loop, thus avoiding work. Similarly, if A is a single 3550 * character, we can make a tight loop matching A*, using the outputs of 3551 * this function. 3552 * 3553 * If the target string to match isn't in UTF-8, and there aren't 3554 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to 3555 * the one or two possible octets (which are characters in this situation) 3556 * that can match. In all cases, if there is only one character that can 3557 * match, *<c1p> and *<c2p> will be identical. 3558 * 3559 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8> 3560 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that 3561 * can match the beginning of <text_node>. They should be declared with at 3562 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is 3563 * undefined what these contain.) If one or both of the buffers are 3564 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the 3565 * corresponding invariant. If variant, the corresponding *<c1p> and/or 3566 * *<c2p> will be set to a negative number(s) that shouldn't match any code 3567 * point (unless inappropriately coerced to unsigned). *<c1p> will equal 3568 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */ 3569 3570 const bool utf8_target = reginfo->is_utf8_target; 3571 3572 UV c1 = CHRTEST_NOT_A_CP_1; 3573 UV c2 = CHRTEST_NOT_A_CP_2; 3574 bool use_chrtest_void = FALSE; 3575 const bool is_utf8_pat = reginfo->is_utf8_pat; 3576 3577 /* Used when we have both utf8 input and utf8 output, to avoid converting 3578 * to/from code points */ 3579 bool utf8_has_been_setup = FALSE; 3580 3581 dVAR; 3582 3583 U8 *pat = (U8*)STRING(text_node); 3584 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; 3585 3586 if (OP(text_node) == EXACT) { 3587 3588 /* In an exact node, only one thing can be matched, that first 3589 * character. If both the pat and the target are UTF-8, we can just 3590 * copy the input to the output, avoiding finding the code point of 3591 * that character */ 3592 if (!is_utf8_pat) { 3593 c2 = c1 = *pat; 3594 } 3595 else if (utf8_target) { 3596 Copy(pat, c1_utf8, UTF8SKIP(pat), U8); 3597 Copy(pat, c2_utf8, UTF8SKIP(pat), U8); 3598 utf8_has_been_setup = TRUE; 3599 } 3600 else { 3601 c2 = c1 = valid_utf8_to_uvchr(pat, NULL); 3602 } 3603 } 3604 else { /* an EXACTFish node */ 3605 U8 *pat_end = pat + STR_LEN(text_node); 3606 3607 /* An EXACTFL node has at least some characters unfolded, because what 3608 * they match is not known until now. So, now is the time to fold 3609 * the first few of them, as many as are needed to determine 'c1' and 3610 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need 3611 * to fold if in a UTF-8 locale, and then only the Sharp S; everything 3612 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we 3613 * need to fold as many characters as a single character can fold to, 3614 * so that later we can check if the first ones are such a multi-char 3615 * fold. But, in such a pattern only locale-problematic characters 3616 * aren't folded, so we can skip this completely if the first character 3617 * in the node isn't one of the tricky ones */ 3618 if (OP(text_node) == EXACTFL) { 3619 3620 if (! is_utf8_pat) { 3621 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) 3622 { 3623 folded[0] = folded[1] = 's'; 3624 pat = folded; 3625 pat_end = folded + 2; 3626 } 3627 } 3628 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { 3629 U8 *s = pat; 3630 U8 *d = folded; 3631 int i; 3632 3633 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { 3634 if (isASCII(*s)) { 3635 *(d++) = (U8) toFOLD_LC(*s); 3636 s++; 3637 } 3638 else { 3639 STRLEN len; 3640 _to_utf8_fold_flags(s, 3641 d, 3642 &len, 3643 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); 3644 d += len; 3645 s += UTF8SKIP(s); 3646 } 3647 } 3648 3649 pat = folded; 3650 pat_end = d; 3651 } 3652 } 3653 3654 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) 3655 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) 3656 { 3657 /* Multi-character folds require more context to sort out. Also 3658 * PL_utf8_foldclosures used below doesn't handle them, so have to 3659 * be handled outside this routine */ 3660 use_chrtest_void = TRUE; 3661 } 3662 else { /* an EXACTFish node which doesn't begin with a multi-char fold */ 3663 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; 3664 if (c1 > 256) { 3665 /* Load the folds hash, if not already done */ 3666 SV** listp; 3667 if (! PL_utf8_foldclosures) { 3668 if (! PL_utf8_tofold) { 3669 U8 dummy[UTF8_MAXBYTES_CASE+1]; 3670 3671 /* Force loading this by folding an above-Latin1 char */ 3672 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); 3673 assert(PL_utf8_tofold); /* Verify that worked */ 3674 } 3675 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); 3676 } 3677 3678 /* The fold closures data structure is a hash with the keys 3679 * being the UTF-8 of every character that is folded to, like 3680 * 'k', and the values each an array of all code points that 3681 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. 3682 * Multi-character folds are not included */ 3683 if ((! (listp = hv_fetch(PL_utf8_foldclosures, 3684 (char *) pat, 3685 UTF8SKIP(pat), 3686 FALSE)))) 3687 { 3688 /* Not found in the hash, therefore there are no folds 3689 * containing it, so there is only a single character that 3690 * could match */ 3691 c2 = c1; 3692 } 3693 else { /* Does participate in folds */ 3694 AV* list = (AV*) *listp; 3695 if (av_tindex(list) != 1) { 3696 3697 /* If there aren't exactly two folds to this, it is 3698 * outside the scope of this function */ 3699 use_chrtest_void = TRUE; 3700 } 3701 else { /* There are two. Get them */ 3702 SV** c_p = av_fetch(list, 0, FALSE); 3703 if (c_p == NULL) { 3704 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); 3705 } 3706 c1 = SvUV(*c_p); 3707 3708 c_p = av_fetch(list, 1, FALSE); 3709 if (c_p == NULL) { 3710 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); 3711 } 3712 c2 = SvUV(*c_p); 3713 3714 /* Folds that cross the 255/256 boundary are forbidden 3715 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and 3716 * one is ASCIII. Since the pattern character is above 3717 * 256, and its only other match is below 256, the only 3718 * legal match will be to itself. We have thrown away 3719 * the original, so have to compute which is the one 3720 * above 255 */ 3721 if ((c1 < 256) != (c2 < 256)) { 3722 if ((OP(text_node) == EXACTFL 3723 && ! IN_UTF8_CTYPE_LOCALE) 3724 || ((OP(text_node) == EXACTFA 3725 || OP(text_node) == EXACTFA_NO_TRIE) 3726 && (isASCII(c1) || isASCII(c2)))) 3727 { 3728 if (c1 < 256) { 3729 c1 = c2; 3730 } 3731 else { 3732 c2 = c1; 3733 } 3734 } 3735 } 3736 } 3737 } 3738 } 3739 else /* Here, c1 is < 255 */ 3740 if (utf8_target 3741 && HAS_NONLATIN1_FOLD_CLOSURE(c1) 3742 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) 3743 && ((OP(text_node) != EXACTFA 3744 && OP(text_node) != EXACTFA_NO_TRIE) 3745 || ! isASCII(c1))) 3746 { 3747 /* Here, there could be something above Latin1 in the target 3748 * which folds to this character in the pattern. All such 3749 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more 3750 * than two characters involved in their folds, so are outside 3751 * the scope of this function */ 3752 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { 3753 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; 3754 } 3755 else { 3756 use_chrtest_void = TRUE; 3757 } 3758 } 3759 else { /* Here nothing above Latin1 can fold to the pattern 3760 character */ 3761 switch (OP(text_node)) { 3762 3763 case EXACTFL: /* /l rules */ 3764 c2 = PL_fold_locale[c1]; 3765 break; 3766 3767 case EXACTF: /* This node only generated for non-utf8 3768 patterns */ 3769 assert(! is_utf8_pat); 3770 if (! utf8_target) { /* /d rules */ 3771 c2 = PL_fold[c1]; 3772 break; 3773 } 3774 /* FALLTHROUGH */ 3775 /* /u rules for all these. This happens to work for 3776 * EXACTFA as nothing in Latin1 folds to ASCII */ 3777 case EXACTFA_NO_TRIE: /* This node only generated for 3778 non-utf8 patterns */ 3779 assert(! is_utf8_pat); 3780 /* FALL THROUGH */ 3781 case EXACTFA: 3782 case EXACTFU_SS: 3783 case EXACTFU: 3784 c2 = PL_fold_latin1[c1]; 3785 break; 3786 3787 default: 3788 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); 3789 assert(0); /* NOTREACHED */ 3790 } 3791 } 3792 } 3793 } 3794 3795 /* Here have figured things out. Set up the returns */ 3796 if (use_chrtest_void) { 3797 *c2p = *c1p = CHRTEST_VOID; 3798 } 3799 else if (utf8_target) { 3800 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ 3801 uvchr_to_utf8(c1_utf8, c1); 3802 uvchr_to_utf8(c2_utf8, c2); 3803 } 3804 3805 /* Invariants are stored in both the utf8 and byte outputs; Use 3806 * negative numbers otherwise for the byte ones. Make sure that the 3807 * byte ones are the same iff the utf8 ones are the same */ 3808 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; 3809 *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) 3810 ? *c2_utf8 3811 : (c1 == c2) 3812 ? CHRTEST_NOT_A_CP_1 3813 : CHRTEST_NOT_A_CP_2; 3814 } 3815 else if (c1 > 255) { 3816 if (c2 > 255) { /* both possibilities are above what a non-utf8 string 3817 can represent */ 3818 return FALSE; 3819 } 3820 3821 *c1p = *c2p = c2; /* c2 is the only representable value */ 3822 } 3823 else { /* c1 is representable; see about c2 */ 3824 *c1p = c1; 3825 *c2p = (c2 < 256) ? c2 : c1; 3826 } 3827 3828 return TRUE; 3829 } 3830 3831 /* returns -1 on failure, $+[0] on success */ 3832 STATIC SSize_t 3833 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) 3834 { 3835 #if PERL_VERSION < 9 && !defined(PERL_CORE) 3836 dMY_CXT; 3837 #endif 3838 dVAR; 3839 const bool utf8_target = reginfo->is_utf8_target; 3840 const U32 uniflags = UTF8_ALLOW_DEFAULT; 3841 REGEXP *rex_sv = reginfo->prog; 3842 regexp *rex = ReANY(rex_sv); 3843 RXi_GET_DECL(rex,rexi); 3844 /* the current state. This is a cached copy of PL_regmatch_state */ 3845 regmatch_state *st; 3846 /* cache heavy used fields of st in registers */ 3847 regnode *scan; 3848 regnode *next; 3849 U32 n = 0; /* general value; init to avoid compiler warning */ 3850 SSize_t ln = 0; /* len or last; init to avoid compiler warning */ 3851 char *locinput = startpos; 3852 char *pushinput; /* where to continue after a PUSH */ 3853 I32 nextchr; /* is always set to UCHARAT(locinput) */ 3854 3855 bool result = 0; /* return value of S_regmatch */ 3856 int depth = 0; /* depth of backtrack stack */ 3857 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ 3858 const U32 max_nochange_depth = 3859 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? 3860 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; 3861 regmatch_state *yes_state = NULL; /* state to pop to on success of 3862 subpattern */ 3863 /* mark_state piggy backs on the yes_state logic so that when we unwind 3864 the stack on success we can update the mark_state as we go */ 3865 regmatch_state *mark_state = NULL; /* last mark state we have seen */ 3866 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ 3867 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ 3868 U32 state_num; 3869 bool no_final = 0; /* prevent failure from backtracking? */ 3870 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ 3871 char *startpoint = locinput; 3872 SV *popmark = NULL; /* are we looking for a mark? */ 3873 SV *sv_commit = NULL; /* last mark name seen in failure */ 3874 SV *sv_yes_mark = NULL; /* last mark name we have seen 3875 during a successful match */ 3876 U32 lastopen = 0; /* last open we saw */ 3877 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; 3878 SV* const oreplsv = GvSVn(PL_replgv); 3879 /* these three flags are set by various ops to signal information to 3880 * the very next op. They have a useful lifetime of exactly one loop 3881 * iteration, and are not preserved or restored by state pushes/pops 3882 */ 3883 bool sw = 0; /* the condition value in (?(cond)a|b) */ 3884 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ 3885 int logical = 0; /* the following EVAL is: 3886 0: (?{...}) 3887 1: (?(?{...})X|Y) 3888 2: (??{...}) 3889 or the following IFMATCH/UNLESSM is: 3890 false: plain (?=foo) 3891 true: used as a condition: (?(?=foo)) 3892 */ 3893 PAD* last_pad = NULL; 3894 dMULTICALL; 3895 I32 gimme = G_SCALAR; 3896 CV *caller_cv = NULL; /* who called us */ 3897 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ 3898 CHECKPOINT runops_cp; /* savestack position before executing EVAL */ 3899 U32 maxopenparen = 0; /* max '(' index seen so far */ 3900 int to_complement; /* Invert the result? */ 3901 _char_class_number classnum; 3902 bool is_utf8_pat = reginfo->is_utf8_pat; 3903 3904 #ifdef DEBUGGING 3905 GET_RE_DEBUG_FLAGS_DECL; 3906 #endif 3907 3908 /* protect against undef(*^R) */ 3909 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); 3910 3911 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ 3912 multicall_oldcatch = 0; 3913 multicall_cv = NULL; 3914 cx = NULL; 3915 PERL_UNUSED_VAR(multicall_cop); 3916 PERL_UNUSED_VAR(newsp); 3917 3918 3919 PERL_ARGS_ASSERT_REGMATCH; 3920 3921 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ 3922 PerlIO_printf(Perl_debug_log,"regmatch start\n"); 3923 })); 3924 3925 st = PL_regmatch_state; 3926 3927 /* Note that nextchr is a byte even in UTF */ 3928 SET_nextchr; 3929 scan = prog; 3930 while (scan != NULL) { 3931 3932 DEBUG_EXECUTE_r( { 3933 SV * const prop = sv_newmortal(); 3934 regnode *rnext=regnext(scan); 3935 DUMP_EXEC_POS( locinput, scan, utf8_target ); 3936 regprop(rex, prop, scan, reginfo); 3937 3938 PerlIO_printf(Perl_debug_log, 3939 "%3"IVdf":%*s%s(%"IVdf")\n", 3940 (IV)(scan - rexi->program), depth*2, "", 3941 SvPVX_const(prop), 3942 (PL_regkind[OP(scan)] == END || !rnext) ? 3943 0 : (IV)(rnext - rexi->program)); 3944 }); 3945 3946 next = scan + NEXT_OFF(scan); 3947 if (next == scan) 3948 next = NULL; 3949 state_num = OP(scan); 3950 3951 reenter_switch: 3952 to_complement = 0; 3953 3954 SET_nextchr; 3955 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); 3956 3957 switch (state_num) { 3958 case BOL: /* /^../ */ 3959 case SBOL: /* /^../s */ 3960 if (locinput == reginfo->strbeg) 3961 break; 3962 sayNO; 3963 3964 case MBOL: /* /^../m */ 3965 if (locinput == reginfo->strbeg || 3966 (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) 3967 { 3968 break; 3969 } 3970 sayNO; 3971 3972 case GPOS: /* \G */ 3973 if (locinput == reginfo->ganch) 3974 break; 3975 sayNO; 3976 3977 case KEEPS: /* \K */ 3978 /* update the startpoint */ 3979 st->u.keeper.val = rex->offs[0].start; 3980 rex->offs[0].start = locinput - reginfo->strbeg; 3981 PUSH_STATE_GOTO(KEEPS_next, next, locinput); 3982 assert(0); /*NOTREACHED*/ 3983 case KEEPS_next_fail: 3984 /* rollback the start point change */ 3985 rex->offs[0].start = st->u.keeper.val; 3986 sayNO_SILENT; 3987 assert(0); /*NOTREACHED*/ 3988 3989 case MEOL: /* /..$/m */ 3990 if (!NEXTCHR_IS_EOS && nextchr != '\n') 3991 sayNO; 3992 break; 3993 3994 case EOL: /* /..$/ */ 3995 /* FALL THROUGH */ 3996 case SEOL: /* /..$/s */ 3997 if (!NEXTCHR_IS_EOS && nextchr != '\n') 3998 sayNO; 3999 if (reginfo->strend - locinput > 1) 4000 sayNO; 4001 break; 4002 4003 case EOS: /* \z */ 4004 if (!NEXTCHR_IS_EOS) 4005 sayNO; 4006 break; 4007 4008 case SANY: /* /./s */ 4009 if (NEXTCHR_IS_EOS) 4010 sayNO; 4011 goto increment_locinput; 4012 4013 case CANY: /* \C */ 4014 if (NEXTCHR_IS_EOS) 4015 sayNO; 4016 locinput++; 4017 break; 4018 4019 case REG_ANY: /* /./ */ 4020 if ((NEXTCHR_IS_EOS) || nextchr == '\n') 4021 sayNO; 4022 goto increment_locinput; 4023 4024 4025 #undef ST 4026 #define ST st->u.trie 4027 case TRIEC: /* (ab|cd) with known charclass */ 4028 /* In this case the charclass data is available inline so 4029 we can fail fast without a lot of extra overhead. 4030 */ 4031 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) { 4032 DEBUG_EXECUTE_r( 4033 PerlIO_printf(Perl_debug_log, 4034 "%*s %sfailed to match trie start class...%s\n", 4035 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 4036 ); 4037 sayNO_SILENT; 4038 assert(0); /* NOTREACHED */ 4039 } 4040 /* FALL THROUGH */ 4041 case TRIE: /* (ab|cd) */ 4042 /* the basic plan of execution of the trie is: 4043 * At the beginning, run though all the states, and 4044 * find the longest-matching word. Also remember the position 4045 * of the shortest matching word. For example, this pattern: 4046 * 1 2 3 4 5 4047 * ab|a|x|abcd|abc 4048 * when matched against the string "abcde", will generate 4049 * accept states for all words except 3, with the longest 4050 * matching word being 4, and the shortest being 2 (with 4051 * the position being after char 1 of the string). 4052 * 4053 * Then for each matching word, in word order (i.e. 1,2,4,5), 4054 * we run the remainder of the pattern; on each try setting 4055 * the current position to the character following the word, 4056 * returning to try the next word on failure. 4057 * 4058 * We avoid having to build a list of words at runtime by 4059 * using a compile-time structure, wordinfo[].prev, which 4060 * gives, for each word, the previous accepting word (if any). 4061 * In the case above it would contain the mappings 1->2, 2->0, 4062 * 3->0, 4->5, 5->1. We can use this table to generate, from 4063 * the longest word (4 above), a list of all words, by 4064 * following the list of prev pointers; this gives us the 4065 * unordered list 4,5,1,2. Then given the current word we have 4066 * just tried, we can go through the list and find the 4067 * next-biggest word to try (so if we just failed on word 2, 4068 * the next in the list is 4). 4069 * 4070 * Since at runtime we don't record the matching position in 4071 * the string for each word, we have to work that out for 4072 * each word we're about to process. The wordinfo table holds 4073 * the character length of each word; given that we recorded 4074 * at the start: the position of the shortest word and its 4075 * length in chars, we just need to move the pointer the 4076 * difference between the two char lengths. Depending on 4077 * Unicode status and folding, that's cheap or expensive. 4078 * 4079 * This algorithm is optimised for the case where are only a 4080 * small number of accept states, i.e. 0,1, or maybe 2. 4081 * With lots of accepts states, and having to try all of them, 4082 * it becomes quadratic on number of accept states to find all 4083 * the next words. 4084 */ 4085 4086 { 4087 /* what type of TRIE am I? (utf8 makes this contextual) */ 4088 DECL_TRIE_TYPE(scan); 4089 4090 /* what trie are we using right now */ 4091 reg_trie_data * const trie 4092 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; 4093 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); 4094 U32 state = trie->startstate; 4095 4096 if ( trie->bitmap 4097 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) 4098 { 4099 if (trie->states[ state ].wordnum) { 4100 DEBUG_EXECUTE_r( 4101 PerlIO_printf(Perl_debug_log, 4102 "%*s %smatched empty string...%s\n", 4103 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 4104 ); 4105 if (!trie->jump) 4106 break; 4107 } else { 4108 DEBUG_EXECUTE_r( 4109 PerlIO_printf(Perl_debug_log, 4110 "%*s %sfailed to match trie start class...%s\n", 4111 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 4112 ); 4113 sayNO_SILENT; 4114 } 4115 } 4116 4117 { 4118 U8 *uc = ( U8* )locinput; 4119 4120 STRLEN len = 0; 4121 STRLEN foldlen = 0; 4122 U8 *uscan = (U8*)NULL; 4123 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 4124 U32 charcount = 0; /* how many input chars we have matched */ 4125 U32 accepted = 0; /* have we seen any accepting states? */ 4126 4127 ST.jump = trie->jump; 4128 ST.me = scan; 4129 ST.firstpos = NULL; 4130 ST.longfold = FALSE; /* char longer if folded => it's harder */ 4131 ST.nextword = 0; 4132 4133 /* fully traverse the TRIE; note the position of the 4134 shortest accept state and the wordnum of the longest 4135 accept state */ 4136 4137 while ( state && uc <= (U8*)(reginfo->strend) ) { 4138 U32 base = trie->states[ state ].trans.base; 4139 UV uvc = 0; 4140 U16 charid = 0; 4141 U16 wordnum; 4142 wordnum = trie->states[ state ].wordnum; 4143 4144 if (wordnum) { /* it's an accept state */ 4145 if (!accepted) { 4146 accepted = 1; 4147 /* record first match position */ 4148 if (ST.longfold) { 4149 ST.firstpos = (U8*)locinput; 4150 ST.firstchars = 0; 4151 } 4152 else { 4153 ST.firstpos = uc; 4154 ST.firstchars = charcount; 4155 } 4156 } 4157 if (!ST.nextword || wordnum < ST.nextword) 4158 ST.nextword = wordnum; 4159 ST.topword = wordnum; 4160 } 4161 4162 DEBUG_TRIE_EXECUTE_r({ 4163 DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); 4164 PerlIO_printf( Perl_debug_log, 4165 "%*s %sState: %4"UVxf" Accepted: %c ", 4166 2+depth * 2, "", PL_colors[4], 4167 (UV)state, (accepted ? 'Y' : 'N')); 4168 }); 4169 4170 /* read a char and goto next state */ 4171 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) { 4172 I32 offset; 4173 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 4174 uscan, len, uvc, charid, foldlen, 4175 foldbuf, uniflags); 4176 charcount++; 4177 if (foldlen>0) 4178 ST.longfold = TRUE; 4179 if (charid && 4180 ( ((offset = 4181 base + charid - 1 - trie->uniquecharcount)) >= 0) 4182 4183 && ((U32)offset < trie->lasttrans) 4184 && trie->trans[offset].check == state) 4185 { 4186 state = trie->trans[offset].next; 4187 } 4188 else { 4189 state = 0; 4190 } 4191 uc += len; 4192 4193 } 4194 else { 4195 state = 0; 4196 } 4197 DEBUG_TRIE_EXECUTE_r( 4198 PerlIO_printf( Perl_debug_log, 4199 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", 4200 charid, uvc, (UV)state, PL_colors[5] ); 4201 ); 4202 } 4203 if (!accepted) 4204 sayNO; 4205 4206 /* calculate total number of accept states */ 4207 { 4208 U16 w = ST.topword; 4209 accepted = 0; 4210 while (w) { 4211 w = trie->wordinfo[w].prev; 4212 accepted++; 4213 } 4214 ST.accepted = accepted; 4215 } 4216 4217 DEBUG_EXECUTE_r( 4218 PerlIO_printf( Perl_debug_log, 4219 "%*s %sgot %"IVdf" possible matches%s\n", 4220 REPORT_CODE_OFF + depth * 2, "", 4221 PL_colors[4], (IV)ST.accepted, PL_colors[5] ); 4222 ); 4223 goto trie_first_try; /* jump into the fail handler */ 4224 }} 4225 assert(0); /* NOTREACHED */ 4226 4227 case TRIE_next_fail: /* we failed - try next alternative */ 4228 { 4229 U8 *uc; 4230 if ( ST.jump) { 4231 REGCP_UNWIND(ST.cp); 4232 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 4233 } 4234 if (!--ST.accepted) { 4235 DEBUG_EXECUTE_r({ 4236 PerlIO_printf( Perl_debug_log, 4237 "%*s %sTRIE failed...%s\n", 4238 REPORT_CODE_OFF+depth*2, "", 4239 PL_colors[4], 4240 PL_colors[5] ); 4241 }); 4242 sayNO_SILENT; 4243 } 4244 { 4245 /* Find next-highest word to process. Note that this code 4246 * is O(N^2) per trie run (O(N) per branch), so keep tight */ 4247 U16 min = 0; 4248 U16 word; 4249 U16 const nextword = ST.nextword; 4250 reg_trie_wordinfo * const wordinfo 4251 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; 4252 for (word=ST.topword; word; word=wordinfo[word].prev) { 4253 if (word > nextword && (!min || word < min)) 4254 min = word; 4255 } 4256 ST.nextword = min; 4257 } 4258 4259 trie_first_try: 4260 if (do_cutgroup) { 4261 do_cutgroup = 0; 4262 no_final = 0; 4263 } 4264 4265 if ( ST.jump) { 4266 ST.lastparen = rex->lastparen; 4267 ST.lastcloseparen = rex->lastcloseparen; 4268 REGCP_SET(ST.cp); 4269 } 4270 4271 /* find start char of end of current word */ 4272 { 4273 U32 chars; /* how many chars to skip */ 4274 reg_trie_data * const trie 4275 = (reg_trie_data*)rexi->data->data[ARG(ST.me)]; 4276 4277 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) 4278 >= ST.firstchars); 4279 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) 4280 - ST.firstchars; 4281 uc = ST.firstpos; 4282 4283 if (ST.longfold) { 4284 /* the hard option - fold each char in turn and find 4285 * its folded length (which may be different */ 4286 U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; 4287 STRLEN foldlen; 4288 STRLEN len; 4289 UV uvc; 4290 U8 *uscan; 4291 4292 while (chars) { 4293 if (utf8_target) { 4294 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, 4295 uniflags); 4296 uc += len; 4297 } 4298 else { 4299 uvc = *uc; 4300 uc++; 4301 } 4302 uvc = to_uni_fold(uvc, foldbuf, &foldlen); 4303 uscan = foldbuf; 4304 while (foldlen) { 4305 if (!--chars) 4306 break; 4307 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, 4308 uniflags); 4309 uscan += len; 4310 foldlen -= len; 4311 } 4312 } 4313 } 4314 else { 4315 if (utf8_target) 4316 while (chars--) 4317 uc += UTF8SKIP(uc); 4318 else 4319 uc += chars; 4320 } 4321 } 4322 4323 scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) 4324 ? ST.jump[ST.nextword] 4325 : NEXT_OFF(ST.me)); 4326 4327 DEBUG_EXECUTE_r({ 4328 PerlIO_printf( Perl_debug_log, 4329 "%*s %sTRIE matched word #%d, continuing%s\n", 4330 REPORT_CODE_OFF+depth*2, "", 4331 PL_colors[4], 4332 ST.nextword, 4333 PL_colors[5] 4334 ); 4335 }); 4336 4337 if (ST.accepted > 1 || has_cutgroup) { 4338 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc); 4339 assert(0); /* NOTREACHED */ 4340 } 4341 /* only one choice left - just continue */ 4342 DEBUG_EXECUTE_r({ 4343 AV *const trie_words 4344 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); 4345 SV ** const tmp = av_fetch( trie_words, 4346 ST.nextword-1, 0 ); 4347 SV *sv= tmp ? sv_newmortal() : NULL; 4348 4349 PerlIO_printf( Perl_debug_log, 4350 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n", 4351 REPORT_CODE_OFF+depth*2, "", PL_colors[4], 4352 ST.nextword, 4353 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 4354 PL_colors[0], PL_colors[1], 4355 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII 4356 ) 4357 : "not compiled under -Dr", 4358 PL_colors[5] ); 4359 }); 4360 4361 locinput = (char*)uc; 4362 continue; /* execute rest of RE */ 4363 assert(0); /* NOTREACHED */ 4364 } 4365 #undef ST 4366 4367 case EXACT: { /* /abc/ */ 4368 char *s = STRING(scan); 4369 ln = STR_LEN(scan); 4370 if (utf8_target != is_utf8_pat) { 4371 /* The target and the pattern have differing utf8ness. */ 4372 char *l = locinput; 4373 const char * const e = s + ln; 4374 4375 if (utf8_target) { 4376 /* The target is utf8, the pattern is not utf8. 4377 * Above-Latin1 code points can't match the pattern; 4378 * invariants match exactly, and the other Latin1 ones need 4379 * to be downgraded to a single byte in order to do the 4380 * comparison. (If we could be confident that the target 4381 * is not malformed, this could be refactored to have fewer 4382 * tests by just assuming that if the first bytes match, it 4383 * is an invariant, but there are tests in the test suite 4384 * dealing with (??{...}) which violate this) */ 4385 while (s < e) { 4386 if (l >= reginfo->strend 4387 || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) 4388 { 4389 sayNO; 4390 } 4391 if (UTF8_IS_INVARIANT(*(U8*)l)) { 4392 if (*l != *s) { 4393 sayNO; 4394 } 4395 l++; 4396 } 4397 else { 4398 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) 4399 { 4400 sayNO; 4401 } 4402 l += 2; 4403 } 4404 s++; 4405 } 4406 } 4407 else { 4408 /* The target is not utf8, the pattern is utf8. */ 4409 while (s < e) { 4410 if (l >= reginfo->strend 4411 || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) 4412 { 4413 sayNO; 4414 } 4415 if (UTF8_IS_INVARIANT(*(U8*)s)) { 4416 if (*s != *l) { 4417 sayNO; 4418 } 4419 s++; 4420 } 4421 else { 4422 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) 4423 { 4424 sayNO; 4425 } 4426 s += 2; 4427 } 4428 l++; 4429 } 4430 } 4431 locinput = l; 4432 } 4433 else { 4434 /* The target and the pattern have the same utf8ness. */ 4435 /* Inline the first character, for speed. */ 4436 if (reginfo->strend - locinput < ln 4437 || UCHARAT(s) != nextchr 4438 || (ln > 1 && memNE(s, locinput, ln))) 4439 { 4440 sayNO; 4441 } 4442 locinput += ln; 4443 } 4444 break; 4445 } 4446 4447 case EXACTFL: { /* /abc/il */ 4448 re_fold_t folder; 4449 const U8 * fold_array; 4450 const char * s; 4451 U32 fold_utf8_flags; 4452 4453 folder = foldEQ_locale; 4454 fold_array = PL_fold_locale; 4455 fold_utf8_flags = FOLDEQ_LOCALE; 4456 goto do_exactf; 4457 4458 case EXACTFU_SS: /* /\x{df}/iu */ 4459 case EXACTFU: /* /abc/iu */ 4460 folder = foldEQ_latin1; 4461 fold_array = PL_fold_latin1; 4462 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; 4463 goto do_exactf; 4464 4465 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 4466 patterns */ 4467 assert(! is_utf8_pat); 4468 /* FALL THROUGH */ 4469 case EXACTFA: /* /abc/iaa */ 4470 folder = foldEQ_latin1; 4471 fold_array = PL_fold_latin1; 4472 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; 4473 goto do_exactf; 4474 4475 case EXACTF: /* /abc/i This node only generated for 4476 non-utf8 patterns */ 4477 assert(! is_utf8_pat); 4478 folder = foldEQ; 4479 fold_array = PL_fold; 4480 fold_utf8_flags = 0; 4481 4482 do_exactf: 4483 s = STRING(scan); 4484 ln = STR_LEN(scan); 4485 4486 if (utf8_target 4487 || is_utf8_pat 4488 || state_num == EXACTFU_SS 4489 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 4490 { 4491 /* Either target or the pattern are utf8, or has the issue where 4492 * the fold lengths may differ. */ 4493 const char * const l = locinput; 4494 char *e = reginfo->strend; 4495 4496 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat, 4497 l, &e, 0, utf8_target, fold_utf8_flags)) 4498 { 4499 sayNO; 4500 } 4501 locinput = e; 4502 break; 4503 } 4504 4505 /* Neither the target nor the pattern are utf8 */ 4506 if (UCHARAT(s) != nextchr 4507 && !NEXTCHR_IS_EOS 4508 && UCHARAT(s) != fold_array[nextchr]) 4509 { 4510 sayNO; 4511 } 4512 if (reginfo->strend - locinput < ln) 4513 sayNO; 4514 if (ln > 1 && ! folder(s, locinput, ln)) 4515 sayNO; 4516 locinput += ln; 4517 break; 4518 } 4519 4520 /* XXX Could improve efficiency by separating these all out using a 4521 * macro or in-line function. At that point regcomp.c would no longer 4522 * have to set the FLAGS fields of these */ 4523 case BOUNDL: /* /\b/l */ 4524 case NBOUNDL: /* /\B/l */ 4525 case BOUND: /* /\b/ */ 4526 case BOUNDU: /* /\b/u */ 4527 case BOUNDA: /* /\b/a */ 4528 case NBOUND: /* /\B/ */ 4529 case NBOUNDU: /* /\B/u */ 4530 case NBOUNDA: /* /\B/a */ 4531 /* was last char in word? */ 4532 if (utf8_target 4533 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET 4534 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) 4535 { 4536 if (locinput == reginfo->strbeg) 4537 ln = '\n'; 4538 else { 4539 const U8 * const r = 4540 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); 4541 4542 ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, 4543 0, uniflags); 4544 } 4545 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { 4546 ln = isWORDCHAR_uni(ln); 4547 if (NEXTCHR_IS_EOS) 4548 n = 0; 4549 else { 4550 LOAD_UTF8_CHARCLASS_ALNUM(); 4551 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput, 4552 utf8_target); 4553 } 4554 } 4555 else { 4556 ln = isWORDCHAR_LC_uvchr(ln); 4557 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); 4558 } 4559 } 4560 else { 4561 4562 /* Here the string isn't utf8, or is utf8 and only ascii 4563 * characters are to match \w. In the latter case looking at 4564 * the byte just prior to the current one may be just the final 4565 * byte of a multi-byte character. This is ok. There are two 4566 * cases: 4567 * 1) it is a single byte character, and then the test is doing 4568 * just what it's supposed to. 4569 * 2) it is a multi-byte character, in which case the final 4570 * byte is never mistakable for ASCII, and so the test 4571 * will say it is not a word character, which is the 4572 * correct answer. */ 4573 ln = (locinput != reginfo->strbeg) ? 4574 UCHARAT(locinput - 1) : '\n'; 4575 switch (FLAGS(scan)) { 4576 case REGEX_UNICODE_CHARSET: 4577 ln = isWORDCHAR_L1(ln); 4578 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr); 4579 break; 4580 case REGEX_LOCALE_CHARSET: 4581 ln = isWORDCHAR_LC(ln); 4582 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr); 4583 break; 4584 case REGEX_DEPENDS_CHARSET: 4585 ln = isWORDCHAR(ln); 4586 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr); 4587 break; 4588 case REGEX_ASCII_RESTRICTED_CHARSET: 4589 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 4590 ln = isWORDCHAR_A(ln); 4591 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr); 4592 break; 4593 default: 4594 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); 4595 break; 4596 } 4597 } 4598 /* Note requires that all BOUNDs be lower than all NBOUNDs in 4599 * regcomp.sym */ 4600 if (((!ln) == (!n)) == (OP(scan) < NBOUND)) 4601 sayNO; 4602 break; 4603 4604 case ANYOF: /* /[abc]/ */ 4605 if (NEXTCHR_IS_EOS) 4606 sayNO; 4607 if (utf8_target) { 4608 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, 4609 utf8_target)) 4610 sayNO; 4611 locinput += UTF8SKIP(locinput); 4612 } 4613 else { 4614 if (!REGINCLASS(rex, scan, (U8*)locinput)) 4615 sayNO; 4616 locinput++; 4617 } 4618 break; 4619 4620 /* The argument (FLAGS) to all the POSIX node types is the class number 4621 * */ 4622 4623 case NPOSIXL: /* \W or [:^punct:] etc. under /l */ 4624 to_complement = 1; 4625 /* FALLTHROUGH */ 4626 4627 case POSIXL: /* \w or [:punct:] etc. under /l */ 4628 if (NEXTCHR_IS_EOS) 4629 sayNO; 4630 4631 /* Use isFOO_lc() for characters within Latin1. (Note that 4632 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 4633 * wouldn't be invariant) */ 4634 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { 4635 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { 4636 sayNO; 4637 } 4638 } 4639 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { 4640 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), 4641 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, 4642 *(locinput + 1)))))) 4643 { 4644 sayNO; 4645 } 4646 } 4647 else { /* Here, must be an above Latin-1 code point */ 4648 goto utf8_posix_not_eos; 4649 } 4650 4651 /* Here, must be utf8 */ 4652 locinput += UTF8SKIP(locinput); 4653 break; 4654 4655 case NPOSIXD: /* \W or [:^punct:] etc. under /d */ 4656 to_complement = 1; 4657 /* FALLTHROUGH */ 4658 4659 case POSIXD: /* \w or [:punct:] etc. under /d */ 4660 if (utf8_target) { 4661 goto utf8_posix; 4662 } 4663 goto posixa; 4664 4665 case NPOSIXA: /* \W or [:^punct:] etc. under /a */ 4666 4667 if (NEXTCHR_IS_EOS) { 4668 sayNO; 4669 } 4670 4671 /* All UTF-8 variants match */ 4672 if (! UTF8_IS_INVARIANT(nextchr)) { 4673 goto increment_locinput; 4674 } 4675 4676 to_complement = 1; 4677 /* FALLTHROUGH */ 4678 4679 case POSIXA: /* \w or [:punct:] etc. under /a */ 4680 4681 posixa: 4682 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in 4683 * UTF-8, and also from NPOSIXA even in UTF-8 when the current 4684 * character is a single byte */ 4685 4686 if (NEXTCHR_IS_EOS 4687 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, 4688 FLAGS(scan))))) 4689 { 4690 sayNO; 4691 } 4692 4693 /* Here we are either not in utf8, or we matched a utf8-invariant, 4694 * so the next char is the next byte */ 4695 locinput++; 4696 break; 4697 4698 case NPOSIXU: /* \W or [:^punct:] etc. under /u */ 4699 to_complement = 1; 4700 /* FALLTHROUGH */ 4701 4702 case POSIXU: /* \w or [:punct:] etc. under /u */ 4703 utf8_posix: 4704 if (NEXTCHR_IS_EOS) { 4705 sayNO; 4706 } 4707 utf8_posix_not_eos: 4708 4709 /* Use _generic_isCC() for characters within Latin1. (Note that 4710 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 4711 * wouldn't be invariant) */ 4712 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { 4713 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, 4714 FLAGS(scan))))) 4715 { 4716 sayNO; 4717 } 4718 locinput++; 4719 } 4720 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { 4721 if (! (to_complement 4722 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, 4723 *(locinput + 1)), 4724 FLAGS(scan))))) 4725 { 4726 sayNO; 4727 } 4728 locinput += 2; 4729 } 4730 else { /* Handle above Latin-1 code points */ 4731 classnum = (_char_class_number) FLAGS(scan); 4732 if (classnum < _FIRST_NON_SWASH_CC) { 4733 4734 /* Here, uses a swash to find such code points. Load if if 4735 * not done already */ 4736 if (! PL_utf8_swash_ptrs[classnum]) { 4737 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; 4738 PL_utf8_swash_ptrs[classnum] 4739 = _core_swash_init("utf8", 4740 "", 4741 &PL_sv_undef, 1, 0, 4742 PL_XPosix_ptrs[classnum], &flags); 4743 } 4744 if (! (to_complement 4745 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], 4746 (U8 *) locinput, TRUE)))) 4747 { 4748 sayNO; 4749 } 4750 } 4751 else { /* Here, uses macros to find above Latin-1 code points */ 4752 switch (classnum) { 4753 case _CC_ENUM_SPACE: /* XXX would require separate 4754 code if we revert the change 4755 of \v matching this */ 4756 case _CC_ENUM_PSXSPC: 4757 if (! (to_complement 4758 ^ cBOOL(is_XPERLSPACE_high(locinput)))) 4759 { 4760 sayNO; 4761 } 4762 break; 4763 case _CC_ENUM_BLANK: 4764 if (! (to_complement 4765 ^ cBOOL(is_HORIZWS_high(locinput)))) 4766 { 4767 sayNO; 4768 } 4769 break; 4770 case _CC_ENUM_XDIGIT: 4771 if (! (to_complement 4772 ^ cBOOL(is_XDIGIT_high(locinput)))) 4773 { 4774 sayNO; 4775 } 4776 break; 4777 case _CC_ENUM_VERTSPACE: 4778 if (! (to_complement 4779 ^ cBOOL(is_VERTWS_high(locinput)))) 4780 { 4781 sayNO; 4782 } 4783 break; 4784 default: /* The rest, e.g. [:cntrl:], can't match 4785 above Latin1 */ 4786 if (! to_complement) { 4787 sayNO; 4788 } 4789 break; 4790 } 4791 } 4792 locinput += UTF8SKIP(locinput); 4793 } 4794 break; 4795 4796 case CLUMP: /* Match \X: logical Unicode character. This is defined as 4797 a Unicode extended Grapheme Cluster */ 4798 /* From http://www.unicode.org/reports/tr29 (5.2 version). An 4799 extended Grapheme Cluster is: 4800 4801 CR LF 4802 | Prepend* Begin Extend* 4803 | . 4804 4805 Begin is: ( Special_Begin | ! Control ) 4806 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) 4807 Extend is: ( Grapheme_Extend | Spacing_Mark ) 4808 Control is: [ GCB_Control | CR | LF ] 4809 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) 4810 4811 If we create a 'Regular_Begin' = Begin - Special_Begin, then 4812 we can rewrite 4813 4814 Begin is ( Regular_Begin + Special Begin ) 4815 4816 It turns out that 98.4% of all Unicode code points match 4817 Regular_Begin. Doing it this way eliminates a table match in 4818 the previous implementation for almost all Unicode code points. 4819 4820 There is a subtlety with Prepend* which showed up in testing. 4821 Note that the Begin, and only the Begin is required in: 4822 | Prepend* Begin Extend* 4823 Also, Begin contains '! Control'. A Prepend must be a 4824 '! Control', which means it must also be a Begin. What it 4825 comes down to is that if we match Prepend* and then find no 4826 suitable Begin afterwards, that if we backtrack the last 4827 Prepend, that one will be a suitable Begin. 4828 */ 4829 4830 if (NEXTCHR_IS_EOS) 4831 sayNO; 4832 if (! utf8_target) { 4833 4834 /* Match either CR LF or '.', as all the other possibilities 4835 * require utf8 */ 4836 locinput++; /* Match the . or CR */ 4837 if (nextchr == '\r' /* And if it was CR, and the next is LF, 4838 match the LF */ 4839 && locinput < reginfo->strend 4840 && UCHARAT(locinput) == '\n') 4841 { 4842 locinput++; 4843 } 4844 } 4845 else { 4846 4847 /* Utf8: See if is ( CR LF ); already know that locinput < 4848 * reginfo->strend, so locinput+1 is in bounds */ 4849 if ( nextchr == '\r' && locinput+1 < reginfo->strend 4850 && UCHARAT(locinput + 1) == '\n') 4851 { 4852 locinput += 2; 4853 } 4854 else { 4855 STRLEN len; 4856 4857 /* In case have to backtrack to beginning, then match '.' */ 4858 char *starting = locinput; 4859 4860 /* In case have to backtrack the last prepend */ 4861 char *previous_prepend = NULL; 4862 4863 LOAD_UTF8_CHARCLASS_GCB(); 4864 4865 /* Match (prepend)* */ 4866 while (locinput < reginfo->strend 4867 && (len = is_GCB_Prepend_utf8(locinput))) 4868 { 4869 previous_prepend = locinput; 4870 locinput += len; 4871 } 4872 4873 /* As noted above, if we matched a prepend character, but 4874 * the next thing won't match, back off the last prepend we 4875 * matched, as it is guaranteed to match the begin */ 4876 if (previous_prepend 4877 && (locinput >= reginfo->strend 4878 || (! swash_fetch(PL_utf8_X_regular_begin, 4879 (U8*)locinput, utf8_target) 4880 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput))) 4881 ) 4882 { 4883 locinput = previous_prepend; 4884 } 4885 4886 /* Note that here we know reginfo->strend > locinput, as we 4887 * tested that upon input to this switch case, and if we 4888 * moved locinput forward, we tested the result just above 4889 * and it either passed, or we backed off so that it will 4890 * now pass */ 4891 if (swash_fetch(PL_utf8_X_regular_begin, 4892 (U8*)locinput, utf8_target)) { 4893 locinput += UTF8SKIP(locinput); 4894 } 4895 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) { 4896 4897 /* Here did not match the required 'Begin' in the 4898 * second term. So just match the very first 4899 * character, the '.' of the final term of the regex */ 4900 locinput = starting + UTF8SKIP(starting); 4901 goto exit_utf8; 4902 } else { 4903 4904 /* Here is a special begin. It can be composed of 4905 * several individual characters. One possibility is 4906 * RI+ */ 4907 if ((len = is_GCB_RI_utf8(locinput))) { 4908 locinput += len; 4909 while (locinput < reginfo->strend 4910 && (len = is_GCB_RI_utf8(locinput))) 4911 { 4912 locinput += len; 4913 } 4914 } else if ((len = is_GCB_T_utf8(locinput))) { 4915 /* Another possibility is T+ */ 4916 locinput += len; 4917 while (locinput < reginfo->strend 4918 && (len = is_GCB_T_utf8(locinput))) 4919 { 4920 locinput += len; 4921 } 4922 } else { 4923 4924 /* Here, neither RI+ nor T+; must be some other 4925 * Hangul. That means it is one of the others: L, 4926 * LV, LVT or V, and matches: 4927 * L* (L | LVT T* | V * V* T* | LV V* T*) */ 4928 4929 /* Match L* */ 4930 while (locinput < reginfo->strend 4931 && (len = is_GCB_L_utf8(locinput))) 4932 { 4933 locinput += len; 4934 } 4935 4936 /* Here, have exhausted L*. If the next character 4937 * is not an LV, LVT nor V, it means we had to have 4938 * at least one L, so matches L+ in the original 4939 * equation, we have a complete hangul syllable. 4940 * Are done. */ 4941 4942 if (locinput < reginfo->strend 4943 && is_GCB_LV_LVT_V_utf8(locinput)) 4944 { 4945 /* Otherwise keep going. Must be LV, LVT or V. 4946 * See if LVT, by first ruling out V, then LV */ 4947 if (! is_GCB_V_utf8(locinput) 4948 /* All but every TCount one is LV */ 4949 && (valid_utf8_to_uvchr((U8 *) locinput, 4950 NULL) 4951 - SBASE) 4952 % TCount != 0) 4953 { 4954 locinput += UTF8SKIP(locinput); 4955 } else { 4956 4957 /* Must be V or LV. Take it, then match 4958 * V* */ 4959 locinput += UTF8SKIP(locinput); 4960 while (locinput < reginfo->strend 4961 && (len = is_GCB_V_utf8(locinput))) 4962 { 4963 locinput += len; 4964 } 4965 } 4966 4967 /* And any of LV, LVT, or V can be followed 4968 * by T* */ 4969 while (locinput < reginfo->strend 4970 && (len = is_GCB_T_utf8(locinput))) 4971 { 4972 locinput += len; 4973 } 4974 } 4975 } 4976 } 4977 4978 /* Match any extender */ 4979 while (locinput < reginfo->strend 4980 && swash_fetch(PL_utf8_X_extend, 4981 (U8*)locinput, utf8_target)) 4982 { 4983 locinput += UTF8SKIP(locinput); 4984 } 4985 } 4986 exit_utf8: 4987 if (locinput > reginfo->strend) sayNO; 4988 } 4989 break; 4990 4991 case NREFFL: /* /\g{name}/il */ 4992 { /* The capture buffer cases. The ones beginning with N for the 4993 named buffers just convert to the equivalent numbered and 4994 pretend they were called as the corresponding numbered buffer 4995 op. */ 4996 /* don't initialize these in the declaration, it makes C++ 4997 unhappy */ 4998 const char *s; 4999 char type; 5000 re_fold_t folder; 5001 const U8 *fold_array; 5002 UV utf8_fold_flags; 5003 5004 folder = foldEQ_locale; 5005 fold_array = PL_fold_locale; 5006 type = REFFL; 5007 utf8_fold_flags = FOLDEQ_LOCALE; 5008 goto do_nref; 5009 5010 case NREFFA: /* /\g{name}/iaa */ 5011 folder = foldEQ_latin1; 5012 fold_array = PL_fold_latin1; 5013 type = REFFA; 5014 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 5015 goto do_nref; 5016 5017 case NREFFU: /* /\g{name}/iu */ 5018 folder = foldEQ_latin1; 5019 fold_array = PL_fold_latin1; 5020 type = REFFU; 5021 utf8_fold_flags = 0; 5022 goto do_nref; 5023 5024 case NREFF: /* /\g{name}/i */ 5025 folder = foldEQ; 5026 fold_array = PL_fold; 5027 type = REFF; 5028 utf8_fold_flags = 0; 5029 goto do_nref; 5030 5031 case NREF: /* /\g{name}/ */ 5032 type = REF; 5033 folder = NULL; 5034 fold_array = NULL; 5035 utf8_fold_flags = 0; 5036 do_nref: 5037 5038 /* For the named back references, find the corresponding buffer 5039 * number */ 5040 n = reg_check_named_buff_matched(rex,scan); 5041 5042 if ( ! n ) { 5043 sayNO; 5044 } 5045 goto do_nref_ref_common; 5046 5047 case REFFL: /* /\1/il */ 5048 folder = foldEQ_locale; 5049 fold_array = PL_fold_locale; 5050 utf8_fold_flags = FOLDEQ_LOCALE; 5051 goto do_ref; 5052 5053 case REFFA: /* /\1/iaa */ 5054 folder = foldEQ_latin1; 5055 fold_array = PL_fold_latin1; 5056 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 5057 goto do_ref; 5058 5059 case REFFU: /* /\1/iu */ 5060 folder = foldEQ_latin1; 5061 fold_array = PL_fold_latin1; 5062 utf8_fold_flags = 0; 5063 goto do_ref; 5064 5065 case REFF: /* /\1/i */ 5066 folder = foldEQ; 5067 fold_array = PL_fold; 5068 utf8_fold_flags = 0; 5069 goto do_ref; 5070 5071 case REF: /* /\1/ */ 5072 folder = NULL; 5073 fold_array = NULL; 5074 utf8_fold_flags = 0; 5075 5076 do_ref: 5077 type = OP(scan); 5078 n = ARG(scan); /* which paren pair */ 5079 5080 do_nref_ref_common: 5081 ln = rex->offs[n].start; 5082 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 5083 if (rex->lastparen < n || ln == -1) 5084 sayNO; /* Do not match unless seen CLOSEn. */ 5085 if (ln == rex->offs[n].end) 5086 break; 5087 5088 s = reginfo->strbeg + ln; 5089 if (type != REF /* REF can do byte comparison */ 5090 && (utf8_target || type == REFFU || type == REFFL)) 5091 { 5092 char * limit = reginfo->strend; 5093 5094 /* This call case insensitively compares the entire buffer 5095 * at s, with the current input starting at locinput, but 5096 * not going off the end given by reginfo->strend, and 5097 * returns in <limit> upon success, how much of the 5098 * current input was matched */ 5099 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, 5100 locinput, &limit, 0, utf8_target, utf8_fold_flags)) 5101 { 5102 sayNO; 5103 } 5104 locinput = limit; 5105 break; 5106 } 5107 5108 /* Not utf8: Inline the first character, for speed. */ 5109 if (!NEXTCHR_IS_EOS && 5110 UCHARAT(s) != nextchr && 5111 (type == REF || 5112 UCHARAT(s) != fold_array[nextchr])) 5113 sayNO; 5114 ln = rex->offs[n].end - ln; 5115 if (locinput + ln > reginfo->strend) 5116 sayNO; 5117 if (ln > 1 && (type == REF 5118 ? memNE(s, locinput, ln) 5119 : ! folder(s, locinput, ln))) 5120 sayNO; 5121 locinput += ln; 5122 break; 5123 } 5124 5125 case NOTHING: /* null op; e.g. the 'nothing' following 5126 * the '*' in m{(a+|b)*}' */ 5127 break; 5128 case TAIL: /* placeholder while compiling (A|B|C) */ 5129 break; 5130 5131 case BACK: /* ??? doesn't appear to be used ??? */ 5132 break; 5133 5134 #undef ST 5135 #define ST st->u.eval 5136 { 5137 SV *ret; 5138 REGEXP *re_sv; 5139 regexp *re; 5140 regexp_internal *rei; 5141 regnode *startpoint; 5142 5143 case GOSTART: /* (?R) */ 5144 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ 5145 if (cur_eval && cur_eval->locinput==locinput) { 5146 if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) 5147 Perl_croak(aTHX_ "Infinite recursion in regex"); 5148 if ( ++nochange_depth > max_nochange_depth ) 5149 Perl_croak(aTHX_ 5150 "Pattern subroutine nesting without pos change" 5151 " exceeded limit in regex"); 5152 } else { 5153 nochange_depth = 0; 5154 } 5155 re_sv = rex_sv; 5156 re = rex; 5157 rei = rexi; 5158 if (OP(scan)==GOSUB) { 5159 startpoint = scan + ARG2L(scan); 5160 ST.close_paren = ARG(scan); 5161 } else { 5162 startpoint = rei->program+1; 5163 ST.close_paren = 0; 5164 } 5165 5166 /* Save all the positions seen so far. */ 5167 ST.cp = regcppush(rex, 0, maxopenparen); 5168 REGCP_SET(ST.lastcp); 5169 5170 /* and then jump to the code we share with EVAL */ 5171 goto eval_recurse_doit; 5172 5173 assert(0); /* NOTREACHED */ 5174 5175 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ 5176 if (cur_eval && cur_eval->locinput==locinput) { 5177 if ( ++nochange_depth > max_nochange_depth ) 5178 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); 5179 } else { 5180 nochange_depth = 0; 5181 } 5182 { 5183 /* execute the code in the {...} */ 5184 5185 dSP; 5186 IV before; 5187 OP * const oop = PL_op; 5188 COP * const ocurcop = PL_curcop; 5189 OP *nop; 5190 CV *newcv; 5191 5192 /* save *all* paren positions */ 5193 regcppush(rex, 0, maxopenparen); 5194 REGCP_SET(runops_cp); 5195 5196 if (!caller_cv) 5197 caller_cv = find_runcv(NULL); 5198 5199 n = ARG(scan); 5200 5201 if (rexi->data->what[n] == 'r') { /* code from an external qr */ 5202 newcv = (ReANY( 5203 (REGEXP*)(rexi->data->data[n]) 5204 ))->qr_anoncv 5205 ; 5206 nop = (OP*)rexi->data->data[n+1]; 5207 } 5208 else if (rexi->data->what[n] == 'l') { /* literal code */ 5209 newcv = caller_cv; 5210 nop = (OP*)rexi->data->data[n]; 5211 assert(CvDEPTH(newcv)); 5212 } 5213 else { 5214 /* literal with own CV */ 5215 assert(rexi->data->what[n] == 'L'); 5216 newcv = rex->qr_anoncv; 5217 nop = (OP*)rexi->data->data[n]; 5218 } 5219 5220 /* normally if we're about to execute code from the same 5221 * CV that we used previously, we just use the existing 5222 * CX stack entry. However, its possible that in the 5223 * meantime we may have backtracked, popped from the save 5224 * stack, and undone the SAVECOMPPAD(s) associated with 5225 * PUSH_MULTICALL; in which case PL_comppad no longer 5226 * points to newcv's pad. */ 5227 if (newcv != last_pushed_cv || PL_comppad != last_pad) 5228 { 5229 U8 flags = (CXp_SUB_RE | 5230 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); 5231 if (last_pushed_cv) { 5232 CHANGE_MULTICALL_FLAGS(newcv, flags); 5233 } 5234 else { 5235 PUSH_MULTICALL_FLAGS(newcv, flags); 5236 } 5237 last_pushed_cv = newcv; 5238 } 5239 else { 5240 /* these assignments are just to silence compiler 5241 * warnings */ 5242 multicall_cop = NULL; 5243 newsp = NULL; 5244 } 5245 last_pad = PL_comppad; 5246 5247 /* the initial nextstate you would normally execute 5248 * at the start of an eval (which would cause error 5249 * messages to come from the eval), may be optimised 5250 * away from the execution path in the regex code blocks; 5251 * so manually set PL_curcop to it initially */ 5252 { 5253 OP *o = cUNOPx(nop)->op_first; 5254 assert(o->op_type == OP_NULL); 5255 if (o->op_targ == OP_SCOPE) { 5256 o = cUNOPo->op_first; 5257 } 5258 else { 5259 assert(o->op_targ == OP_LEAVE); 5260 o = cUNOPo->op_first; 5261 assert(o->op_type == OP_ENTER); 5262 o = o->op_sibling; 5263 } 5264 5265 if (o->op_type != OP_STUB) { 5266 assert( o->op_type == OP_NEXTSTATE 5267 || o->op_type == OP_DBSTATE 5268 || (o->op_type == OP_NULL 5269 && ( o->op_targ == OP_NEXTSTATE 5270 || o->op_targ == OP_DBSTATE 5271 ) 5272 ) 5273 ); 5274 PL_curcop = (COP*)o; 5275 } 5276 } 5277 nop = nop->op_next; 5278 5279 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, 5280 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); 5281 5282 rex->offs[0].end = locinput - reginfo->strbeg; 5283 if (reginfo->info_aux_eval->pos_magic) 5284 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, 5285 reginfo->sv, reginfo->strbeg, 5286 locinput - reginfo->strbeg); 5287 5288 if (sv_yes_mark) { 5289 SV *sv_mrk = get_sv("REGMARK", 1); 5290 sv_setsv(sv_mrk, sv_yes_mark); 5291 } 5292 5293 /* we don't use MULTICALL here as we want to call the 5294 * first op of the block of interest, rather than the 5295 * first op of the sub */ 5296 before = (IV)(SP-PL_stack_base); 5297 PL_op = nop; 5298 CALLRUNOPS(aTHX); /* Scalar context. */ 5299 SPAGAIN; 5300 if ((IV)(SP-PL_stack_base) == before) 5301 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 5302 else { 5303 ret = POPs; 5304 PUTBACK; 5305 } 5306 5307 /* before restoring everything, evaluate the returned 5308 * value, so that 'uninit' warnings don't use the wrong 5309 * PL_op or pad. Also need to process any magic vars 5310 * (e.g. $1) *before* parentheses are restored */ 5311 5312 PL_op = NULL; 5313 5314 re_sv = NULL; 5315 if (logical == 0) /* (?{})/ */ 5316 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ 5317 else if (logical == 1) { /* /(?(?{...})X|Y)/ */ 5318 sw = cBOOL(SvTRUE(ret)); 5319 logical = 0; 5320 } 5321 else { /* /(??{}) */ 5322 /* if its overloaded, let the regex compiler handle 5323 * it; otherwise extract regex, or stringify */ 5324 if (SvGMAGICAL(ret)) 5325 ret = sv_mortalcopy(ret); 5326 if (!SvAMAGIC(ret)) { 5327 SV *sv = ret; 5328 if (SvROK(sv)) 5329 sv = SvRV(sv); 5330 if (SvTYPE(sv) == SVt_REGEXP) 5331 re_sv = (REGEXP*) sv; 5332 else if (SvSMAGICAL(ret)) { 5333 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); 5334 if (mg) 5335 re_sv = (REGEXP *) mg->mg_obj; 5336 } 5337 5338 /* force any undef warnings here */ 5339 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { 5340 ret = sv_mortalcopy(ret); 5341 (void) SvPV_force_nolen(ret); 5342 } 5343 } 5344 5345 } 5346 5347 /* *** Note that at this point we don't restore 5348 * PL_comppad, (or pop the CxSUB) on the assumption it may 5349 * be used again soon. This is safe as long as nothing 5350 * in the regexp code uses the pad ! */ 5351 PL_op = oop; 5352 PL_curcop = ocurcop; 5353 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); 5354 PL_curpm = PL_reg_curpm; 5355 5356 if (logical != 2) 5357 break; 5358 } 5359 5360 /* only /(??{})/ from now on */ 5361 logical = 0; 5362 { 5363 /* extract RE object from returned value; compiling if 5364 * necessary */ 5365 5366 if (re_sv) { 5367 re_sv = reg_temp_copy(NULL, re_sv); 5368 } 5369 else { 5370 U32 pm_flags = 0; 5371 5372 if (SvUTF8(ret) && IN_BYTES) { 5373 /* In use 'bytes': make a copy of the octet 5374 * sequence, but without the flag on */ 5375 STRLEN len; 5376 const char *const p = SvPV(ret, len); 5377 ret = newSVpvn_flags(p, len, SVs_TEMP); 5378 } 5379 if (rex->intflags & PREGf_USE_RE_EVAL) 5380 pm_flags |= PMf_USE_RE_EVAL; 5381 5382 /* if we got here, it should be an engine which 5383 * supports compiling code blocks and stuff */ 5384 assert(rex->engine && rex->engine->op_comp); 5385 assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); 5386 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, 5387 rex->engine, NULL, NULL, 5388 /* copy /msix etc to inner pattern */ 5389 scan->flags, 5390 pm_flags); 5391 5392 if (!(SvFLAGS(ret) 5393 & (SVs_TEMP | SVs_GMG | SVf_ROK)) 5394 && (!SvPADTMP(ret) || SvREADONLY(ret))) { 5395 /* This isn't a first class regexp. Instead, it's 5396 caching a regexp onto an existing, Perl visible 5397 scalar. */ 5398 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); 5399 } 5400 } 5401 SAVEFREESV(re_sv); 5402 re = ReANY(re_sv); 5403 } 5404 RXp_MATCH_COPIED_off(re); 5405 re->subbeg = rex->subbeg; 5406 re->sublen = rex->sublen; 5407 re->suboffset = rex->suboffset; 5408 re->subcoffset = rex->subcoffset; 5409 re->lastparen = 0; 5410 re->lastcloseparen = 0; 5411 rei = RXi_GET(re); 5412 DEBUG_EXECUTE_r( 5413 debug_start_match(re_sv, utf8_target, locinput, 5414 reginfo->strend, "Matching embedded"); 5415 ); 5416 startpoint = rei->program + 1; 5417 ST.close_paren = 0; /* only used for GOSUB */ 5418 /* Save all the seen positions so far. */ 5419 ST.cp = regcppush(rex, 0, maxopenparen); 5420 REGCP_SET(ST.lastcp); 5421 /* and set maxopenparen to 0, since we are starting a "fresh" match */ 5422 maxopenparen = 0; 5423 /* run the pattern returned from (??{...}) */ 5424 5425 eval_recurse_doit: /* Share code with GOSUB below this line 5426 * At this point we expect the stack context to be 5427 * set up correctly */ 5428 5429 /* invalidate the S-L poscache. We're now executing a 5430 * different set of WHILEM ops (and their associated 5431 * indexes) against the same string, so the bits in the 5432 * cache are meaningless. Setting maxiter to zero forces 5433 * the cache to be invalidated and zeroed before reuse. 5434 * XXX This is too dramatic a measure. Ideally we should 5435 * save the old cache and restore when running the outer 5436 * pattern again */ 5437 reginfo->poscache_maxiter = 0; 5438 5439 /* the new regexp might have a different is_utf8_pat than we do */ 5440 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); 5441 5442 ST.prev_rex = rex_sv; 5443 ST.prev_curlyx = cur_curlyx; 5444 rex_sv = re_sv; 5445 SET_reg_curpm(rex_sv); 5446 rex = re; 5447 rexi = rei; 5448 cur_curlyx = NULL; 5449 ST.B = next; 5450 ST.prev_eval = cur_eval; 5451 cur_eval = st; 5452 /* now continue from first node in postoned RE */ 5453 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); 5454 assert(0); /* NOTREACHED */ 5455 } 5456 5457 case EVAL_AB: /* cleanup after a successful (??{A})B */ 5458 /* note: this is called twice; first after popping B, then A */ 5459 rex_sv = ST.prev_rex; 5460 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 5461 SET_reg_curpm(rex_sv); 5462 rex = ReANY(rex_sv); 5463 rexi = RXi_GET(rex); 5464 { 5465 /* preserve $^R across LEAVE's. See Bug 121070. */ 5466 SV *save_sv= GvSV(PL_replgv); 5467 SvREFCNT_inc(save_sv); 5468 regcpblow(ST.cp); /* LEAVE in disguise */ 5469 sv_setsv(GvSV(PL_replgv), save_sv); 5470 SvREFCNT_dec(save_sv); 5471 } 5472 cur_eval = ST.prev_eval; 5473 cur_curlyx = ST.prev_curlyx; 5474 5475 /* Invalidate cache. See "invalidate" comment above. */ 5476 reginfo->poscache_maxiter = 0; 5477 if ( nochange_depth ) 5478 nochange_depth--; 5479 sayYES; 5480 5481 5482 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ 5483 /* note: this is called twice; first after popping B, then A */ 5484 rex_sv = ST.prev_rex; 5485 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 5486 SET_reg_curpm(rex_sv); 5487 rex = ReANY(rex_sv); 5488 rexi = RXi_GET(rex); 5489 5490 REGCP_UNWIND(ST.lastcp); 5491 regcppop(rex, &maxopenparen); 5492 cur_eval = ST.prev_eval; 5493 cur_curlyx = ST.prev_curlyx; 5494 /* Invalidate cache. See "invalidate" comment above. */ 5495 reginfo->poscache_maxiter = 0; 5496 if ( nochange_depth ) 5497 nochange_depth--; 5498 sayNO_SILENT; 5499 #undef ST 5500 5501 case OPEN: /* ( */ 5502 n = ARG(scan); /* which paren pair */ 5503 rex->offs[n].start_tmp = locinput - reginfo->strbeg; 5504 if (n > maxopenparen) 5505 maxopenparen = n; 5506 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, 5507 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n", 5508 PTR2UV(rex), 5509 PTR2UV(rex->offs), 5510 (UV)n, 5511 (IV)rex->offs[n].start_tmp, 5512 (UV)maxopenparen 5513 )); 5514 lastopen = n; 5515 break; 5516 5517 /* XXX really need to log other places start/end are set too */ 5518 #define CLOSE_CAPTURE \ 5519 rex->offs[n].start = rex->offs[n].start_tmp; \ 5520 rex->offs[n].end = locinput - reginfo->strbeg; \ 5521 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ 5522 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ 5523 PTR2UV(rex), \ 5524 PTR2UV(rex->offs), \ 5525 (UV)n, \ 5526 (IV)rex->offs[n].start, \ 5527 (IV)rex->offs[n].end \ 5528 )) 5529 5530 case CLOSE: /* ) */ 5531 n = ARG(scan); /* which paren pair */ 5532 CLOSE_CAPTURE; 5533 if (n > rex->lastparen) 5534 rex->lastparen = n; 5535 rex->lastcloseparen = n; 5536 if (cur_eval && cur_eval->u.eval.close_paren == n) { 5537 goto fake_end; 5538 } 5539 break; 5540 5541 case ACCEPT: /* (*ACCEPT) */ 5542 if (ARG(scan)){ 5543 regnode *cursor; 5544 for (cursor=scan; 5545 cursor && OP(cursor)!=END; 5546 cursor=regnext(cursor)) 5547 { 5548 if ( OP(cursor)==CLOSE ){ 5549 n = ARG(cursor); 5550 if ( n <= lastopen ) { 5551 CLOSE_CAPTURE; 5552 if (n > rex->lastparen) 5553 rex->lastparen = n; 5554 rex->lastcloseparen = n; 5555 if ( n == ARG(scan) || (cur_eval && 5556 cur_eval->u.eval.close_paren == n)) 5557 break; 5558 } 5559 } 5560 } 5561 } 5562 goto fake_end; 5563 /*NOTREACHED*/ 5564 5565 case GROUPP: /* (?(1)) */ 5566 n = ARG(scan); /* which paren pair */ 5567 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); 5568 break; 5569 5570 case NGROUPP: /* (?(<name>)) */ 5571 /* reg_check_named_buff_matched returns 0 for no match */ 5572 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); 5573 break; 5574 5575 case INSUBP: /* (?(R)) */ 5576 n = ARG(scan); 5577 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); 5578 break; 5579 5580 case DEFINEP: /* (?(DEFINE)) */ 5581 sw = 0; 5582 break; 5583 5584 case IFTHEN: /* (?(cond)A|B) */ 5585 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 5586 if (sw) 5587 next = NEXTOPER(NEXTOPER(scan)); 5588 else { 5589 next = scan + ARG(scan); 5590 if (OP(next) == IFTHEN) /* Fake one. */ 5591 next = NEXTOPER(NEXTOPER(next)); 5592 } 5593 break; 5594 5595 case LOGICAL: /* modifier for EVAL and IFMATCH */ 5596 logical = scan->flags; 5597 break; 5598 5599 /******************************************************************* 5600 5601 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ 5602 pattern, where A and B are subpatterns. (For simple A, CURLYM or 5603 STAR/PLUS/CURLY/CURLYN are used instead.) 5604 5605 A*B is compiled as <CURLYX><A><WHILEM><B> 5606 5607 On entry to the subpattern, CURLYX is called. This pushes a CURLYX 5608 state, which contains the current count, initialised to -1. It also sets 5609 cur_curlyx to point to this state, with any previous value saved in the 5610 state block. 5611 5612 CURLYX then jumps straight to the WHILEM op, rather than executing A, 5613 since the pattern may possibly match zero times (i.e. it's a while {} loop 5614 rather than a do {} while loop). 5615 5616 Each entry to WHILEM represents a successful match of A. The count in the 5617 CURLYX block is incremented, another WHILEM state is pushed, and execution 5618 passes to A or B depending on greediness and the current count. 5619 5620 For example, if matching against the string a1a2a3b (where the aN are 5621 substrings that match /A/), then the match progresses as follows: (the 5622 pushed states are interspersed with the bits of strings matched so far): 5623 5624 <CURLYX cnt=-1> 5625 <CURLYX cnt=0><WHILEM> 5626 <CURLYX cnt=1><WHILEM> a1 <WHILEM> 5627 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM> 5628 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> 5629 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b 5630 5631 (Contrast this with something like CURLYM, which maintains only a single 5632 backtrack state: 5633 5634 <CURLYM cnt=0> a1 5635 a1 <CURLYM cnt=1> a2 5636 a1 a2 <CURLYM cnt=2> a3 5637 a1 a2 a3 <CURLYM cnt=3> b 5638 ) 5639 5640 Each WHILEM state block marks a point to backtrack to upon partial failure 5641 of A or B, and also contains some minor state data related to that 5642 iteration. The CURLYX block, pointed to by cur_curlyx, contains the 5643 overall state, such as the count, and pointers to the A and B ops. 5644 5645 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx 5646 must always point to the *current* CURLYX block, the rules are: 5647 5648 When executing CURLYX, save the old cur_curlyx in the CURLYX state block, 5649 and set cur_curlyx to point the new block. 5650 5651 When popping the CURLYX block after a successful or unsuccessful match, 5652 restore the previous cur_curlyx. 5653 5654 When WHILEM is about to execute B, save the current cur_curlyx, and set it 5655 to the outer one saved in the CURLYX block. 5656 5657 When popping the WHILEM block after a successful or unsuccessful B match, 5658 restore the previous cur_curlyx. 5659 5660 Here's an example for the pattern (AI* BI)*BO 5661 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: 5662 5663 cur_ 5664 curlyx backtrack stack 5665 ------ --------------- 5666 NULL 5667 CO <CO prev=NULL> <WO> 5668 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 5669 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 5670 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo 5671 5672 At this point the pattern succeeds, and we work back down the stack to 5673 clean up, restoring as we go: 5674 5675 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 5676 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 5677 CO <CO prev=NULL> <WO> 5678 NULL 5679 5680 *******************************************************************/ 5681 5682 #define ST st->u.curlyx 5683 5684 case CURLYX: /* start of /A*B/ (for complex A) */ 5685 { 5686 /* No need to save/restore up to this paren */ 5687 I32 parenfloor = scan->flags; 5688 5689 assert(next); /* keep Coverity happy */ 5690 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ 5691 next += ARG(next); 5692 5693 /* XXXX Probably it is better to teach regpush to support 5694 parenfloor > maxopenparen ... */ 5695 if (parenfloor > (I32)rex->lastparen) 5696 parenfloor = rex->lastparen; /* Pessimization... */ 5697 5698 ST.prev_curlyx= cur_curlyx; 5699 cur_curlyx = st; 5700 ST.cp = PL_savestack_ix; 5701 5702 /* these fields contain the state of the current curly. 5703 * they are accessed by subsequent WHILEMs */ 5704 ST.parenfloor = parenfloor; 5705 ST.me = scan; 5706 ST.B = next; 5707 ST.minmod = minmod; 5708 minmod = 0; 5709 ST.count = -1; /* this will be updated by WHILEM */ 5710 ST.lastloc = NULL; /* this will be updated by WHILEM */ 5711 5712 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput); 5713 assert(0); /* NOTREACHED */ 5714 } 5715 5716 case CURLYX_end: /* just finished matching all of A*B */ 5717 cur_curlyx = ST.prev_curlyx; 5718 sayYES; 5719 assert(0); /* NOTREACHED */ 5720 5721 case CURLYX_end_fail: /* just failed to match all of A*B */ 5722 regcpblow(ST.cp); 5723 cur_curlyx = ST.prev_curlyx; 5724 sayNO; 5725 assert(0); /* NOTREACHED */ 5726 5727 5728 #undef ST 5729 #define ST st->u.whilem 5730 5731 case WHILEM: /* just matched an A in /A*B/ (for complex A) */ 5732 { 5733 /* see the discussion above about CURLYX/WHILEM */ 5734 I32 n; 5735 int min = ARG1(cur_curlyx->u.curlyx.me); 5736 int max = ARG2(cur_curlyx->u.curlyx.me); 5737 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; 5738 5739 assert(cur_curlyx); /* keep Coverity happy */ 5740 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ 5741 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; 5742 ST.cache_offset = 0; 5743 ST.cache_mask = 0; 5744 5745 5746 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 5747 "%*s whilem: matched %ld out of %d..%d\n", 5748 REPORT_CODE_OFF+depth*2, "", (long)n, min, max) 5749 ); 5750 5751 /* First just match a string of min A's. */ 5752 5753 if (n < min) { 5754 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 5755 maxopenparen); 5756 cur_curlyx->u.curlyx.lastloc = locinput; 5757 REGCP_SET(ST.lastcp); 5758 5759 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput); 5760 assert(0); /* NOTREACHED */ 5761 } 5762 5763 /* If degenerate A matches "", assume A done. */ 5764 5765 if (locinput == cur_curlyx->u.curlyx.lastloc) { 5766 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 5767 "%*s whilem: empty match detected, trying continuation...\n", 5768 REPORT_CODE_OFF+depth*2, "") 5769 ); 5770 goto do_whilem_B_max; 5771 } 5772 5773 /* super-linear cache processing. 5774 * 5775 * The idea here is that for certain types of CURLYX/WHILEM - 5776 * principally those whose upper bound is infinity (and 5777 * excluding regexes that have things like \1 and other very 5778 * non-regular expresssiony things), then if a pattern like 5779 * /....A*.../ fails and we backtrack to the WHILEM, then we 5780 * make a note that this particular WHILEM op was at string 5781 * position 47 (say) when the rest of pattern failed. Then, if 5782 * we ever find ourselves back at that WHILEM, and at string 5783 * position 47 again, we can just fail immediately rather than 5784 * running the rest of the pattern again. 5785 * 5786 * This is very handy when patterns start to go 5787 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up 5788 * with a combinatorial explosion of backtracking. 5789 * 5790 * The cache is implemented as a bit array, with one bit per 5791 * string byte position per WHILEM op (up to 16) - so its 5792 * between 0.25 and 2x the string size. 5793 * 5794 * To avoid allocating a poscache buffer every time, we do an 5795 * initially countdown; only after we have executed a WHILEM 5796 * op (string-length x #WHILEMs) times do we allocate the 5797 * cache. 5798 * 5799 * The top 4 bits of scan->flags byte say how many different 5800 * relevant CURLLYX/WHILEM op pairs there are, while the 5801 * bottom 4-bits is the identifying index number of this 5802 * WHILEM. 5803 */ 5804 5805 if (scan->flags) { 5806 5807 if (!reginfo->poscache_maxiter) { 5808 /* start the countdown: Postpone detection until we 5809 * know the match is not *that* much linear. */ 5810 reginfo->poscache_maxiter 5811 = (reginfo->strend - reginfo->strbeg + 1) 5812 * (scan->flags>>4); 5813 /* possible overflow for long strings and many CURLYX's */ 5814 if (reginfo->poscache_maxiter < 0) 5815 reginfo->poscache_maxiter = I32_MAX; 5816 reginfo->poscache_iter = reginfo->poscache_maxiter; 5817 } 5818 5819 if (reginfo->poscache_iter-- == 0) { 5820 /* initialise cache */ 5821 const SSize_t size = (reginfo->poscache_maxiter + 7)/8; 5822 regmatch_info_aux *const aux = reginfo->info_aux; 5823 if (aux->poscache) { 5824 if ((SSize_t)reginfo->poscache_size < size) { 5825 Renew(aux->poscache, size, char); 5826 reginfo->poscache_size = size; 5827 } 5828 Zero(aux->poscache, size, char); 5829 } 5830 else { 5831 reginfo->poscache_size = size; 5832 Newxz(aux->poscache, size, char); 5833 } 5834 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 5835 "%swhilem: Detected a super-linear match, switching on caching%s...\n", 5836 PL_colors[4], PL_colors[5]) 5837 ); 5838 } 5839 5840 if (reginfo->poscache_iter < 0) { 5841 /* have we already failed at this position? */ 5842 SSize_t offset, mask; 5843 5844 reginfo->poscache_iter = -1; /* stop eventual underflow */ 5845 offset = (scan->flags & 0xf) - 1 5846 + (locinput - reginfo->strbeg) 5847 * (scan->flags>>4); 5848 mask = 1 << (offset % 8); 5849 offset /= 8; 5850 if (reginfo->info_aux->poscache[offset] & mask) { 5851 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 5852 "%*s whilem: (cache) already tried at this position...\n", 5853 REPORT_CODE_OFF+depth*2, "") 5854 ); 5855 sayNO; /* cache records failure */ 5856 } 5857 ST.cache_offset = offset; 5858 ST.cache_mask = mask; 5859 } 5860 } 5861 5862 /* Prefer B over A for minimal matching. */ 5863 5864 if (cur_curlyx->u.curlyx.minmod) { 5865 ST.save_curlyx = cur_curlyx; 5866 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 5867 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor, 5868 maxopenparen); 5869 REGCP_SET(ST.lastcp); 5870 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, 5871 locinput); 5872 assert(0); /* NOTREACHED */ 5873 } 5874 5875 /* Prefer A over B for maximal matching. */ 5876 5877 if (n < max) { /* More greed allowed? */ 5878 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 5879 maxopenparen); 5880 cur_curlyx->u.curlyx.lastloc = locinput; 5881 REGCP_SET(ST.lastcp); 5882 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput); 5883 assert(0); /* NOTREACHED */ 5884 } 5885 goto do_whilem_B_max; 5886 } 5887 assert(0); /* NOTREACHED */ 5888 5889 case WHILEM_B_min: /* just matched B in a minimal match */ 5890 case WHILEM_B_max: /* just matched B in a maximal match */ 5891 cur_curlyx = ST.save_curlyx; 5892 sayYES; 5893 assert(0); /* NOTREACHED */ 5894 5895 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ 5896 cur_curlyx = ST.save_curlyx; 5897 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 5898 cur_curlyx->u.curlyx.count--; 5899 CACHEsayNO; 5900 assert(0); /* NOTREACHED */ 5901 5902 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ 5903 /* FALL THROUGH */ 5904 case WHILEM_A_pre_fail: /* just failed to match even minimal A */ 5905 REGCP_UNWIND(ST.lastcp); 5906 regcppop(rex, &maxopenparen); 5907 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 5908 cur_curlyx->u.curlyx.count--; 5909 CACHEsayNO; 5910 assert(0); /* NOTREACHED */ 5911 5912 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ 5913 REGCP_UNWIND(ST.lastcp); 5914 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 5915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 5916 "%*s whilem: failed, trying continuation...\n", 5917 REPORT_CODE_OFF+depth*2, "") 5918 ); 5919 do_whilem_B_max: 5920 if (cur_curlyx->u.curlyx.count >= REG_INFTY 5921 && ckWARN(WARN_REGEXP) 5922 && !reginfo->warned) 5923 { 5924 reginfo->warned = TRUE; 5925 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 5926 "Complex regular subexpression recursion limit (%d) " 5927 "exceeded", 5928 REG_INFTY - 1); 5929 } 5930 5931 /* now try B */ 5932 ST.save_curlyx = cur_curlyx; 5933 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 5934 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, 5935 locinput); 5936 assert(0); /* NOTREACHED */ 5937 5938 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ 5939 cur_curlyx = ST.save_curlyx; 5940 REGCP_UNWIND(ST.lastcp); 5941 regcppop(rex, &maxopenparen); 5942 5943 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { 5944 /* Maximum greed exceeded */ 5945 if (cur_curlyx->u.curlyx.count >= REG_INFTY 5946 && ckWARN(WARN_REGEXP) 5947 && !reginfo->warned) 5948 { 5949 reginfo->warned = TRUE; 5950 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 5951 "Complex regular subexpression recursion " 5952 "limit (%d) exceeded", 5953 REG_INFTY - 1); 5954 } 5955 cur_curlyx->u.curlyx.count--; 5956 CACHEsayNO; 5957 } 5958 5959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 5960 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") 5961 ); 5962 /* Try grabbing another A and see if it helps. */ 5963 cur_curlyx->u.curlyx.lastloc = locinput; 5964 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 5965 maxopenparen); 5966 REGCP_SET(ST.lastcp); 5967 PUSH_STATE_GOTO(WHILEM_A_min, 5968 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, 5969 locinput); 5970 assert(0); /* NOTREACHED */ 5971 5972 #undef ST 5973 #define ST st->u.branch 5974 5975 case BRANCHJ: /* /(...|A|...)/ with long next pointer */ 5976 next = scan + ARG(scan); 5977 if (next == scan) 5978 next = NULL; 5979 scan = NEXTOPER(scan); 5980 /* FALL THROUGH */ 5981 5982 case BRANCH: /* /(...|A|...)/ */ 5983 scan = NEXTOPER(scan); /* scan now points to inner node */ 5984 ST.lastparen = rex->lastparen; 5985 ST.lastcloseparen = rex->lastcloseparen; 5986 ST.next_branch = next; 5987 REGCP_SET(ST.cp); 5988 5989 /* Now go into the branch */ 5990 if (has_cutgroup) { 5991 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput); 5992 } else { 5993 PUSH_STATE_GOTO(BRANCH_next, scan, locinput); 5994 } 5995 assert(0); /* NOTREACHED */ 5996 5997 case CUTGROUP: /* /(*THEN)/ */ 5998 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : 5999 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 6000 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); 6001 assert(0); /* NOTREACHED */ 6002 6003 case CUTGROUP_next_fail: 6004 do_cutgroup = 1; 6005 no_final = 1; 6006 if (st->u.mark.mark_name) 6007 sv_commit = st->u.mark.mark_name; 6008 sayNO; 6009 assert(0); /* NOTREACHED */ 6010 6011 case BRANCH_next: 6012 sayYES; 6013 assert(0); /* NOTREACHED */ 6014 6015 case BRANCH_next_fail: /* that branch failed; try the next, if any */ 6016 if (do_cutgroup) { 6017 do_cutgroup = 0; 6018 no_final = 0; 6019 } 6020 REGCP_UNWIND(ST.cp); 6021 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6022 scan = ST.next_branch; 6023 /* no more branches? */ 6024 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { 6025 DEBUG_EXECUTE_r({ 6026 PerlIO_printf( Perl_debug_log, 6027 "%*s %sBRANCH failed...%s\n", 6028 REPORT_CODE_OFF+depth*2, "", 6029 PL_colors[4], 6030 PL_colors[5] ); 6031 }); 6032 sayNO_SILENT; 6033 } 6034 continue; /* execute next BRANCH[J] op */ 6035 assert(0); /* NOTREACHED */ 6036 6037 case MINMOD: /* next op will be non-greedy, e.g. A*? */ 6038 minmod = 1; 6039 break; 6040 6041 #undef ST 6042 #define ST st->u.curlym 6043 6044 case CURLYM: /* /A{m,n}B/ where A is fixed-length */ 6045 6046 /* This is an optimisation of CURLYX that enables us to push 6047 * only a single backtracking state, no matter how many matches 6048 * there are in {m,n}. It relies on the pattern being constant 6049 * length, with no parens to influence future backrefs 6050 */ 6051 6052 ST.me = scan; 6053 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 6054 6055 ST.lastparen = rex->lastparen; 6056 ST.lastcloseparen = rex->lastcloseparen; 6057 6058 /* if paren positive, emulate an OPEN/CLOSE around A */ 6059 if (ST.me->flags) { 6060 U32 paren = ST.me->flags; 6061 if (paren > maxopenparen) 6062 maxopenparen = paren; 6063 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 6064 } 6065 ST.A = scan; 6066 ST.B = next; 6067 ST.alen = 0; 6068 ST.count = 0; 6069 ST.minmod = minmod; 6070 minmod = 0; 6071 ST.c1 = CHRTEST_UNINIT; 6072 REGCP_SET(ST.cp); 6073 6074 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ 6075 goto curlym_do_B; 6076 6077 curlym_do_A: /* execute the A in /A{m,n}B/ */ 6078 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */ 6079 assert(0); /* NOTREACHED */ 6080 6081 case CURLYM_A: /* we've just matched an A */ 6082 ST.count++; 6083 /* after first match, determine A's length: u.curlym.alen */ 6084 if (ST.count == 1) { 6085 if (reginfo->is_utf8_target) { 6086 char *s = st->locinput; 6087 while (s < locinput) { 6088 ST.alen++; 6089 s += UTF8SKIP(s); 6090 } 6091 } 6092 else { 6093 ST.alen = locinput - st->locinput; 6094 } 6095 if (ST.alen == 0) 6096 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); 6097 } 6098 DEBUG_EXECUTE_r( 6099 PerlIO_printf(Perl_debug_log, 6100 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", 6101 (int)(REPORT_CODE_OFF+(depth*2)), "", 6102 (IV) ST.count, (IV)ST.alen) 6103 ); 6104 6105 if (cur_eval && cur_eval->u.eval.close_paren && 6106 cur_eval->u.eval.close_paren == (U32)ST.me->flags) 6107 goto fake_end; 6108 6109 { 6110 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); 6111 if ( max == REG_INFTY || ST.count < max ) 6112 goto curlym_do_A; /* try to match another A */ 6113 } 6114 goto curlym_do_B; /* try to match B */ 6115 6116 case CURLYM_A_fail: /* just failed to match an A */ 6117 REGCP_UNWIND(ST.cp); 6118 6119 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ 6120 || (cur_eval && cur_eval->u.eval.close_paren && 6121 cur_eval->u.eval.close_paren == (U32)ST.me->flags)) 6122 sayNO; 6123 6124 curlym_do_B: /* execute the B in /A{m,n}B/ */ 6125 if (ST.c1 == CHRTEST_UNINIT) { 6126 /* calculate c1 and c2 for possible match of 1st char 6127 * following curly */ 6128 ST.c1 = ST.c2 = CHRTEST_VOID; 6129 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { 6130 regnode *text_node = ST.B; 6131 if (! HAS_TEXT(text_node)) 6132 FIND_NEXT_IMPT(text_node); 6133 /* this used to be 6134 6135 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) 6136 6137 But the former is redundant in light of the latter. 6138 6139 if this changes back then the macro for 6140 IS_TEXT and friends need to change. 6141 */ 6142 if (PL_regkind[OP(text_node)] == EXACT) { 6143 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ 6144 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, 6145 reginfo)) 6146 { 6147 sayNO; 6148 } 6149 } 6150 } 6151 } 6152 6153 DEBUG_EXECUTE_r( 6154 PerlIO_printf(Perl_debug_log, 6155 "%*s CURLYM trying tail with matches=%"IVdf"...\n", 6156 (int)(REPORT_CODE_OFF+(depth*2)), 6157 "", (IV)ST.count) 6158 ); 6159 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { 6160 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { 6161 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) 6162 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) 6163 { 6164 /* simulate B failing */ 6165 DEBUG_OPTIMISE_r( 6166 PerlIO_printf(Perl_debug_log, 6167 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", 6168 (int)(REPORT_CODE_OFF+(depth*2)),"", 6169 valid_utf8_to_uvchr((U8 *) locinput, NULL), 6170 valid_utf8_to_uvchr(ST.c1_utf8, NULL), 6171 valid_utf8_to_uvchr(ST.c2_utf8, NULL)) 6172 ); 6173 state_num = CURLYM_B_fail; 6174 goto reenter_switch; 6175 } 6176 } 6177 else if (nextchr != ST.c1 && nextchr != ST.c2) { 6178 /* simulate B failing */ 6179 DEBUG_OPTIMISE_r( 6180 PerlIO_printf(Perl_debug_log, 6181 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", 6182 (int)(REPORT_CODE_OFF+(depth*2)),"", 6183 (int) nextchr, ST.c1, ST.c2) 6184 ); 6185 state_num = CURLYM_B_fail; 6186 goto reenter_switch; 6187 } 6188 } 6189 6190 if (ST.me->flags) { 6191 /* emulate CLOSE: mark current A as captured */ 6192 I32 paren = ST.me->flags; 6193 if (ST.count) { 6194 rex->offs[paren].start 6195 = HOPc(locinput, -ST.alen) - reginfo->strbeg; 6196 rex->offs[paren].end = locinput - reginfo->strbeg; 6197 if ((U32)paren > rex->lastparen) 6198 rex->lastparen = paren; 6199 rex->lastcloseparen = paren; 6200 } 6201 else 6202 rex->offs[paren].end = -1; 6203 if (cur_eval && cur_eval->u.eval.close_paren && 6204 cur_eval->u.eval.close_paren == (U32)ST.me->flags) 6205 { 6206 if (ST.count) 6207 goto fake_end; 6208 else 6209 sayNO; 6210 } 6211 } 6212 6213 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */ 6214 assert(0); /* NOTREACHED */ 6215 6216 case CURLYM_B_fail: /* just failed to match a B */ 6217 REGCP_UNWIND(ST.cp); 6218 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6219 if (ST.minmod) { 6220 I32 max = ARG2(ST.me); 6221 if (max != REG_INFTY && ST.count == max) 6222 sayNO; 6223 goto curlym_do_A; /* try to match a further A */ 6224 } 6225 /* backtrack one A */ 6226 if (ST.count == ARG1(ST.me) /* min */) 6227 sayNO; 6228 ST.count--; 6229 SET_locinput(HOPc(locinput, -ST.alen)); 6230 goto curlym_do_B; /* try to match B */ 6231 6232 #undef ST 6233 #define ST st->u.curly 6234 6235 #define CURLY_SETPAREN(paren, success) \ 6236 if (paren) { \ 6237 if (success) { \ 6238 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \ 6239 rex->offs[paren].end = locinput - reginfo->strbeg; \ 6240 if (paren > rex->lastparen) \ 6241 rex->lastparen = paren; \ 6242 rex->lastcloseparen = paren; \ 6243 } \ 6244 else { \ 6245 rex->offs[paren].end = -1; \ 6246 rex->lastparen = ST.lastparen; \ 6247 rex->lastcloseparen = ST.lastcloseparen; \ 6248 } \ 6249 } 6250 6251 case STAR: /* /A*B/ where A is width 1 char */ 6252 ST.paren = 0; 6253 ST.min = 0; 6254 ST.max = REG_INFTY; 6255 scan = NEXTOPER(scan); 6256 goto repeat; 6257 6258 case PLUS: /* /A+B/ where A is width 1 char */ 6259 ST.paren = 0; 6260 ST.min = 1; 6261 ST.max = REG_INFTY; 6262 scan = NEXTOPER(scan); 6263 goto repeat; 6264 6265 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ 6266 ST.paren = scan->flags; /* Which paren to set */ 6267 ST.lastparen = rex->lastparen; 6268 ST.lastcloseparen = rex->lastcloseparen; 6269 if (ST.paren > maxopenparen) 6270 maxopenparen = ST.paren; 6271 ST.min = ARG1(scan); /* min to match */ 6272 ST.max = ARG2(scan); /* max to match */ 6273 if (cur_eval && cur_eval->u.eval.close_paren && 6274 cur_eval->u.eval.close_paren == (U32)ST.paren) { 6275 ST.min=1; 6276 ST.max=1; 6277 } 6278 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); 6279 goto repeat; 6280 6281 case CURLY: /* /A{m,n}B/ where A is width 1 char */ 6282 ST.paren = 0; 6283 ST.min = ARG1(scan); /* min to match */ 6284 ST.max = ARG2(scan); /* max to match */ 6285 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 6286 repeat: 6287 /* 6288 * Lookahead to avoid useless match attempts 6289 * when we know what character comes next. 6290 * 6291 * Used to only do .*x and .*?x, but now it allows 6292 * for )'s, ('s and (?{ ... })'s to be in the way 6293 * of the quantifier and the EXACT-like node. -- japhy 6294 */ 6295 6296 assert(ST.min <= ST.max); 6297 if (! HAS_TEXT(next) && ! JUMPABLE(next)) { 6298 ST.c1 = ST.c2 = CHRTEST_VOID; 6299 } 6300 else { 6301 regnode *text_node = next; 6302 6303 if (! HAS_TEXT(text_node)) 6304 FIND_NEXT_IMPT(text_node); 6305 6306 if (! HAS_TEXT(text_node)) 6307 ST.c1 = ST.c2 = CHRTEST_VOID; 6308 else { 6309 if ( PL_regkind[OP(text_node)] != EXACT ) { 6310 ST.c1 = ST.c2 = CHRTEST_VOID; 6311 } 6312 else { 6313 6314 /* Currently we only get here when 6315 6316 PL_rekind[OP(text_node)] == EXACT 6317 6318 if this changes back then the macro for IS_TEXT and 6319 friends need to change. */ 6320 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ 6321 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, 6322 reginfo)) 6323 { 6324 sayNO; 6325 } 6326 } 6327 } 6328 } 6329 6330 ST.A = scan; 6331 ST.B = next; 6332 if (minmod) { 6333 char *li = locinput; 6334 minmod = 0; 6335 if (ST.min && 6336 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth) 6337 < ST.min) 6338 sayNO; 6339 SET_locinput(li); 6340 ST.count = ST.min; 6341 REGCP_SET(ST.cp); 6342 if (ST.c1 == CHRTEST_VOID) 6343 goto curly_try_B_min; 6344 6345 ST.oldloc = locinput; 6346 6347 /* set ST.maxpos to the furthest point along the 6348 * string that could possibly match */ 6349 if (ST.max == REG_INFTY) { 6350 ST.maxpos = reginfo->strend - 1; 6351 if (utf8_target) 6352 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) 6353 ST.maxpos--; 6354 } 6355 else if (utf8_target) { 6356 int m = ST.max - ST.min; 6357 for (ST.maxpos = locinput; 6358 m >0 && ST.maxpos < reginfo->strend; m--) 6359 ST.maxpos += UTF8SKIP(ST.maxpos); 6360 } 6361 else { 6362 ST.maxpos = locinput + ST.max - ST.min; 6363 if (ST.maxpos >= reginfo->strend) 6364 ST.maxpos = reginfo->strend - 1; 6365 } 6366 goto curly_try_B_min_known; 6367 6368 } 6369 else { 6370 /* avoid taking address of locinput, so it can remain 6371 * a register var */ 6372 char *li = locinput; 6373 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth); 6374 if (ST.count < ST.min) 6375 sayNO; 6376 SET_locinput(li); 6377 if ((ST.count > ST.min) 6378 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) 6379 { 6380 /* A{m,n} must come at the end of the string, there's 6381 * no point in backing off ... */ 6382 ST.min = ST.count; 6383 /* ...except that $ and \Z can match before *and* after 6384 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 6385 We may back off by one in this case. */ 6386 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) 6387 ST.min--; 6388 } 6389 REGCP_SET(ST.cp); 6390 goto curly_try_B_max; 6391 } 6392 assert(0); /* NOTREACHED */ 6393 6394 6395 case CURLY_B_min_known_fail: 6396 /* failed to find B in a non-greedy match where c1,c2 valid */ 6397 6398 REGCP_UNWIND(ST.cp); 6399 if (ST.paren) { 6400 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6401 } 6402 /* Couldn't or didn't -- move forward. */ 6403 ST.oldloc = locinput; 6404 if (utf8_target) 6405 locinput += UTF8SKIP(locinput); 6406 else 6407 locinput++; 6408 ST.count++; 6409 curly_try_B_min_known: 6410 /* find the next place where 'B' could work, then call B */ 6411 { 6412 int n; 6413 if (utf8_target) { 6414 n = (ST.oldloc == locinput) ? 0 : 1; 6415 if (ST.c1 == ST.c2) { 6416 /* set n to utf8_distance(oldloc, locinput) */ 6417 while (locinput <= ST.maxpos 6418 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) 6419 { 6420 locinput += UTF8SKIP(locinput); 6421 n++; 6422 } 6423 } 6424 else { 6425 /* set n to utf8_distance(oldloc, locinput) */ 6426 while (locinput <= ST.maxpos 6427 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) 6428 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) 6429 { 6430 locinput += UTF8SKIP(locinput); 6431 n++; 6432 } 6433 } 6434 } 6435 else { /* Not utf8_target */ 6436 if (ST.c1 == ST.c2) { 6437 while (locinput <= ST.maxpos && 6438 UCHARAT(locinput) != ST.c1) 6439 locinput++; 6440 } 6441 else { 6442 while (locinput <= ST.maxpos 6443 && UCHARAT(locinput) != ST.c1 6444 && UCHARAT(locinput) != ST.c2) 6445 locinput++; 6446 } 6447 n = locinput - ST.oldloc; 6448 } 6449 if (locinput > ST.maxpos) 6450 sayNO; 6451 if (n) { 6452 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is 6453 * at b; check that everything between oldloc and 6454 * locinput matches */ 6455 char *li = ST.oldloc; 6456 ST.count += n; 6457 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n) 6458 sayNO; 6459 assert(n == REG_INFTY || locinput == li); 6460 } 6461 CURLY_SETPAREN(ST.paren, ST.count); 6462 if (cur_eval && cur_eval->u.eval.close_paren && 6463 cur_eval->u.eval.close_paren == (U32)ST.paren) { 6464 goto fake_end; 6465 } 6466 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); 6467 } 6468 assert(0); /* NOTREACHED */ 6469 6470 6471 case CURLY_B_min_fail: 6472 /* failed to find B in a non-greedy match where c1,c2 invalid */ 6473 6474 REGCP_UNWIND(ST.cp); 6475 if (ST.paren) { 6476 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6477 } 6478 /* failed -- move forward one */ 6479 { 6480 char *li = locinput; 6481 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) { 6482 sayNO; 6483 } 6484 locinput = li; 6485 } 6486 { 6487 ST.count++; 6488 if (ST.count <= ST.max || (ST.max == REG_INFTY && 6489 ST.count > 0)) /* count overflow ? */ 6490 { 6491 curly_try_B_min: 6492 CURLY_SETPAREN(ST.paren, ST.count); 6493 if (cur_eval && cur_eval->u.eval.close_paren && 6494 cur_eval->u.eval.close_paren == (U32)ST.paren) { 6495 goto fake_end; 6496 } 6497 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); 6498 } 6499 } 6500 sayNO; 6501 assert(0); /* NOTREACHED */ 6502 6503 6504 curly_try_B_max: 6505 /* a successful greedy match: now try to match B */ 6506 if (cur_eval && cur_eval->u.eval.close_paren && 6507 cur_eval->u.eval.close_paren == (U32)ST.paren) { 6508 goto fake_end; 6509 } 6510 { 6511 bool could_match = locinput < reginfo->strend; 6512 6513 /* If it could work, try it. */ 6514 if (ST.c1 != CHRTEST_VOID && could_match) { 6515 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) 6516 { 6517 could_match = memEQ(locinput, 6518 ST.c1_utf8, 6519 UTF8SKIP(locinput)) 6520 || memEQ(locinput, 6521 ST.c2_utf8, 6522 UTF8SKIP(locinput)); 6523 } 6524 else { 6525 could_match = UCHARAT(locinput) == ST.c1 6526 || UCHARAT(locinput) == ST.c2; 6527 } 6528 } 6529 if (ST.c1 == CHRTEST_VOID || could_match) { 6530 CURLY_SETPAREN(ST.paren, ST.count); 6531 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); 6532 assert(0); /* NOTREACHED */ 6533 } 6534 } 6535 /* FALL THROUGH */ 6536 6537 case CURLY_B_max_fail: 6538 /* failed to find B in a greedy match */ 6539 6540 REGCP_UNWIND(ST.cp); 6541 if (ST.paren) { 6542 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6543 } 6544 /* back up. */ 6545 if (--ST.count < ST.min) 6546 sayNO; 6547 locinput = HOPc(locinput, -1); 6548 goto curly_try_B_max; 6549 6550 #undef ST 6551 6552 case END: /* last op of main pattern */ 6553 fake_end: 6554 if (cur_eval) { 6555 /* we've just finished A in /(??{A})B/; now continue with B */ 6556 6557 st->u.eval.prev_rex = rex_sv; /* inner */ 6558 6559 /* Save *all* the positions. */ 6560 st->u.eval.cp = regcppush(rex, 0, maxopenparen); 6561 rex_sv = cur_eval->u.eval.prev_rex; 6562 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 6563 SET_reg_curpm(rex_sv); 6564 rex = ReANY(rex_sv); 6565 rexi = RXi_GET(rex); 6566 cur_curlyx = cur_eval->u.eval.prev_curlyx; 6567 6568 REGCP_SET(st->u.eval.lastcp); 6569 6570 /* Restore parens of the outer rex without popping the 6571 * savestack */ 6572 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp, 6573 &maxopenparen); 6574 6575 st->u.eval.prev_eval = cur_eval; 6576 cur_eval = cur_eval->u.eval.prev_eval; 6577 DEBUG_EXECUTE_r( 6578 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", 6579 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); 6580 if ( nochange_depth ) 6581 nochange_depth--; 6582 6583 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B, 6584 locinput); /* match B */ 6585 } 6586 6587 if (locinput < reginfo->till) { 6588 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 6589 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 6590 PL_colors[4], 6591 (long)(locinput - startpos), 6592 (long)(reginfo->till - startpos), 6593 PL_colors[5])); 6594 6595 sayNO_SILENT; /* Cannot match: too short. */ 6596 } 6597 sayYES; /* Success! */ 6598 6599 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ 6600 DEBUG_EXECUTE_r( 6601 PerlIO_printf(Perl_debug_log, 6602 "%*s %ssubpattern success...%s\n", 6603 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); 6604 sayYES; /* Success! */ 6605 6606 #undef ST 6607 #define ST st->u.ifmatch 6608 6609 { 6610 char *newstart; 6611 6612 case SUSPEND: /* (?>A) */ 6613 ST.wanted = 1; 6614 newstart = locinput; 6615 goto do_ifmatch; 6616 6617 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */ 6618 ST.wanted = 0; 6619 goto ifmatch_trivial_fail_test; 6620 6621 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */ 6622 ST.wanted = 1; 6623 ifmatch_trivial_fail_test: 6624 if (scan->flags) { 6625 char * const s = HOPBACKc(locinput, scan->flags); 6626 if (!s) { 6627 /* trivial fail */ 6628 if (logical) { 6629 logical = 0; 6630 sw = 1 - cBOOL(ST.wanted); 6631 } 6632 else if (ST.wanted) 6633 sayNO; 6634 next = scan + ARG(scan); 6635 if (next == scan) 6636 next = NULL; 6637 break; 6638 } 6639 newstart = s; 6640 } 6641 else 6642 newstart = locinput; 6643 6644 do_ifmatch: 6645 ST.me = scan; 6646 ST.logical = logical; 6647 logical = 0; /* XXX: reset state of logical once it has been saved into ST */ 6648 6649 /* execute body of (?...A) */ 6650 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart); 6651 assert(0); /* NOTREACHED */ 6652 } 6653 6654 case IFMATCH_A_fail: /* body of (?...A) failed */ 6655 ST.wanted = !ST.wanted; 6656 /* FALL THROUGH */ 6657 6658 case IFMATCH_A: /* body of (?...A) succeeded */ 6659 if (ST.logical) { 6660 sw = cBOOL(ST.wanted); 6661 } 6662 else if (!ST.wanted) 6663 sayNO; 6664 6665 if (OP(ST.me) != SUSPEND) { 6666 /* restore old position except for (?>...) */ 6667 locinput = st->locinput; 6668 } 6669 scan = ST.me + ARG(ST.me); 6670 if (scan == ST.me) 6671 scan = NULL; 6672 continue; /* execute B */ 6673 6674 #undef ST 6675 6676 case LONGJMP: /* alternative with many branches compiles to 6677 * (BRANCHJ; EXACT ...; LONGJMP ) x N */ 6678 next = scan + ARG(scan); 6679 if (next == scan) 6680 next = NULL; 6681 break; 6682 6683 case COMMIT: /* (*COMMIT) */ 6684 reginfo->cutpoint = reginfo->strend; 6685 /* FALLTHROUGH */ 6686 6687 case PRUNE: /* (*PRUNE) */ 6688 if (!scan->flags) 6689 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 6690 PUSH_STATE_GOTO(COMMIT_next, next, locinput); 6691 assert(0); /* NOTREACHED */ 6692 6693 case COMMIT_next_fail: 6694 no_final = 1; 6695 /* FALLTHROUGH */ 6696 6697 case OPFAIL: /* (*FAIL) */ 6698 sayNO; 6699 assert(0); /* NOTREACHED */ 6700 6701 #define ST st->u.mark 6702 case MARKPOINT: /* (*MARK:foo) */ 6703 ST.prev_mark = mark_state; 6704 ST.mark_name = sv_commit = sv_yes_mark 6705 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 6706 mark_state = st; 6707 ST.mark_loc = locinput; 6708 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput); 6709 assert(0); /* NOTREACHED */ 6710 6711 case MARKPOINT_next: 6712 mark_state = ST.prev_mark; 6713 sayYES; 6714 assert(0); /* NOTREACHED */ 6715 6716 case MARKPOINT_next_fail: 6717 if (popmark && sv_eq(ST.mark_name,popmark)) 6718 { 6719 if (ST.mark_loc > startpoint) 6720 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 6721 popmark = NULL; /* we found our mark */ 6722 sv_commit = ST.mark_name; 6723 6724 DEBUG_EXECUTE_r({ 6725 PerlIO_printf(Perl_debug_log, 6726 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", 6727 REPORT_CODE_OFF+depth*2, "", 6728 PL_colors[4], SVfARG(sv_commit), PL_colors[5]); 6729 }); 6730 } 6731 mark_state = ST.prev_mark; 6732 sv_yes_mark = mark_state ? 6733 mark_state->u.mark.mark_name : NULL; 6734 sayNO; 6735 assert(0); /* NOTREACHED */ 6736 6737 case SKIP: /* (*SKIP) */ 6738 if (scan->flags) { 6739 /* (*SKIP) : if we fail we cut here*/ 6740 ST.mark_name = NULL; 6741 ST.mark_loc = locinput; 6742 PUSH_STATE_GOTO(SKIP_next,next, locinput); 6743 } else { 6744 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 6745 otherwise do nothing. Meaning we need to scan 6746 */ 6747 regmatch_state *cur = mark_state; 6748 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 6749 6750 while (cur) { 6751 if ( sv_eq( cur->u.mark.mark_name, 6752 find ) ) 6753 { 6754 ST.mark_name = find; 6755 PUSH_STATE_GOTO( SKIP_next, next, locinput); 6756 } 6757 cur = cur->u.mark.prev_mark; 6758 } 6759 } 6760 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ 6761 break; 6762 6763 case SKIP_next_fail: 6764 if (ST.mark_name) { 6765 /* (*CUT:NAME) - Set up to search for the name as we 6766 collapse the stack*/ 6767 popmark = ST.mark_name; 6768 } else { 6769 /* (*CUT) - No name, we cut here.*/ 6770 if (ST.mark_loc > startpoint) 6771 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 6772 /* but we set sv_commit to latest mark_name if there 6773 is one so they can test to see how things lead to this 6774 cut */ 6775 if (mark_state) 6776 sv_commit=mark_state->u.mark.mark_name; 6777 } 6778 no_final = 1; 6779 sayNO; 6780 assert(0); /* NOTREACHED */ 6781 #undef ST 6782 6783 case LNBREAK: /* \R */ 6784 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) { 6785 locinput += n; 6786 } else 6787 sayNO; 6788 break; 6789 6790 default: 6791 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", 6792 PTR2UV(scan), OP(scan)); 6793 Perl_croak(aTHX_ "regexp memory corruption"); 6794 6795 /* this is a point to jump to in order to increment 6796 * locinput by one character */ 6797 increment_locinput: 6798 assert(!NEXTCHR_IS_EOS); 6799 if (utf8_target) { 6800 locinput += PL_utf8skip[nextchr]; 6801 /* locinput is allowed to go 1 char off the end, but not 2+ */ 6802 if (locinput > reginfo->strend) 6803 sayNO; 6804 } 6805 else 6806 locinput++; 6807 break; 6808 6809 } /* end switch */ 6810 6811 /* switch break jumps here */ 6812 scan = next; /* prepare to execute the next op and ... */ 6813 continue; /* ... jump back to the top, reusing st */ 6814 assert(0); /* NOTREACHED */ 6815 6816 push_yes_state: 6817 /* push a state that backtracks on success */ 6818 st->u.yes.prev_yes_state = yes_state; 6819 yes_state = st; 6820 /* FALL THROUGH */ 6821 push_state: 6822 /* push a new regex state, then continue at scan */ 6823 { 6824 regmatch_state *newst; 6825 6826 DEBUG_STACK_r({ 6827 regmatch_state *cur = st; 6828 regmatch_state *curyes = yes_state; 6829 int curd = depth; 6830 regmatch_slab *slab = PL_regmatch_slab; 6831 for (;curd > -1;cur--,curd--) { 6832 if (cur < SLAB_FIRST(slab)) { 6833 slab = slab->prev; 6834 cur = SLAB_LAST(slab); 6835 } 6836 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", 6837 REPORT_CODE_OFF + 2 + depth * 2,"", 6838 curd, PL_reg_name[cur->resume_state], 6839 (curyes == cur) ? "yes" : "" 6840 ); 6841 if (curyes == cur) 6842 curyes = cur->u.yes.prev_yes_state; 6843 } 6844 } else 6845 DEBUG_STATE_pp("push") 6846 ); 6847 depth++; 6848 st->locinput = locinput; 6849 newst = st+1; 6850 if (newst > SLAB_LAST(PL_regmatch_slab)) 6851 newst = S_push_slab(aTHX); 6852 PL_regmatch_state = newst; 6853 6854 locinput = pushinput; 6855 st = newst; 6856 continue; 6857 assert(0); /* NOTREACHED */ 6858 } 6859 } 6860 6861 /* 6862 * We get here only if there's trouble -- normally "case END" is 6863 * the terminating point. 6864 */ 6865 Perl_croak(aTHX_ "corrupted regexp pointers"); 6866 /*NOTREACHED*/ 6867 sayNO; 6868 6869 yes: 6870 if (yes_state) { 6871 /* we have successfully completed a subexpression, but we must now 6872 * pop to the state marked by yes_state and continue from there */ 6873 assert(st != yes_state); 6874 #ifdef DEBUGGING 6875 while (st != yes_state) { 6876 st--; 6877 if (st < SLAB_FIRST(PL_regmatch_slab)) { 6878 PL_regmatch_slab = PL_regmatch_slab->prev; 6879 st = SLAB_LAST(PL_regmatch_slab); 6880 } 6881 DEBUG_STATE_r({ 6882 if (no_final) { 6883 DEBUG_STATE_pp("pop (no final)"); 6884 } else { 6885 DEBUG_STATE_pp("pop (yes)"); 6886 } 6887 }); 6888 depth--; 6889 } 6890 #else 6891 while (yes_state < SLAB_FIRST(PL_regmatch_slab) 6892 || yes_state > SLAB_LAST(PL_regmatch_slab)) 6893 { 6894 /* not in this slab, pop slab */ 6895 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); 6896 PL_regmatch_slab = PL_regmatch_slab->prev; 6897 st = SLAB_LAST(PL_regmatch_slab); 6898 } 6899 depth -= (st - yes_state); 6900 #endif 6901 st = yes_state; 6902 yes_state = st->u.yes.prev_yes_state; 6903 PL_regmatch_state = st; 6904 6905 if (no_final) 6906 locinput= st->locinput; 6907 state_num = st->resume_state + no_final; 6908 goto reenter_switch; 6909 } 6910 6911 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", 6912 PL_colors[4], PL_colors[5])); 6913 6914 if (reginfo->info_aux_eval) { 6915 /* each successfully executed (?{...}) block does the equivalent of 6916 * local $^R = do {...} 6917 * When popping the save stack, all these locals would be undone; 6918 * bypass this by setting the outermost saved $^R to the latest 6919 * value */ 6920 /* I dont know if this is needed or works properly now. 6921 * see code related to PL_replgv elsewhere in this file. 6922 * Yves 6923 */ 6924 if (oreplsv != GvSV(PL_replgv)) 6925 sv_setsv(oreplsv, GvSV(PL_replgv)); 6926 } 6927 result = 1; 6928 goto final_exit; 6929 6930 no: 6931 DEBUG_EXECUTE_r( 6932 PerlIO_printf(Perl_debug_log, 6933 "%*s %sfailed...%s\n", 6934 REPORT_CODE_OFF+depth*2, "", 6935 PL_colors[4], PL_colors[5]) 6936 ); 6937 6938 no_silent: 6939 if (no_final) { 6940 if (yes_state) { 6941 goto yes; 6942 } else { 6943 goto final_exit; 6944 } 6945 } 6946 if (depth) { 6947 /* there's a previous state to backtrack to */ 6948 st--; 6949 if (st < SLAB_FIRST(PL_regmatch_slab)) { 6950 PL_regmatch_slab = PL_regmatch_slab->prev; 6951 st = SLAB_LAST(PL_regmatch_slab); 6952 } 6953 PL_regmatch_state = st; 6954 locinput= st->locinput; 6955 6956 DEBUG_STATE_pp("pop"); 6957 depth--; 6958 if (yes_state == st) 6959 yes_state = st->u.yes.prev_yes_state; 6960 6961 state_num = st->resume_state + 1; /* failure = success + 1 */ 6962 goto reenter_switch; 6963 } 6964 result = 0; 6965 6966 final_exit: 6967 if (rex->intflags & PREGf_VERBARG_SEEN) { 6968 SV *sv_err = get_sv("REGERROR", 1); 6969 SV *sv_mrk = get_sv("REGMARK", 1); 6970 if (result) { 6971 sv_commit = &PL_sv_no; 6972 if (!sv_yes_mark) 6973 sv_yes_mark = &PL_sv_yes; 6974 } else { 6975 if (!sv_commit) 6976 sv_commit = &PL_sv_yes; 6977 sv_yes_mark = &PL_sv_no; 6978 } 6979 sv_setsv(sv_err, sv_commit); 6980 sv_setsv(sv_mrk, sv_yes_mark); 6981 } 6982 6983 6984 if (last_pushed_cv) { 6985 dSP; 6986 POP_MULTICALL; 6987 PERL_UNUSED_VAR(SP); 6988 } 6989 6990 assert(!result || locinput - reginfo->strbeg >= 0); 6991 return result ? locinput - reginfo->strbeg : -1; 6992 } 6993 6994 /* 6995 - regrepeat - repeatedly match something simple, report how many 6996 * 6997 * What 'simple' means is a node which can be the operand of a quantifier like 6998 * '+', or {1,3} 6999 * 7000 * startposp - pointer a pointer to the start position. This is updated 7001 * to point to the byte following the highest successful 7002 * match. 7003 * p - the regnode to be repeatedly matched against. 7004 * reginfo - struct holding match state, such as strend 7005 * max - maximum number of things to match. 7006 * depth - (for debugging) backtracking depth. 7007 */ 7008 STATIC I32 7009 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, 7010 regmatch_info *const reginfo, I32 max, int depth) 7011 { 7012 dVAR; 7013 char *scan; /* Pointer to current position in target string */ 7014 I32 c; 7015 char *loceol = reginfo->strend; /* local version */ 7016 I32 hardcount = 0; /* How many matches so far */ 7017 bool utf8_target = reginfo->is_utf8_target; 7018 int to_complement = 0; /* Invert the result? */ 7019 UV utf8_flags; 7020 _char_class_number classnum; 7021 #ifndef DEBUGGING 7022 PERL_UNUSED_ARG(depth); 7023 #endif 7024 7025 PERL_ARGS_ASSERT_REGREPEAT; 7026 7027 scan = *startposp; 7028 if (max == REG_INFTY) 7029 max = I32_MAX; 7030 else if (! utf8_target && loceol - scan > max) 7031 loceol = scan + max; 7032 7033 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down 7034 * to the maximum of how far we should go in it (leaving it set to the real 7035 * end, if the maximum permissible would take us beyond that). This allows 7036 * us to make the loop exit condition that we haven't gone past <loceol> to 7037 * also mean that we haven't exceeded the max permissible count, saving a 7038 * test each time through the loop. But it assumes that the OP matches a 7039 * single byte, which is true for most of the OPs below when applied to a 7040 * non-UTF-8 target. Those relatively few OPs that don't have this 7041 * characteristic will have to compensate. 7042 * 7043 * There is no adjustment for UTF-8 targets, as the number of bytes per 7044 * character varies. OPs will have to test both that the count is less 7045 * than the max permissible (using <hardcount> to keep track), and that we 7046 * are still within the bounds of the string (using <loceol>. A few OPs 7047 * match a single byte no matter what the encoding. They can omit the max 7048 * test if, for the UTF-8 case, they do the adjustment that was skipped 7049 * above. 7050 * 7051 * Thus, the code above sets things up for the common case; and exceptional 7052 * cases need extra work; the common case is to make sure <scan> doesn't 7053 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the 7054 * count doesn't exceed the maximum permissible */ 7055 7056 switch (OP(p)) { 7057 case REG_ANY: 7058 if (utf8_target) { 7059 while (scan < loceol && hardcount < max && *scan != '\n') { 7060 scan += UTF8SKIP(scan); 7061 hardcount++; 7062 } 7063 } else { 7064 while (scan < loceol && *scan != '\n') 7065 scan++; 7066 } 7067 break; 7068 case SANY: 7069 if (utf8_target) { 7070 while (scan < loceol && hardcount < max) { 7071 scan += UTF8SKIP(scan); 7072 hardcount++; 7073 } 7074 } 7075 else 7076 scan = loceol; 7077 break; 7078 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */ 7079 if (utf8_target && loceol - scan > max) { 7080 7081 /* <loceol> hadn't been adjusted in the UTF-8 case */ 7082 scan += max; 7083 } 7084 else { 7085 scan = loceol; 7086 } 7087 break; 7088 case EXACT: 7089 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); 7090 7091 c = (U8)*STRING(p); 7092 7093 /* Can use a simple loop if the pattern char to match on is invariant 7094 * under UTF-8, or both target and pattern aren't UTF-8. Note that we 7095 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's 7096 * true iff it doesn't matter if the argument is in UTF-8 or not */ 7097 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { 7098 if (utf8_target && loceol - scan > max) { 7099 /* We didn't adjust <loceol> because is UTF-8, but ok to do so, 7100 * since here, to match at all, 1 char == 1 byte */ 7101 loceol = scan + max; 7102 } 7103 while (scan < loceol && UCHARAT(scan) == c) { 7104 scan++; 7105 } 7106 } 7107 else if (reginfo->is_utf8_pat) { 7108 if (utf8_target) { 7109 STRLEN scan_char_len; 7110 7111 /* When both target and pattern are UTF-8, we have to do 7112 * string EQ */ 7113 while (hardcount < max 7114 && scan < loceol 7115 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) 7116 && memEQ(scan, STRING(p), scan_char_len)) 7117 { 7118 scan += scan_char_len; 7119 hardcount++; 7120 } 7121 } 7122 else if (! UTF8_IS_ABOVE_LATIN1(c)) { 7123 7124 /* Target isn't utf8; convert the character in the UTF-8 7125 * pattern to non-UTF8, and do a simple loop */ 7126 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); 7127 while (scan < loceol && UCHARAT(scan) == c) { 7128 scan++; 7129 } 7130 } /* else pattern char is above Latin1, can't possibly match the 7131 non-UTF-8 target */ 7132 } 7133 else { 7134 7135 /* Here, the string must be utf8; pattern isn't, and <c> is 7136 * different in utf8 than not, so can't compare them directly. 7137 * Outside the loop, find the two utf8 bytes that represent c, and 7138 * then look for those in sequence in the utf8 string */ 7139 U8 high = UTF8_TWO_BYTE_HI(c); 7140 U8 low = UTF8_TWO_BYTE_LO(c); 7141 7142 while (hardcount < max 7143 && scan + 1 < loceol 7144 && UCHARAT(scan) == high 7145 && UCHARAT(scan + 1) == low) 7146 { 7147 scan += 2; 7148 hardcount++; 7149 } 7150 } 7151 break; 7152 7153 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ 7154 assert(! reginfo->is_utf8_pat); 7155 /* FALL THROUGH */ 7156 case EXACTFA: 7157 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7158 goto do_exactf; 7159 7160 case EXACTFL: 7161 utf8_flags = FOLDEQ_LOCALE; 7162 goto do_exactf; 7163 7164 case EXACTF: /* This node only generated for non-utf8 patterns */ 7165 assert(! reginfo->is_utf8_pat); 7166 utf8_flags = 0; 7167 goto do_exactf; 7168 7169 case EXACTFU_SS: 7170 case EXACTFU: 7171 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; 7172 7173 do_exactf: { 7174 int c1, c2; 7175 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; 7176 7177 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); 7178 7179 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, 7180 reginfo)) 7181 { 7182 if (c1 == CHRTEST_VOID) { 7183 /* Use full Unicode fold matching */ 7184 char *tmpeol = reginfo->strend; 7185 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; 7186 while (hardcount < max 7187 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, 7188 STRING(p), NULL, pat_len, 7189 reginfo->is_utf8_pat, utf8_flags)) 7190 { 7191 scan = tmpeol; 7192 tmpeol = reginfo->strend; 7193 hardcount++; 7194 } 7195 } 7196 else if (utf8_target) { 7197 if (c1 == c2) { 7198 while (scan < loceol 7199 && hardcount < max 7200 && memEQ(scan, c1_utf8, UTF8SKIP(scan))) 7201 { 7202 scan += UTF8SKIP(scan); 7203 hardcount++; 7204 } 7205 } 7206 else { 7207 while (scan < loceol 7208 && hardcount < max 7209 && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) 7210 || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) 7211 { 7212 scan += UTF8SKIP(scan); 7213 hardcount++; 7214 } 7215 } 7216 } 7217 else if (c1 == c2) { 7218 while (scan < loceol && UCHARAT(scan) == c1) { 7219 scan++; 7220 } 7221 } 7222 else { 7223 while (scan < loceol && 7224 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) 7225 { 7226 scan++; 7227 } 7228 } 7229 } 7230 break; 7231 } 7232 case ANYOF: 7233 if (utf8_target) { 7234 while (hardcount < max 7235 && scan < loceol 7236 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) 7237 { 7238 scan += UTF8SKIP(scan); 7239 hardcount++; 7240 } 7241 } else { 7242 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) 7243 scan++; 7244 } 7245 break; 7246 7247 /* The argument (FLAGS) to all the POSIX node types is the class number */ 7248 7249 case NPOSIXL: 7250 to_complement = 1; 7251 /* FALLTHROUGH */ 7252 7253 case POSIXL: 7254 if (! utf8_target) { 7255 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), 7256 *scan))) 7257 { 7258 scan++; 7259 } 7260 } else { 7261 while (hardcount < max && scan < loceol 7262 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), 7263 (U8 *) scan))) 7264 { 7265 scan += UTF8SKIP(scan); 7266 hardcount++; 7267 } 7268 } 7269 break; 7270 7271 case POSIXD: 7272 if (utf8_target) { 7273 goto utf8_posix; 7274 } 7275 /* FALLTHROUGH */ 7276 7277 case POSIXA: 7278 if (utf8_target && loceol - scan > max) { 7279 7280 /* We didn't adjust <loceol> at the beginning of this routine 7281 * because is UTF-8, but it is actually ok to do so, since here, to 7282 * match, 1 char == 1 byte. */ 7283 loceol = scan + max; 7284 } 7285 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { 7286 scan++; 7287 } 7288 break; 7289 7290 case NPOSIXD: 7291 if (utf8_target) { 7292 to_complement = 1; 7293 goto utf8_posix; 7294 } 7295 /* FALL THROUGH */ 7296 7297 case NPOSIXA: 7298 if (! utf8_target) { 7299 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { 7300 scan++; 7301 } 7302 } 7303 else { 7304 7305 /* The complement of something that matches only ASCII matches all 7306 * UTF-8 variant code points, plus everything in ASCII that isn't 7307 * in the class. */ 7308 while (hardcount < max && scan < loceol 7309 && (! UTF8_IS_INVARIANT(*scan) 7310 || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) 7311 { 7312 scan += UTF8SKIP(scan); 7313 hardcount++; 7314 } 7315 } 7316 break; 7317 7318 case NPOSIXU: 7319 to_complement = 1; 7320 /* FALLTHROUGH */ 7321 7322 case POSIXU: 7323 if (! utf8_target) { 7324 while (scan < loceol && to_complement 7325 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) 7326 { 7327 scan++; 7328 } 7329 } 7330 else { 7331 utf8_posix: 7332 classnum = (_char_class_number) FLAGS(p); 7333 if (classnum < _FIRST_NON_SWASH_CC) { 7334 7335 /* Here, a swash is needed for above-Latin1 code points. 7336 * Process as many Latin1 code points using the built-in rules. 7337 * Go to another loop to finish processing upon encountering 7338 * the first Latin1 code point. We could do that in this loop 7339 * as well, but the other way saves having to test if the swash 7340 * has been loaded every time through the loop: extra space to 7341 * save a test. */ 7342 while (hardcount < max && scan < loceol) { 7343 if (UTF8_IS_INVARIANT(*scan)) { 7344 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan, 7345 classnum)))) 7346 { 7347 break; 7348 } 7349 scan++; 7350 } 7351 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { 7352 if (! (to_complement 7353 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, 7354 *(scan + 1)), 7355 classnum)))) 7356 { 7357 break; 7358 } 7359 scan += 2; 7360 } 7361 else { 7362 goto found_above_latin1; 7363 } 7364 7365 hardcount++; 7366 } 7367 } 7368 else { 7369 /* For these character classes, the knowledge of how to handle 7370 * every code point is compiled in to Perl via a macro. This 7371 * code is written for making the loops as tight as possible. 7372 * It could be refactored to save space instead */ 7373 switch (classnum) { 7374 case _CC_ENUM_SPACE: /* XXX would require separate code 7375 if we revert the change of \v 7376 matching this */ 7377 /* FALL THROUGH */ 7378 case _CC_ENUM_PSXSPC: 7379 while (hardcount < max 7380 && scan < loceol 7381 && (to_complement ^ cBOOL(isSPACE_utf8(scan)))) 7382 { 7383 scan += UTF8SKIP(scan); 7384 hardcount++; 7385 } 7386 break; 7387 case _CC_ENUM_BLANK: 7388 while (hardcount < max 7389 && scan < loceol 7390 && (to_complement ^ cBOOL(isBLANK_utf8(scan)))) 7391 { 7392 scan += UTF8SKIP(scan); 7393 hardcount++; 7394 } 7395 break; 7396 case _CC_ENUM_XDIGIT: 7397 while (hardcount < max 7398 && scan < loceol 7399 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan)))) 7400 { 7401 scan += UTF8SKIP(scan); 7402 hardcount++; 7403 } 7404 break; 7405 case _CC_ENUM_VERTSPACE: 7406 while (hardcount < max 7407 && scan < loceol 7408 && (to_complement ^ cBOOL(isVERTWS_utf8(scan)))) 7409 { 7410 scan += UTF8SKIP(scan); 7411 hardcount++; 7412 } 7413 break; 7414 case _CC_ENUM_CNTRL: 7415 while (hardcount < max 7416 && scan < loceol 7417 && (to_complement ^ cBOOL(isCNTRL_utf8(scan)))) 7418 { 7419 scan += UTF8SKIP(scan); 7420 hardcount++; 7421 } 7422 break; 7423 default: 7424 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum); 7425 } 7426 } 7427 } 7428 break; 7429 7430 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */ 7431 7432 /* Load the swash if not already present */ 7433 if (! PL_utf8_swash_ptrs[classnum]) { 7434 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; 7435 PL_utf8_swash_ptrs[classnum] = _core_swash_init( 7436 "utf8", 7437 "", 7438 &PL_sv_undef, 1, 0, 7439 PL_XPosix_ptrs[classnum], &flags); 7440 } 7441 7442 while (hardcount < max && scan < loceol 7443 && to_complement ^ cBOOL(_generic_utf8( 7444 classnum, 7445 scan, 7446 swash_fetch(PL_utf8_swash_ptrs[classnum], 7447 (U8 *) scan, 7448 TRUE)))) 7449 { 7450 scan += UTF8SKIP(scan); 7451 hardcount++; 7452 } 7453 break; 7454 7455 case LNBREAK: 7456 if (utf8_target) { 7457 while (hardcount < max && scan < loceol && 7458 (c=is_LNBREAK_utf8_safe(scan, loceol))) { 7459 scan += c; 7460 hardcount++; 7461 } 7462 } else { 7463 /* LNBREAK can match one or two latin chars, which is ok, but we 7464 * have to use hardcount in this situation, and throw away the 7465 * adjustment to <loceol> done before the switch statement */ 7466 loceol = reginfo->strend; 7467 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { 7468 scan+=c; 7469 hardcount++; 7470 } 7471 } 7472 break; 7473 7474 case BOUND: 7475 case BOUNDA: 7476 case BOUNDL: 7477 case BOUNDU: 7478 case EOS: 7479 case GPOS: 7480 case KEEPS: 7481 case NBOUND: 7482 case NBOUNDA: 7483 case NBOUNDL: 7484 case NBOUNDU: 7485 case OPFAIL: 7486 case SBOL: 7487 case SEOL: 7488 /* These are all 0 width, so match right here or not at all. */ 7489 break; 7490 7491 default: 7492 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); 7493 assert(0); /* NOTREACHED */ 7494 7495 } 7496 7497 if (hardcount) 7498 c = hardcount; 7499 else 7500 c = scan - *startposp; 7501 *startposp = scan; 7502 7503 DEBUG_r({ 7504 GET_RE_DEBUG_FLAGS_DECL; 7505 DEBUG_EXECUTE_r({ 7506 SV * const prop = sv_newmortal(); 7507 regprop(prog, prop, p, reginfo); 7508 PerlIO_printf(Perl_debug_log, 7509 "%*s %s can match %"IVdf" times out of %"IVdf"...\n", 7510 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); 7511 }); 7512 }); 7513 7514 return(c); 7515 } 7516 7517 7518 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 7519 /* 7520 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to 7521 create a copy so that changes the caller makes won't change the shared one. 7522 If <altsvp> is non-null, will return NULL in it, for back-compat. 7523 */ 7524 SV * 7525 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp) 7526 { 7527 PERL_ARGS_ASSERT_REGCLASS_SWASH; 7528 7529 if (altsvp) { 7530 *altsvp = NULL; 7531 } 7532 7533 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); 7534 } 7535 7536 SV * 7537 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, 7538 const regnode* node, 7539 bool doinit, 7540 SV** listsvp, 7541 SV** only_utf8_locale_ptr) 7542 { 7543 /* For internal core use only. 7544 * Returns the swash for the input 'node' in the regex 'prog'. 7545 * If <doinit> is 'true', will attempt to create the swash if not already 7546 * done. 7547 * If <listsvp> is non-null, will return the printable contents of the 7548 * swash. This can be used to get debugging information even before the 7549 * swash exists, by calling this function with 'doinit' set to false, in 7550 * which case the components that will be used to eventually create the 7551 * swash are returned (in a printable form). 7552 * Tied intimately to how regcomp.c sets up the data structure */ 7553 7554 dVAR; 7555 SV *sw = NULL; 7556 SV *si = NULL; /* Input swash initialization string */ 7557 SV* invlist = NULL; 7558 7559 RXi_GET_DECL(prog,progi); 7560 const struct reg_data * const data = prog ? progi->data : NULL; 7561 7562 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; 7563 7564 assert(ANYOF_FLAGS(node) 7565 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); 7566 7567 if (data && data->count) { 7568 const U32 n = ARG(node); 7569 7570 if (data->what[n] == 's') { 7571 SV * const rv = MUTABLE_SV(data->data[n]); 7572 AV * const av = MUTABLE_AV(SvRV(rv)); 7573 SV **const ary = AvARRAY(av); 7574 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; 7575 7576 si = *ary; /* ary[0] = the string to initialize the swash with */ 7577 7578 /* Elements 3 and 4 are either both present or both absent. [3] is 7579 * any inversion list generated at compile time; [4] indicates if 7580 * that inversion list has any user-defined properties in it. */ 7581 if (av_tindex(av) >= 2) { 7582 if (only_utf8_locale_ptr 7583 && ary[2] 7584 && ary[2] != &PL_sv_undef) 7585 { 7586 *only_utf8_locale_ptr = ary[2]; 7587 } 7588 else { 7589 *only_utf8_locale_ptr = NULL; 7590 } 7591 7592 if (av_tindex(av) >= 3) { 7593 invlist = ary[3]; 7594 if (SvUV(ary[4])) { 7595 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; 7596 } 7597 } 7598 else { 7599 invlist = NULL; 7600 } 7601 } 7602 7603 /* Element [1] is reserved for the set-up swash. If already there, 7604 * return it; if not, create it and store it there */ 7605 if (ary[1] && SvROK(ary[1])) { 7606 sw = ary[1]; 7607 } 7608 else if (doinit && ((si && si != &PL_sv_undef) 7609 || (invlist && invlist != &PL_sv_undef))) { 7610 7611 sw = _core_swash_init("utf8", /* the utf8 package */ 7612 "", /* nameless */ 7613 si, 7614 1, /* binary */ 7615 0, /* not from tr/// */ 7616 invlist, 7617 &swash_init_flags); 7618 (void)av_store(av, 1, sw); 7619 } 7620 } 7621 } 7622 7623 /* If requested, return a printable version of what this swash matches */ 7624 if (listsvp) { 7625 SV* matches_string = newSVpvn("", 0); 7626 7627 /* The swash should be used, if possible, to get the data, as it 7628 * contains the resolved data. But this function can be called at 7629 * compile-time, before everything gets resolved, in which case we 7630 * return the currently best available information, which is the string 7631 * that will eventually be used to do that resolving, 'si' */ 7632 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) 7633 && (si && si != &PL_sv_undef)) 7634 { 7635 sv_catsv(matches_string, si); 7636 } 7637 7638 /* Add the inversion list to whatever we have. This may have come from 7639 * the swash, or from an input parameter */ 7640 if (invlist) { 7641 sv_catsv(matches_string, _invlist_contents(invlist)); 7642 } 7643 *listsvp = matches_string; 7644 } 7645 7646 return sw; 7647 } 7648 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ 7649 7650 /* 7651 - reginclass - determine if a character falls into a character class 7652 7653 n is the ANYOF regnode 7654 p is the target string 7655 p_end points to one byte beyond the end of the target string 7656 utf8_target tells whether p is in UTF-8. 7657 7658 Returns true if matched; false otherwise. 7659 7660 Note that this can be a synthetic start class, a combination of various 7661 nodes, so things you think might be mutually exclusive, such as locale, 7662 aren't. It can match both locale and non-locale 7663 7664 */ 7665 7666 STATIC bool 7667 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) 7668 { 7669 dVAR; 7670 const char flags = ANYOF_FLAGS(n); 7671 bool match = FALSE; 7672 UV c = *p; 7673 7674 PERL_ARGS_ASSERT_REGINCLASS; 7675 7676 /* If c is not already the code point, get it. Note that 7677 * UTF8_IS_INVARIANT() works even if not in UTF-8 */ 7678 if (! UTF8_IS_INVARIANT(c) && utf8_target) { 7679 STRLEN c_len = 0; 7680 c = utf8n_to_uvchr(p, p_end - p, &c_len, 7681 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) 7682 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); 7683 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for 7684 * UTF8_ALLOW_FFFF */ 7685 if (c_len == (STRLEN)-1) 7686 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 7687 } 7688 7689 /* If this character is potentially in the bitmap, check it */ 7690 if (c < 256) { 7691 if (ANYOF_BITMAP_TEST(n, c)) 7692 match = TRUE; 7693 else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL 7694 && ! utf8_target 7695 && ! isASCII(c)) 7696 { 7697 match = TRUE; 7698 } 7699 else if (flags & ANYOF_LOCALE_FLAGS) { 7700 if (flags & ANYOF_LOC_FOLD) { 7701 if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { 7702 match = TRUE; 7703 } 7704 } 7705 if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { 7706 7707 /* The data structure is arranged so bits 0, 2, 4, ... are set 7708 * if the class includes the Posix character class given by 7709 * bit/2; and 1, 3, 5, ... are set if the class includes the 7710 * complemented Posix class given by int(bit/2). So we loop 7711 * through the bits, each time changing whether we complement 7712 * the result or not. Suppose for the sake of illustration 7713 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0 7714 * is set, it means there is a match for this ANYOF node if the 7715 * character is in the class given by the expression (0 / 2 = 0 7716 * = \w). If it is in that class, isFOO_lc() will return 1, 7717 * and since 'to_complement' is 0, the result will stay TRUE, 7718 * and we exit the loop. Suppose instead that bit 0 is 0, but 7719 * bit 1 is 1. That means there is a match if the character 7720 * matches \W. We won't bother to call isFOO_lc() on bit 0, 7721 * but will on bit 1. On the second iteration 'to_complement' 7722 * will be 1, so the exclusive or will reverse things, so we 7723 * are testing for \W. On the third iteration, 'to_complement' 7724 * will be 0, and we would be testing for \s; the fourth 7725 * iteration would test for \S, etc. 7726 * 7727 * Note that this code assumes that all the classes are closed 7728 * under folding. For example, if a character matches \w, then 7729 * its fold does too; and vice versa. This should be true for 7730 * any well-behaved locale for all the currently defined Posix 7731 * classes, except for :lower: and :upper:, which are handled 7732 * by the pseudo-class :cased: which matches if either of the 7733 * other two does. To get rid of this assumption, an outer 7734 * loop could be used below to iterate over both the source 7735 * character, and its fold (if different) */ 7736 7737 int count = 0; 7738 int to_complement = 0; 7739 7740 while (count < ANYOF_MAX) { 7741 if (ANYOF_POSIXL_TEST(n, count) 7742 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) 7743 { 7744 match = TRUE; 7745 break; 7746 } 7747 count++; 7748 to_complement ^= 1; 7749 } 7750 } 7751 } 7752 } 7753 7754 7755 /* If the bitmap didn't (or couldn't) match, and something outside the 7756 * bitmap could match, try that. */ 7757 if (!match) { 7758 if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { 7759 match = TRUE; /* Everything above 255 matches */ 7760 } 7761 else if ((flags & ANYOF_NONBITMAP_NON_UTF8) 7762 || (utf8_target && (flags & ANYOF_UTF8)) 7763 || ((flags & ANYOF_LOC_FOLD) 7764 && IN_UTF8_CTYPE_LOCALE 7765 && ARG(n) != ANYOF_NONBITMAP_EMPTY)) 7766 { 7767 SV* only_utf8_locale = NULL; 7768 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, 7769 &only_utf8_locale); 7770 if (sw) { 7771 U8 * utf8_p; 7772 if (utf8_target) { 7773 utf8_p = (U8 *) p; 7774 } else { /* Convert to utf8 */ 7775 STRLEN len = 1; 7776 utf8_p = bytes_to_utf8(p, &len); 7777 } 7778 7779 if (swash_fetch(sw, utf8_p, TRUE)) { 7780 match = TRUE; 7781 } 7782 7783 /* If we allocated a string above, free it */ 7784 if (! utf8_target) Safefree(utf8_p); 7785 } 7786 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { 7787 match = _invlist_contains_cp(only_utf8_locale, c); 7788 } 7789 } 7790 7791 if (UNICODE_IS_SUPER(c) 7792 && (flags & ANYOF_WARN_SUPER) 7793 && ckWARN_d(WARN_NON_UNICODE)) 7794 { 7795 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 7796 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); 7797 } 7798 } 7799 7800 #if ANYOF_INVERT != 1 7801 /* Depending on compiler optimization cBOOL takes time, so if don't have to 7802 * use it, don't */ 7803 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, 7804 #endif 7805 7806 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ 7807 return (flags & ANYOF_INVERT) ^ match; 7808 } 7809 7810 STATIC U8 * 7811 S_reghop3(U8 *s, SSize_t off, const U8* lim) 7812 { 7813 /* return the position 'off' UTF-8 characters away from 's', forward if 7814 * 'off' >= 0, backwards if negative. But don't go outside of position 7815 * 'lim', which better be < s if off < 0 */ 7816 7817 dVAR; 7818 7819 PERL_ARGS_ASSERT_REGHOP3; 7820 7821 if (off >= 0) { 7822 while (off-- && s < lim) { 7823 /* XXX could check well-formedness here */ 7824 s += UTF8SKIP(s); 7825 } 7826 } 7827 else { 7828 while (off++ && s > lim) { 7829 s--; 7830 if (UTF8_IS_CONTINUED(*s)) { 7831 while (s > lim && UTF8_IS_CONTINUATION(*s)) 7832 s--; 7833 if (! UTF8_IS_START(*s)) { 7834 dTHX; 7835 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 7836 } 7837 } 7838 /* XXX could check well-formedness here */ 7839 } 7840 } 7841 return s; 7842 } 7843 7844 STATIC U8 * 7845 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) 7846 { 7847 dVAR; 7848 7849 PERL_ARGS_ASSERT_REGHOP4; 7850 7851 if (off >= 0) { 7852 while (off-- && s < rlim) { 7853 /* XXX could check well-formedness here */ 7854 s += UTF8SKIP(s); 7855 } 7856 } 7857 else { 7858 while (off++ && s > llim) { 7859 s--; 7860 if (UTF8_IS_CONTINUED(*s)) { 7861 while (s > llim && UTF8_IS_CONTINUATION(*s)) 7862 s--; 7863 if (! UTF8_IS_START(*s)) { 7864 dTHX; 7865 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 7866 } 7867 } 7868 /* XXX could check well-formedness here */ 7869 } 7870 } 7871 return s; 7872 } 7873 7874 /* like reghop3, but returns NULL on overrun, rather than returning last 7875 * char pos */ 7876 7877 STATIC U8 * 7878 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) 7879 { 7880 dVAR; 7881 7882 PERL_ARGS_ASSERT_REGHOPMAYBE3; 7883 7884 if (off >= 0) { 7885 while (off-- && s < lim) { 7886 /* XXX could check well-formedness here */ 7887 s += UTF8SKIP(s); 7888 } 7889 if (off >= 0) 7890 return NULL; 7891 } 7892 else { 7893 while (off++ && s > lim) { 7894 s--; 7895 if (UTF8_IS_CONTINUED(*s)) { 7896 while (s > lim && UTF8_IS_CONTINUATION(*s)) 7897 s--; 7898 if (! UTF8_IS_START(*s)) { 7899 dTHX; 7900 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 7901 } 7902 } 7903 /* XXX could check well-formedness here */ 7904 } 7905 if (off <= 0) 7906 return NULL; 7907 } 7908 return s; 7909 } 7910 7911 7912 /* when executing a regex that may have (?{}), extra stuff needs setting 7913 up that will be visible to the called code, even before the current 7914 match has finished. In particular: 7915 7916 * $_ is localised to the SV currently being matched; 7917 * pos($_) is created if necessary, ready to be updated on each call-out 7918 to code; 7919 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm 7920 isn't set until the current pattern is successfully finished), so that 7921 $1 etc of the match-so-far can be seen; 7922 * save the old values of subbeg etc of the current regex, and set then 7923 to the current string (again, this is normally only done at the end 7924 of execution) 7925 */ 7926 7927 static void 7928 S_setup_eval_state(pTHX_ regmatch_info *const reginfo) 7929 { 7930 MAGIC *mg; 7931 regexp *const rex = ReANY(reginfo->prog); 7932 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; 7933 7934 eval_state->rex = rex; 7935 7936 if (reginfo->sv) { 7937 /* Make $_ available to executed code. */ 7938 if (reginfo->sv != DEFSV) { 7939 SAVE_DEFSV; 7940 DEFSV_set(reginfo->sv); 7941 } 7942 7943 if (!(mg = mg_find_mglob(reginfo->sv))) { 7944 /* prepare for quick setting of pos */ 7945 mg = sv_magicext_mglob(reginfo->sv); 7946 mg->mg_len = -1; 7947 } 7948 eval_state->pos_magic = mg; 7949 eval_state->pos = mg->mg_len; 7950 eval_state->pos_flags = mg->mg_flags; 7951 } 7952 else 7953 eval_state->pos_magic = NULL; 7954 7955 if (!PL_reg_curpm) { 7956 /* PL_reg_curpm is a fake PMOP that we can attach the current 7957 * regex to and point PL_curpm at, so that $1 et al are visible 7958 * within a /(?{})/. It's just allocated once per interpreter the 7959 * first time its needed */ 7960 Newxz(PL_reg_curpm, 1, PMOP); 7961 #ifdef USE_ITHREADS 7962 { 7963 SV* const repointer = &PL_sv_undef; 7964 /* this regexp is also owned by the new PL_reg_curpm, which 7965 will try to free it. */ 7966 av_push(PL_regex_padav, repointer); 7967 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); 7968 PL_regex_pad = AvARRAY(PL_regex_padav); 7969 } 7970 #endif 7971 } 7972 SET_reg_curpm(reginfo->prog); 7973 eval_state->curpm = PL_curpm; 7974 PL_curpm = PL_reg_curpm; 7975 if (RXp_MATCH_COPIED(rex)) { 7976 /* Here is a serious problem: we cannot rewrite subbeg, 7977 since it may be needed if this match fails. Thus 7978 $` inside (?{}) could fail... */ 7979 eval_state->subbeg = rex->subbeg; 7980 eval_state->sublen = rex->sublen; 7981 eval_state->suboffset = rex->suboffset; 7982 eval_state->subcoffset = rex->subcoffset; 7983 #ifdef PERL_ANY_COW 7984 eval_state->saved_copy = rex->saved_copy; 7985 #endif 7986 RXp_MATCH_COPIED_off(rex); 7987 } 7988 else 7989 eval_state->subbeg = NULL; 7990 rex->subbeg = (char *)reginfo->strbeg; 7991 rex->suboffset = 0; 7992 rex->subcoffset = 0; 7993 rex->sublen = reginfo->strend - reginfo->strbeg; 7994 } 7995 7996 7997 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ 7998 7999 static void 8000 S_cleanup_regmatch_info_aux(pTHX_ void *arg) 8001 { 8002 dVAR; 8003 regmatch_info_aux *aux = (regmatch_info_aux *) arg; 8004 regmatch_info_aux_eval *eval_state = aux->info_aux_eval; 8005 regmatch_slab *s; 8006 8007 Safefree(aux->poscache); 8008 8009 if (eval_state) { 8010 8011 /* undo the effects of S_setup_eval_state() */ 8012 8013 if (eval_state->subbeg) { 8014 regexp * const rex = eval_state->rex; 8015 rex->subbeg = eval_state->subbeg; 8016 rex->sublen = eval_state->sublen; 8017 rex->suboffset = eval_state->suboffset; 8018 rex->subcoffset = eval_state->subcoffset; 8019 #ifdef PERL_ANY_COW 8020 rex->saved_copy = eval_state->saved_copy; 8021 #endif 8022 RXp_MATCH_COPIED_on(rex); 8023 } 8024 if (eval_state->pos_magic) 8025 { 8026 eval_state->pos_magic->mg_len = eval_state->pos; 8027 eval_state->pos_magic->mg_flags = 8028 (eval_state->pos_magic->mg_flags & ~MGf_BYTES) 8029 | (eval_state->pos_flags & MGf_BYTES); 8030 } 8031 8032 PL_curpm = eval_state->curpm; 8033 } 8034 8035 PL_regmatch_state = aux->old_regmatch_state; 8036 PL_regmatch_slab = aux->old_regmatch_slab; 8037 8038 /* free all slabs above current one - this must be the last action 8039 * of this function, as aux and eval_state are allocated within 8040 * slabs and may be freed here */ 8041 8042 s = PL_regmatch_slab->next; 8043 if (s) { 8044 PL_regmatch_slab->next = NULL; 8045 while (s) { 8046 regmatch_slab * const osl = s; 8047 s = s->next; 8048 Safefree(osl); 8049 } 8050 } 8051 } 8052 8053 8054 STATIC void 8055 S_to_utf8_substr(pTHX_ regexp *prog) 8056 { 8057 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile 8058 * on the converted value */ 8059 8060 int i = 1; 8061 8062 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; 8063 8064 do { 8065 if (prog->substrs->data[i].substr 8066 && !prog->substrs->data[i].utf8_substr) { 8067 SV* const sv = newSVsv(prog->substrs->data[i].substr); 8068 prog->substrs->data[i].utf8_substr = sv; 8069 sv_utf8_upgrade(sv); 8070 if (SvVALID(prog->substrs->data[i].substr)) { 8071 if (SvTAIL(prog->substrs->data[i].substr)) { 8072 /* Trim the trailing \n that fbm_compile added last 8073 time. */ 8074 SvCUR_set(sv, SvCUR(sv) - 1); 8075 /* Whilst this makes the SV technically "invalid" (as its 8076 buffer is no longer followed by "\0") when fbm_compile() 8077 adds the "\n" back, a "\0" is restored. */ 8078 fbm_compile(sv, FBMcf_TAIL); 8079 } else 8080 fbm_compile(sv, 0); 8081 } 8082 if (prog->substrs->data[i].substr == prog->check_substr) 8083 prog->check_utf8 = sv; 8084 } 8085 } while (i--); 8086 } 8087 8088 STATIC bool 8089 S_to_byte_substr(pTHX_ regexp *prog) 8090 { 8091 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile 8092 * on the converted value; returns FALSE if can't be converted. */ 8093 8094 dVAR; 8095 int i = 1; 8096 8097 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; 8098 8099 do { 8100 if (prog->substrs->data[i].utf8_substr 8101 && !prog->substrs->data[i].substr) { 8102 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); 8103 if (! sv_utf8_downgrade(sv, TRUE)) { 8104 return FALSE; 8105 } 8106 if (SvVALID(prog->substrs->data[i].utf8_substr)) { 8107 if (SvTAIL(prog->substrs->data[i].utf8_substr)) { 8108 /* Trim the trailing \n that fbm_compile added last 8109 time. */ 8110 SvCUR_set(sv, SvCUR(sv) - 1); 8111 fbm_compile(sv, FBMcf_TAIL); 8112 } else 8113 fbm_compile(sv, 0); 8114 } 8115 prog->substrs->data[i].substr = sv; 8116 if (prog->substrs->data[i].utf8_substr == prog->check_utf8) 8117 prog->check_substr = sv; 8118 } 8119 } while (i--); 8120 8121 return TRUE; 8122 } 8123 8124 /* 8125 * Local variables: 8126 * c-indentation-style: bsd 8127 * c-basic-offset: 4 8128 * indent-tabs-mode: nil 8129 * End: 8130 * 8131 * ex: set ts=8 sts=4 sw=4 et: 8132 */ 8133