1 /* regexec.c 2 */ 3 4 /* 5 * One Ring to rule them all, One Ring to find them 6 * 7 * [p.v of _The Lord of the Rings_, opening poem] 8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] 9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] 10 */ 11 12 /* This file contains functions for executing a regular expression. See 13 * also regcomp.c which funnily enough, contains functions for compiling 14 * a regular expression. 15 * 16 * This file is also copied at build time to ext/re/re_exec.c, where 17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 18 * This causes the main functions to be compiled under new names and with 19 * debugging support added, which makes "use re 'debug'" work. 20 */ 21 22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 23 * confused with the original package (see point 3 below). Thanks, Henry! 24 */ 25 26 /* Additional note: this code is very heavily munged from Henry's version 27 * in places. In some spots I've traded clarity for efficiency, so don't 28 * blame Henry for some of the lack of readability. 29 */ 30 31 /* The names of the functions have been changed from regcomp and 32 * regexec to pregcomp and pregexec in order to avoid conflicts 33 * with the POSIX routines of the same names. 34 */ 35 36 #ifdef PERL_EXT_RE_BUILD 37 #include "re_top.h" 38 #endif 39 40 /* 41 * pregcomp and pregexec -- regsub and regerror are not used in perl 42 * 43 * Copyright (c) 1986 by University of Toronto. 44 * Written by Henry Spencer. Not derived from licensed software. 45 * 46 * Permission is granted to anyone to use this software for any 47 * purpose on any computer system, and to redistribute it freely, 48 * subject to the following restrictions: 49 * 50 * 1. The author is not responsible for the consequences of use of 51 * this software, no matter how awful, even if they arise 52 * from defects in it. 53 * 54 * 2. The origin of this software must not be misrepresented, either 55 * by explicit claim or by omission. 56 * 57 * 3. Altered versions must be plainly marked as such, and must not 58 * be misrepresented as being the original software. 59 * 60 **** Alterations to Henry's code are... 61 **** 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 64 **** by Larry Wall and others 65 **** 66 **** You may distribute under the terms of either the GNU General Public 67 **** License or the Artistic License, as specified in the README file. 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGEX_ENGINE 75 #define PERL_IN_REGEXEC_C 76 #include "perl.h" 77 78 #ifdef PERL_IN_XSUB_RE 79 # include "re_comp.h" 80 #else 81 # include "regcomp.h" 82 #endif 83 84 #include "invlist_inline.h" 85 #include "unicode_constants.h" 86 87 static const char b_utf8_locale_required[] = 88 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong." 89 " Assuming a UTF-8 locale"; 90 91 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \ 92 STMT_START { \ 93 if (! IN_UTF8_CTYPE_LOCALE) { \ 94 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ 95 b_utf8_locale_required); \ 96 } \ 97 } STMT_END 98 99 static const char sets_utf8_locale_required[] = 100 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; 101 102 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \ 103 STMT_START { \ 104 if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\ 105 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ 106 sets_utf8_locale_required); \ 107 } \ 108 } STMT_END 109 110 #ifdef DEBUGGING 111 /* At least one required character in the target string is expressible only in 112 * UTF-8. */ 113 static const char non_utf8_target_but_utf8_required[] 114 = "Can't match, because target string needs to be in UTF-8\n"; 115 #endif 116 117 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ 118 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\ 119 goto target; \ 120 } STMT_END 121 122 #ifndef STATIC 123 #define STATIC static 124 #endif 125 126 /* 127 * Forwards. 128 */ 129 130 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) 131 132 #define HOPc(pos,off) \ 133 (char *)(reginfo->is_utf8_target \ 134 ? reghop3((U8*)pos, off, \ 135 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ 136 : (U8*)(pos + off)) 137 138 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */ 139 #define HOPBACK3(pos, off, lim) \ 140 (reginfo->is_utf8_target \ 141 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \ 142 : (pos - off >= lim) \ 143 ? (U8*)pos - off \ 144 : NULL) 145 146 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg)) 147 148 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) 149 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 150 151 /* lim must be +ve. Returns NULL on overshoot */ 152 #define HOPMAYBE3(pos,off,lim) \ 153 (reginfo->is_utf8_target \ 154 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ 155 : ((U8*)pos + off <= lim) \ 156 ? (U8*)pos + off \ 157 : NULL) 158 159 /* like HOP3, but limits the result to <= lim even for the non-utf8 case. 160 * off must be >=0; args should be vars rather than expressions */ 161 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ 162 ? reghop3((U8*)(pos), off, (U8*)(lim)) \ 163 : (U8*)((pos + off) > lim ? lim : (pos + off))) 164 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim)) 165 166 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ 167 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ 168 : (U8*)(pos + off)) 169 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) 170 171 #define PLACEHOLDER /* Something for the preprocessor to grab onto */ 172 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ 173 174 /* for use after a quantifier and before an EXACT-like node -- japhy */ 175 /* it would be nice to rework regcomp.sym to generate this stuff. sigh 176 * 177 * NOTE that *nothing* that affects backtracking should be in here, specifically 178 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a 179 * node that is in between two EXACT like nodes when ascertaining what the required 180 * "follow" character is. This should probably be moved to regex compile time 181 * although it may be done at run time because of the REF possibility - more 182 * investigation required. -- demerphq 183 */ 184 #define JUMPABLE(rn) ( \ 185 OP(rn) == OPEN || \ 186 (OP(rn) == CLOSE && \ 187 !EVAL_CLOSE_PAREN_IS(cur_eval,PARNO(rn)) ) || \ 188 OP(rn) == EVAL || \ 189 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 190 OP(rn) == PLUS || OP(rn) == MINMOD || \ 191 OP(rn) == KEEPS || \ 192 (REGNODE_TYPE(OP(rn)) == CURLY && ARG1i(rn) > 0) \ 193 ) 194 #define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT) 195 196 #define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF ) 197 198 /* 199 Search for mandatory following text node; for lookahead, the text must 200 follow but for lookbehind (FLAGS(rn) != 0) we skip to the next step. 201 */ 202 #define FIND_NEXT_IMPT(rn) STMT_START { \ 203 while (JUMPABLE(rn)) { \ 204 const OPCODE type = OP(rn); \ 205 if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \ 206 rn = REGNODE_AFTER_opcode(rn,type); \ 207 else if (type == PLUS) \ 208 rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \ 209 else if (type == IFMATCH) \ 210 rn = (FLAGS(rn) == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG1u(rn); \ 211 else rn += NEXT_OFF(rn); \ 212 } \ 213 } STMT_END 214 215 #define SLAB_FIRST(s) (&(s)->states[0]) 216 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) 217 218 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); 219 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); 220 static regmatch_state * S_push_slab(pTHX); 221 222 #define REGCP_OTHER_ELEMS 3 223 #define REGCP_FRAME_ELEMS 1 224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and 225 * are needed for the regexp context stack bookkeeping. */ 226 227 STATIC CHECKPOINT 228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen comma_pDEPTH) 229 { 230 const int retval = PL_savestack_ix; 231 /* Number of bytes about to be stored in the stack */ 232 const SSize_t paren_bytes_to_push = sizeof(*RXp_OFFSp(rex)) * (maxopenparen - parenfloor); 233 /* Number of savestack[] entries to be filled by the paren data */ 234 /* Rounding is performed in case we are few elements short */ 235 const int paren_elems_to_push = (paren_bytes_to_push + sizeof(*PL_savestack) - 1) / sizeof(*PL_savestack); 236 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; 237 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; 238 239 DECLARE_AND_GET_RE_DEBUG_FLAGS; 240 241 PERL_ARGS_ASSERT_REGCPPUSH; 242 243 if (paren_elems_to_push < 0) 244 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i", 245 (int)paren_elems_to_push, (int)maxopenparen, 246 (int)parenfloor); 247 248 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) 249 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf 250 " out of range (%lu-%ld)", 251 total_elems, 252 (unsigned long)maxopenparen, 253 (long)parenfloor); 254 255 DEBUG_BUFFERS_r( 256 if ((int)maxopenparen > (int)parenfloor) 257 Perl_re_exec_indentf( aTHX_ 258 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n", 259 depth, 260 PTR2UV(rex), 261 PTR2UV(RXp_OFFSp(rex)) 262 ); 263 ); 264 265 SSGROW(total_elems + REGCP_FRAME_ELEMS); 266 assert((IV)PL_savestack_max > (IV)(total_elems + REGCP_FRAME_ELEMS)); 267 268 /* memcpy the offs inside the stack - it's faster than for loop */ 269 memcpy(&PL_savestack[PL_savestack_ix], RXp_OFFSp(rex) + parenfloor + 1, paren_bytes_to_push); 270 PL_savestack_ix += paren_elems_to_push; 271 272 DEBUG_BUFFERS_r({ 273 I32 p; 274 for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) { 275 Perl_re_exec_indentf(aTHX_ 276 " \\%" UVuf " %" IVdf " (%" IVdf ") .. %" IVdf " (regcppush)\n", 277 depth, 278 (UV)p, 279 (IV)RXp_OFFSp(rex)[p].start, 280 (IV)RXp_OFFSp(rex)[p].start_tmp, 281 (IV)RXp_OFFSp(rex)[p].end 282 ); 283 } 284 }); 285 286 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 287 SSPUSHINT(maxopenparen); 288 SSPUSHINT(RXp_LASTPAREN(rex)); 289 SSPUSHINT(RXp_LASTCLOSEPAREN(rex)); 290 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ 291 292 293 DEBUG_BUFFERS_r({ 294 Perl_re_exec_indentf(aTHX_ 295 "finished regcppush returning %" IVdf " cur: %" IVdf "\n", 296 depth, retval, PL_savestack_ix); 297 }); 298 299 return retval; 300 } 301 302 /* These are needed since we do not localize EVAL nodes: */ 303 #define REGCP_SET(cp) \ 304 DEBUG_STATE_r( \ 305 Perl_re_exec_indentf( aTHX_ \ 306 "Setting an EVAL scope, savestack=%" IVdf ",\n", \ 307 depth, (IV)PL_savestack_ix \ 308 ) \ 309 ); \ 310 cp = PL_savestack_ix 311 312 #define REGCP_UNWIND(cp) \ 313 DEBUG_STATE_r( \ 314 if (cp != PL_savestack_ix) \ 315 Perl_re_exec_indentf( aTHX_ \ 316 "Clearing an EVAL scope, savestack=%" \ 317 IVdf "..%" IVdf "\n", \ 318 depth, (IV)(cp), (IV)PL_savestack_ix \ 319 ) \ 320 ); \ 321 regcpblow(cp) 322 323 /* set the start and end positions of capture ix */ 324 #define CLOSE_ANY_CAPTURE(rex, ix, s, e) \ 325 RXp_OFFSp(rex)[(ix)].start = (s); \ 326 RXp_OFFSp(rex)[(ix)].end = (e) 327 328 #define CLOSE_CAPTURE(rex, ix, s, e) \ 329 CLOSE_ANY_CAPTURE(rex, ix, s, e); \ 330 if (ix > RXp_LASTPAREN(rex)) \ 331 RXp_LASTPAREN(rex) = (ix); \ 332 RXp_LASTCLOSEPAREN(rex) = (ix); \ 333 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ 334 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " .. %" IVdf " max: %" UVuf "\n", \ 335 depth, \ 336 PTR2UV(rex), \ 337 PTR2UV(RXp_OFFSp(rex)), \ 338 (UV)(ix), \ 339 (IV)RXp_OFFSp(rex)[ix].start, \ 340 (IV)RXp_OFFSp(rex)[ix].end, \ 341 (UV)RXp_LASTPAREN(rex) \ 342 )) 343 344 /* the lp and lcp args match the relevant members of the 345 * regexp structure, but in practice they should all be U16 346 * instead as we have a hard limit of U16_MAX parens. See 347 * line 4003 or so of regcomp.c where we parse OPEN parens 348 * of various types. */ 349 PERL_STATIC_INLINE void 350 S_unwind_paren(pTHX_ regexp *rex, U32 lp, U32 lcp comma_pDEPTH) { 351 PERL_ARGS_ASSERT_UNWIND_PAREN; 352 U32 n; 353 DECLARE_AND_GET_RE_DEBUG_FLAGS; 354 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 355 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf 356 ": invalidate (%" UVuf " .. %" UVuf ") set lcp: %" UVuf "\n", 357 depth, 358 PTR2UV(rex), 359 PTR2UV(RXp_OFFSp(rex)), 360 (UV)(lp), 361 (UV)(RXp_LASTPAREN(rex)), 362 (UV)(lcp) 363 )); 364 for (n = RXp_LASTPAREN(rex); n > lp; n--) { 365 RXp_OFFSp(rex)[n].end = -1; 366 } 367 RXp_LASTPAREN(rex) = n; 368 RXp_LASTCLOSEPAREN(rex) = lcp; 369 } 370 #define UNWIND_PAREN(lp,lcp) unwind_paren(rex,lp,lcp) 371 372 PERL_STATIC_INLINE void 373 S_capture_clear(pTHX_ regexp *rex, U16 from_ix, U16 to_ix, const char *str comma_pDEPTH) { 374 PERL_ARGS_ASSERT_CAPTURE_CLEAR; 375 PERL_UNUSED_ARG(str); /* only used for debugging */ 376 U16 my_ix; 377 DECLARE_AND_GET_RE_DEBUG_FLAGS; 378 for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) { 379 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 380 "CAPTURE_CLEAR %s \\%" IVdf ": " 381 "%" IVdf "(%" IVdf ") .. %" IVdf 382 " => " 383 "%" IVdf "(%" IVdf ") .. %" IVdf 384 "\n", 385 depth, str, (IV)my_ix, 386 (IV)RXp_OFFSp(rex)[my_ix].start, 387 (IV)RXp_OFFSp(rex)[my_ix].start_tmp, 388 (IV)RXp_OFFSp(rex)[my_ix].end, 389 (IV)-1, (IV)-1, (IV)-1)); 390 RXp_OFFSp(rex)[my_ix].start = -1; 391 RXp_OFFSp(rex)[my_ix].start_tmp = -1; 392 RXp_OFFSp(rex)[my_ix].end = -1; 393 } 394 } 395 396 #define CAPTURE_CLEAR(from_ix, to_ix, str) \ 397 if (from_ix) capture_clear(rex,from_ix, to_ix, str) 398 399 STATIC void 400 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p comma_pDEPTH) 401 { 402 UV i; 403 U32 paren; 404 DECLARE_AND_GET_RE_DEBUG_FLAGS; 405 406 PERL_ARGS_ASSERT_REGCPPOP; 407 408 409 DEBUG_BUFFERS_r({ 410 Perl_re_exec_indentf(aTHX_ 411 "starting regcppop at %" IVdf "\n", 412 depth, PL_savestack_ix); 413 }); 414 415 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 416 i = SSPOPUV; 417 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 418 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ 419 RXp_LASTCLOSEPAREN(rex) = SSPOPINT; 420 RXp_LASTPAREN(rex) = SSPOPINT; 421 *maxopenparen_p = SSPOPINT; 422 423 i -= REGCP_OTHER_ELEMS; 424 /* Now restore the parentheses context. */ 425 DEBUG_BUFFERS_r( 426 if (i || RXp_LASTPAREN(rex) + 1 <= rex->nparens) 427 Perl_re_exec_indentf( aTHX_ 428 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n", 429 depth, 430 PTR2UV(rex), 431 PTR2UV(RXp_OFFSp(rex)) 432 ); 433 ); 434 /* substract remaining elements from the stack */ 435 PL_savestack_ix -= i; 436 437 /* static assert that offs struc size is not less than stack elem size */ 438 STATIC_ASSERT_STMT(sizeof(*RXp_OFFSp(rex)) >= sizeof(*PL_savestack)); 439 440 /* calculate actual number of offs/capture groups stored */ 441 /* by doing integer division (leaving potential alignment aside) */ 442 i = (i * sizeof(*PL_savestack)) / sizeof(*RXp_OFFSp(rex)); 443 444 /* calculate paren starting point */ 445 /* i is our number of entries which we are subtracting from *maxopenparen_p */ 446 /* and we are storing + 1 this to get the beginning */ 447 paren = *maxopenparen_p - i + 1; 448 449 /* restore them */ 450 memcpy(RXp_OFFSp(rex) + paren, &PL_savestack[PL_savestack_ix], i * sizeof(*RXp_OFFSp(rex))); 451 452 DEBUG_BUFFERS_r( 453 for (; paren <= *maxopenparen_p; ++paren) { 454 Perl_re_exec_indentf(aTHX_ 455 " \\%" UVuf " %" IVdf "(%" IVdf ") .. %" IVdf " %s (regcppop)\n", 456 depth, 457 (UV)paren, 458 (IV)RXp_OFFSp(rex)[paren].start, 459 (IV)RXp_OFFSp(rex)[paren].start_tmp, 460 (IV)RXp_OFFSp(rex)[paren].end, 461 (paren > RXp_LASTPAREN(rex) ? "(skipped)" : "")); 462 } 463 ); 464 #if 1 465 /* It would seem that the similar code in regtry() 466 * already takes care of this, and in fact it is in 467 * a better location to since this code can #if 0-ed out 468 * but the code in regtry() is needed or otherwise tests 469 * requiring null fields (pat.t#187 and split.t#{13,14} 470 * (as of patchlevel 7877) will fail. Then again, 471 * this code seems to be necessary or otherwise 472 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 473 * --jhi updated by dapm */ 474 for (i = RXp_LASTPAREN(rex) + 1; i <= rex->nparens; i++) { 475 if (i > *maxopenparen_p) { 476 RXp_OFFSp(rex)[i].start = -1; 477 } 478 RXp_OFFSp(rex)[i].end = -1; 479 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_ 480 " \\%" UVuf ": %s ..-1 undeffing (regcppop)\n", 481 depth, 482 (UV)i, 483 (i > *maxopenparen_p) ? "-1" : " " 484 )); 485 } 486 #endif 487 DEBUG_BUFFERS_r({ 488 Perl_re_exec_indentf(aTHX_ 489 "finished regcppop at %" IVdf "\n", 490 depth, PL_savestack_ix); 491 }); 492 } 493 494 /* restore the parens and associated vars at savestack position ix, 495 * but without popping the stack */ 496 497 STATIC void 498 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p comma_pDEPTH) 499 { 500 I32 tmpix = PL_savestack_ix; 501 PERL_ARGS_ASSERT_REGCP_RESTORE; 502 503 PL_savestack_ix = ix; 504 regcppop(rex, maxopenparen_p); 505 PL_savestack_ix = tmpix; 506 } 507 508 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 509 510 STATIC bool 511 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) 512 { 513 /* Returns a boolean as to whether or not 'character' is a member of the 514 * Posix character class given by 'classnum' that should be equivalent to a 515 * value in the typedef 'char_class_number_'. 516 * 517 * Ideally this could be replaced by a just an array of function pointers 518 * to the C library functions that implement the macros this calls. 519 * However, to compile, the precise function signatures are required, and 520 * these may vary from platform to platform. To avoid having to figure 521 * out what those all are on each platform, I (khw) am using this method, 522 * which adds an extra layer of function call overhead (unless the C 523 * optimizer strips it away). But we don't particularly care about 524 * performance with locales anyway. */ 525 526 if (IN_UTF8_CTYPE_LOCALE) { 527 return cBOOL(generic_isCC_(character, classnum)); 528 } 529 530 switch ((char_class_number_) classnum) { 531 case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character); 532 case CC_ENUM_ALPHA_: return isU8_ALPHA_LC(character); 533 case CC_ENUM_ASCII_: return isU8_ASCII_LC(character); 534 case CC_ENUM_BLANK_: return isU8_BLANK_LC(character); 535 case CC_ENUM_CASED_: return isU8_CASED_LC(character); 536 case CC_ENUM_CNTRL_: return isU8_CNTRL_LC(character); 537 case CC_ENUM_DIGIT_: return isU8_DIGIT_LC(character); 538 case CC_ENUM_GRAPH_: return isU8_GRAPH_LC(character); 539 case CC_ENUM_LOWER_: return isU8_LOWER_LC(character); 540 case CC_ENUM_PRINT_: return isU8_PRINT_LC(character); 541 case CC_ENUM_PUNCT_: return isU8_PUNCT_LC(character); 542 case CC_ENUM_SPACE_: return isU8_SPACE_LC(character); 543 case CC_ENUM_UPPER_: return isU8_UPPER_LC(character); 544 case CC_ENUM_WORDCHAR_: return isU8_WORDCHAR_LC(character); 545 case CC_ENUM_XDIGIT_: return isU8_XDIGIT_LC(character); 546 default: /* VERTSPACE should never occur in locales */ 547 break; 548 } 549 550 Perl_croak(aTHX_ 551 "panic: isFOO_lc() has an unexpected character class '%d'", 552 classnum); 553 554 NOT_REACHED; /* NOTREACHED */ 555 return FALSE; 556 } 557 558 PERL_STATIC_INLINE I32 559 S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len) 560 { 561 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be 562 * folded. Works on all folds representable without UTF-8, except for 563 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it 564 * check that the strings each have at least 'len' characters. 565 * 566 * There is almost an identical API function where s2 need not be folded: 567 * Perl_foldEQ_latin1() */ 568 569 const U8 *a = (const U8 *)s1; 570 const U8 *b = (const U8 *)s2; 571 572 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED; 573 574 assert(len >= 0); 575 576 while (len--) { 577 assert(! isUPPER_L1(*b)); 578 if (toLOWER_L1(*a) != *b) { 579 return 0; 580 } 581 a++, b++; 582 } 583 return 1; 584 } 585 586 STATIC bool 587 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e) 588 { 589 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded 590 * 'character' is a member of the Posix character class given by 'classnum' 591 * that should be equivalent to a value in the typedef 592 * 'char_class_number_'. 593 * 594 * This just calls isFOO_lc on the code point for the character if it is in 595 * the range 0-255. Outside that range, all characters use Unicode 596 * rules, ignoring any locale. So use the Unicode function if this class 597 * requires an inversion list, and use the Unicode macro otherwise. */ 598 599 600 PERL_ARGS_ASSERT_ISFOO_UTF8_LC; 601 602 if (UTF8_IS_INVARIANT(*character)) { 603 return isFOO_lc(classnum, *character); 604 } 605 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { 606 return isFOO_lc(classnum, 607 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1))); 608 } 609 610 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e); 611 612 switch ((char_class_number_) classnum) { 613 case CC_ENUM_SPACE_: return is_XPERLSPACE_high(character); 614 case CC_ENUM_BLANK_: return is_HORIZWS_high(character); 615 case CC_ENUM_XDIGIT_: return is_XDIGIT_high(character); 616 case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character); 617 default: 618 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], 619 utf8_to_uvchr_buf(character, e, NULL)); 620 } 621 NOT_REACHED; /* NOTREACHED */ 622 } 623 624 STATIC U8 * 625 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte) 626 { 627 /* Returns the position of the first byte in the sequence between 's' and 628 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found. 629 * */ 630 631 PERL_ARGS_ASSERT_FIND_SPAN_END; 632 633 assert(send >= s); 634 635 if ((STRLEN) (send - s) >= PERL_WORDSIZE 636 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 637 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 638 { 639 PERL_UINTMAX_T span_word; 640 641 /* Process per-byte until reach word boundary. XXX This loop could be 642 * eliminated if we knew that this platform had fast unaligned reads */ 643 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 644 if (*s != span_byte) { 645 return s; 646 } 647 s++; 648 } 649 650 /* Create a word filled with the bytes we are spanning */ 651 span_word = PERL_COUNT_MULTIPLIER * span_byte; 652 653 /* Process per-word as long as we have at least a full word left */ 654 do { 655 656 /* Keep going if the whole word is composed of 'span_byte's */ 657 if ((* (PERL_UINTMAX_T *) s) == span_word) { 658 s += PERL_WORDSIZE; 659 continue; 660 } 661 662 /* Here, at least one byte in the word isn't 'span_byte'. */ 663 664 #ifdef EBCDIC 665 666 break; 667 668 #else 669 670 /* This xor leaves 1 bits only in those non-matching bytes */ 671 span_word ^= * (PERL_UINTMAX_T *) s; 672 673 /* Make sure the upper bit of each non-matching byte is set. This 674 * makes each such byte look like an ASCII platform variant byte */ 675 span_word |= span_word << 1; 676 span_word |= span_word << 2; 677 span_word |= span_word << 4; 678 679 /* That reduces the problem to what this function solves */ 680 return s + variant_byte_number(span_word); 681 682 #endif 683 684 } while (s + PERL_WORDSIZE <= send); 685 } 686 687 /* Process the straggler bytes beyond the final word boundary */ 688 while (s < send) { 689 if (*s != span_byte) { 690 return s; 691 } 692 s++; 693 } 694 695 return s; 696 } 697 698 STATIC U8 * 699 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask) 700 { 701 /* Returns the position of the first byte in the sequence between 's' 702 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte'; 703 * returns 'send' if none found. It uses word-level operations instead of 704 * byte to speed up the process */ 705 706 PERL_ARGS_ASSERT_FIND_NEXT_MASKED; 707 708 assert(send >= s); 709 assert((byte & mask) == byte); 710 711 #ifndef EBCDIC 712 713 if ((STRLEN) (send - s) >= PERL_WORDSIZE 714 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 715 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 716 { 717 PERL_UINTMAX_T word, mask_word; 718 719 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 720 if (((*s) & mask) == byte) { 721 return s; 722 } 723 s++; 724 } 725 726 word = PERL_COUNT_MULTIPLIER * byte; 727 mask_word = PERL_COUNT_MULTIPLIER * mask; 728 729 do { 730 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 731 732 /* If 'masked' contains bytes with the bit pattern of 'byte' within 733 * it, xoring with 'word' will leave each of the 8 bits in such 734 * bytes be 0, and no byte containing any other bit pattern will be 735 * 0. */ 736 masked ^= word; 737 738 /* This causes the most significant bit to be set to 1 for any 739 * bytes in the word that aren't completely 0 */ 740 masked |= masked << 1; 741 masked |= masked << 2; 742 masked |= masked << 4; 743 744 /* The msbits are the same as what marks a byte as variant, so we 745 * can use this mask. If all msbits are 1, the word doesn't 746 * contain 'byte' */ 747 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) { 748 s += PERL_WORDSIZE; 749 continue; 750 } 751 752 /* Here, the msbit of bytes in the word that aren't 'byte' are 1, 753 * and any that are, are 0. Complement and re-AND to swap that */ 754 masked = ~ masked; 755 masked &= PERL_VARIANTS_WORD_MASK; 756 757 /* This reduces the problem to that solved by this function */ 758 s += variant_byte_number(masked); 759 return s; 760 761 } while (s + PERL_WORDSIZE <= send); 762 } 763 764 #endif 765 766 while (s < send) { 767 if (((*s) & mask) == byte) { 768 return s; 769 } 770 s++; 771 } 772 773 return s; 774 } 775 776 STATIC U8 * 777 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) 778 { 779 /* Returns the position of the first byte in the sequence between 's' and 780 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'. 781 * 'span_byte' should have been ANDed with 'mask' in the call of this 782 * function. Returns 'send' if none found. Works like find_span_end(), 783 * except for the AND */ 784 785 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK; 786 787 assert(send >= s); 788 assert((span_byte & mask) == span_byte); 789 790 if ((STRLEN) (send - s) >= PERL_WORDSIZE 791 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 792 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 793 { 794 PERL_UINTMAX_T span_word, mask_word; 795 796 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 797 if (((*s) & mask) != span_byte) { 798 return s; 799 } 800 s++; 801 } 802 803 span_word = PERL_COUNT_MULTIPLIER * span_byte; 804 mask_word = PERL_COUNT_MULTIPLIER * mask; 805 806 do { 807 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 808 809 if (masked == span_word) { 810 s += PERL_WORDSIZE; 811 continue; 812 } 813 814 #ifdef EBCDIC 815 816 break; 817 818 #else 819 820 masked ^= span_word; 821 masked |= masked << 1; 822 masked |= masked << 2; 823 masked |= masked << 4; 824 return s + variant_byte_number(masked); 825 826 #endif 827 828 } while (s + PERL_WORDSIZE <= send); 829 } 830 831 while (s < send) { 832 if (((*s) & mask) != span_byte) { 833 return s; 834 } 835 s++; 836 } 837 838 return s; 839 } 840 841 /* 842 * pregexec and friends 843 */ 844 845 #ifndef PERL_IN_XSUB_RE 846 /* 847 - pregexec - match a regexp against a string 848 */ 849 I32 850 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, 851 char *strbeg, SSize_t minend, SV *screamer, U32 nosave) 852 /* stringarg: the point in the string at which to begin matching */ 853 /* strend: pointer to null at end of string */ 854 /* strbeg: real beginning of string */ 855 /* minend: end of match must be >= minend bytes after stringarg. */ 856 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string 857 * itself is accessed via the pointers above */ 858 /* nosave: For optimizations. */ 859 { 860 PERL_ARGS_ASSERT_PREGEXEC; 861 862 return 863 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 864 nosave ? 0 : REXEC_COPY_STR); 865 } 866 #endif 867 868 869 870 /* re_intuit_start(): 871 * 872 * Based on some optimiser hints, try to find the earliest position in the 873 * string where the regex could match. 874 * 875 * rx: the regex to match against 876 * sv: the SV being matched: only used for utf8 flag; the string 877 * itself is accessed via the pointers below. Note that on 878 * something like an overloaded SV, SvPOK(sv) may be false 879 * and the string pointers may point to something unrelated to 880 * the SV itself. 881 * strbeg: real beginning of string 882 * strpos: the point in the string at which to begin matching 883 * strend: pointer to the byte following the last char of the string 884 * flags currently unused; set to 0 885 * data: currently unused; set to NULL 886 * 887 * The basic idea of re_intuit_start() is to use some known information 888 * about the pattern, namely: 889 * 890 * a) the longest known anchored substring (i.e. one that's at a 891 * constant offset from the beginning of the pattern; but not 892 * necessarily at a fixed offset from the beginning of the 893 * string); 894 * b) the longest floating substring (i.e. one that's not at a constant 895 * offset from the beginning of the pattern); 896 * c) Whether the pattern is anchored to the string; either 897 * an absolute anchor: /^../, or anchored to \n: /^.../m, 898 * or anchored to pos(): /\G/; 899 * d) A start class: a real or synthetic character class which 900 * represents which characters are legal at the start of the pattern; 901 * 902 * to either quickly reject the match, or to find the earliest position 903 * within the string at which the pattern might match, thus avoiding 904 * running the full NFA engine at those earlier locations, only to 905 * eventually fail and retry further along. 906 * 907 * Returns NULL if the pattern can't match, or returns the address within 908 * the string which is the earliest place the match could occur. 909 * 910 * The longest of the anchored and floating substrings is called 'check' 911 * and is checked first. The other is called 'other' and is checked 912 * second. The 'other' substring may not be present. For example, 913 * 914 * /(abc|xyz)ABC\d{0,3}DEFG/ 915 * 916 * will have 917 * 918 * check substr (float) = "DEFG", offset 6..9 chars 919 * other substr (anchored) = "ABC", offset 3..3 chars 920 * stclass = [ax] 921 * 922 * Be aware that during the course of this function, sometimes 'anchored' 923 * refers to a substring being anchored relative to the start of the 924 * pattern, and sometimes to the pattern itself being anchored relative to 925 * the string. For example: 926 * 927 * /\dabc/: "abc" is anchored to the pattern; 928 * /^\dabc/: "abc" is anchored to the pattern and the string; 929 * /\d+abc/: "abc" is anchored to neither the pattern nor the string; 930 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, 931 * but the pattern is anchored to the string. 932 */ 933 934 char * 935 Perl_re_intuit_start(pTHX_ 936 REGEXP * const rx, 937 SV *sv, 938 const char * const strbeg, 939 char *strpos, 940 char *strend, 941 const U32 flags, 942 re_scream_pos_data *data) 943 { 944 struct regexp *const prog = ReANY(rx); 945 SSize_t start_shift = prog->check_offset_min; 946 /* Should be nonnegative! */ 947 SSize_t end_shift = 0; 948 /* current lowest pos in string where the regex can start matching */ 949 char *rx_origin = strpos; 950 SV *check; 951 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ 952 U8 other_ix = 1 - prog->substrs->check_ix; 953 bool ml_anch = 0; 954 char *other_last = strpos;/* latest pos 'other' substr already checked to */ 955 char *check_at = NULL; /* check substr found at this pos */ 956 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; 957 RXi_GET_DECL(prog,progi); 958 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ 959 regmatch_info *const reginfo = ®info_buf; 960 DECLARE_AND_GET_RE_DEBUG_FLAGS; 961 962 PERL_ARGS_ASSERT_RE_INTUIT_START; 963 PERL_UNUSED_ARG(flags); 964 PERL_UNUSED_ARG(data); 965 966 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 967 "Intuit: trying to determine minimum start position...\n")); 968 969 /* for now, assume that all substr offsets are positive. If at some point 970 * in the future someone wants to do clever things with lookbehind and 971 * -ve offsets, they'll need to fix up any code in this function 972 * which uses these offsets. See the thread beginning 973 * <20140113145929.GF27210@iabyn.com> 974 */ 975 assert(prog->substrs->data[0].min_offset >= 0); 976 assert(prog->substrs->data[0].max_offset >= 0); 977 assert(prog->substrs->data[1].min_offset >= 0); 978 assert(prog->substrs->data[1].max_offset >= 0); 979 assert(prog->substrs->data[2].min_offset >= 0); 980 assert(prog->substrs->data[2].max_offset >= 0); 981 982 /* for now, assume that if both present, that the floating substring 983 * doesn't start before the anchored substring. 984 * If you break this assumption (e.g. doing better optimisations 985 * with lookahead/behind), then you'll need to audit the code in this 986 * function carefully first 987 */ 988 assert( 989 ! ( (prog->anchored_utf8 || prog->anchored_substr) 990 && (prog->float_utf8 || prog->float_substr)) 991 || (prog->float_min_offset >= prog->anchored_offset)); 992 993 /* byte rather than char calculation for efficiency. It fails 994 * to quickly reject some cases that can't match, but will reject 995 * them later after doing full char arithmetic */ 996 if (prog->minlen > strend - strpos) { 997 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 998 " String too short...\n")); 999 goto fail; 1000 } 1001 1002 RXp_MATCH_UTF8_set(prog, utf8_target); 1003 reginfo->is_utf8_target = cBOOL(utf8_target); 1004 reginfo->info_aux = NULL; 1005 reginfo->strbeg = strbeg; 1006 reginfo->strend = strend; 1007 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 1008 reginfo->intuit = 1; 1009 /* not actually used within intuit, but zero for safety anyway */ 1010 reginfo->poscache_maxiter = 0; 1011 1012 if (utf8_target) { 1013 if ((!prog->anchored_utf8 && prog->anchored_substr) 1014 || (!prog->float_utf8 && prog->float_substr)) 1015 to_utf8_substr(prog); 1016 check = prog->check_utf8; 1017 } else { 1018 if (!prog->check_substr && prog->check_utf8) { 1019 if (! to_byte_substr(prog)) { 1020 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 1021 } 1022 } 1023 check = prog->check_substr; 1024 } 1025 1026 /* dump the various substring data */ 1027 DEBUG_OPTIMISE_MORE_r({ 1028 int i; 1029 for (i=0; i<=2; i++) { 1030 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr 1031 : prog->substrs->data[i].substr); 1032 if (!sv) 1033 continue; 1034 1035 Perl_re_printf( aTHX_ 1036 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf 1037 " useful=%" IVdf " utf8=%d [%s]\n", 1038 i, 1039 (IV)prog->substrs->data[i].min_offset, 1040 (IV)prog->substrs->data[i].max_offset, 1041 (IV)prog->substrs->data[i].end_shift, 1042 BmUSEFUL(sv), 1043 utf8_target ? 1 : 0, 1044 SvPEEK(sv)); 1045 } 1046 }); 1047 1048 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ 1049 1050 /* ml_anch: check after \n? 1051 * 1052 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning 1053 * with /.*.../, these flags will have been added by the 1054 * compiler: 1055 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL 1056 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL 1057 */ 1058 ml_anch = (prog->intflags & PREGf_ANCH_MBOL) 1059 && !(prog->intflags & PREGf_IMPLICIT); 1060 1061 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { 1062 /* we are only allowed to match at BOS or \G */ 1063 1064 /* trivially reject if there's a BOS anchor and we're not at BOS. 1065 * 1066 * Note that we don't try to do a similar quick reject for 1067 * \G, since generally the caller will have calculated strpos 1068 * based on pos() and gofs, so the string is already correctly 1069 * anchored by definition; and handling the exceptions would 1070 * be too fiddly (e.g. REXEC_IGNOREPOS). 1071 */ 1072 if ( strpos != strbeg 1073 && (prog->intflags & PREGf_ANCH_SBOL)) 1074 { 1075 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1076 " Not at start...\n")); 1077 goto fail; 1078 } 1079 1080 /* in the presence of an anchor, the anchored (relative to the 1081 * start of the regex) substr must also be anchored relative 1082 * to strpos. So quickly reject if substr isn't found there. 1083 * This works for \G too, because the caller will already have 1084 * subtracted gofs from pos, and gofs is the offset from the 1085 * \G to the start of the regex. For example, in /.abc\Gdef/, 1086 * where substr="abcdef", pos()=3, gofs=4, offset_min=1: 1087 * caller will have set strpos=pos()-4; we look for the substr 1088 * at position pos()-4+1, which lines up with the "a" */ 1089 1090 if (prog->check_offset_min == prog->check_offset_max) { 1091 /* Substring at constant offset from beg-of-str... */ 1092 SSize_t slen = SvCUR(check); 1093 char *s = HOP3c(strpos, prog->check_offset_min, strend); 1094 1095 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1096 " Looking for check substr at fixed offset %" IVdf "...\n", 1097 (IV)prog->check_offset_min)); 1098 1099 if (SvTAIL(check)) { 1100 /* In this case, the regex is anchored at the end too. 1101 * Unless it's a multiline match, the lengths must match 1102 * exactly, give or take a \n. NB: slen >= 1 since 1103 * the last char of check is \n */ 1104 if (!multiline 1105 && ( strend - s > slen 1106 || strend - s < slen - 1 1107 || (strend - s == slen && strend[-1] != '\n'))) 1108 { 1109 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1110 " String too long...\n")); 1111 goto fail_finish; 1112 } 1113 /* Now should match s[0..slen-2] */ 1114 slen--; 1115 } 1116 if (slen && (strend - s < slen 1117 || *SvPVX_const(check) != *s 1118 || (slen > 1 && (memNE(SvPVX_const(check), s, slen))))) 1119 { 1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1121 " String not equal...\n")); 1122 goto fail_finish; 1123 } 1124 1125 check_at = s; 1126 goto success_at_start; 1127 } 1128 } 1129 } 1130 1131 end_shift = prog->check_end_shift; 1132 1133 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ 1134 if (end_shift < 0) 1135 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ", 1136 (IV)end_shift, RX_PRECOMP(rx)); 1137 #endif 1138 1139 restart: 1140 1141 /* This is the (re)entry point of the main loop in this function. 1142 * The goal of this loop is to: 1143 * 1) find the "check" substring in the region rx_origin..strend 1144 * (adjusted by start_shift / end_shift). If not found, reject 1145 * immediately. 1146 * 2) If it exists, look for the "other" substr too if defined; for 1147 * example, if the check substr maps to the anchored substr, then 1148 * check the floating substr, and vice-versa. If not found, go 1149 * back to (1) with rx_origin suitably incremented. 1150 * 3) If we find an rx_origin position that doesn't contradict 1151 * either of the substrings, then check the possible additional 1152 * constraints on rx_origin of /^.../m or a known start class. 1153 * If these fail, then depending on which constraints fail, jump 1154 * back to here, or to various other re-entry points further along 1155 * that skip some of the first steps. 1156 * 4) If we pass all those tests, update the BmUSEFUL() count on the 1157 * substring. If the start position was determined to be at the 1158 * beginning of the string - so, not rejected, but not optimised, 1159 * since we have to run regmatch from position 0 - decrement the 1160 * BmUSEFUL() count. Otherwise increment it. 1161 */ 1162 1163 1164 /* first, look for the 'check' substring */ 1165 1166 { 1167 U8* start_point; 1168 U8* end_point; 1169 1170 DEBUG_OPTIMISE_MORE_r({ 1171 Perl_re_printf( aTHX_ 1172 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf 1173 " Start shift: %" IVdf " End shift %" IVdf 1174 " Real end Shift: %" IVdf "\n", 1175 (IV)(rx_origin - strbeg), 1176 (IV)prog->check_offset_min, 1177 (IV)start_shift, 1178 (IV)end_shift, 1179 (IV)prog->check_end_shift); 1180 }); 1181 1182 end_point = HOPBACK3(strend, end_shift, rx_origin); 1183 if (!end_point) 1184 goto fail_finish; 1185 start_point = HOPMAYBE3(rx_origin, start_shift, end_point); 1186 if (!start_point) 1187 goto fail_finish; 1188 1189 1190 /* If the regex is absolutely anchored to either the start of the 1191 * string (SBOL) or to pos() (ANCH_GPOS), then 1192 * check_offset_max represents an upper bound on the string where 1193 * the substr could start. For the ANCH_GPOS case, we assume that 1194 * the caller of intuit will have already set strpos to 1195 * pos()-gofs, so in this case strpos + offset_max will still be 1196 * an upper bound on the substr. 1197 */ 1198 if (!ml_anch 1199 && prog->intflags & PREGf_ANCH 1200 && prog->check_offset_max != SSize_t_MAX) 1201 { 1202 SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check)); 1203 const char * const anchor = 1204 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); 1205 SSize_t targ_len = (char*)end_point - anchor; 1206 1207 if (check_len > targ_len) { 1208 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1209 "Target string too short to match required substring...\n")); 1210 goto fail_finish; 1211 } 1212 1213 /* do a bytes rather than chars comparison. It's conservative; 1214 * so it skips doing the HOP if the result can't possibly end 1215 * up earlier than the old value of end_point. 1216 */ 1217 assert(anchor + check_len <= (char *)end_point); 1218 if (prog->check_offset_max + check_len < targ_len) { 1219 end_point = HOP3lim((U8*)anchor, 1220 prog->check_offset_max, 1221 end_point - check_len 1222 ) 1223 + check_len; 1224 if (end_point < start_point) 1225 goto fail_finish; 1226 } 1227 } 1228 1229 check_at = fbm_instr( start_point, end_point, 1230 check, multiline ? FBMrf_MULTILINE : 0); 1231 1232 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1233 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1234 (IV)((char*)start_point - strbeg), 1235 (IV)((char*)end_point - strbeg), 1236 (IV)(check_at ? check_at - strbeg : -1) 1237 )); 1238 1239 /* Update the count-of-usability, remove useless subpatterns, 1240 unshift s. */ 1241 1242 DEBUG_EXECUTE_r({ 1243 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1244 SvPVX_const(check), RE_SV_DUMPLEN(check), 30); 1245 Perl_re_printf( aTHX_ " %s %s substr %s%s%s", 1246 (check_at ? "Found" : "Did not find"), 1247 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) 1248 ? "anchored" : "floating"), 1249 quoted, 1250 RE_SV_TAIL(check), 1251 (check_at ? " at offset " : "...\n") ); 1252 }); 1253 1254 if (!check_at) 1255 goto fail_finish; 1256 /* set rx_origin to the minimum position where the regex could start 1257 * matching, given the constraint of the just-matched check substring. 1258 * But don't set it lower than previously. 1259 */ 1260 1261 if (check_at - rx_origin > prog->check_offset_max) 1262 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); 1263 /* Finish the diagnostic message */ 1264 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1265 "%ld (rx_origin now %" IVdf ")...\n", 1266 (long)(check_at - strbeg), 1267 (IV)(rx_origin - strbeg) 1268 )); 1269 } 1270 1271 1272 /* now look for the 'other' substring if defined */ 1273 1274 if (prog->substrs->data[other_ix].utf8_substr 1275 || prog->substrs->data[other_ix].substr) 1276 { 1277 /* Take into account the "other" substring. */ 1278 char *last, *last1; 1279 char *s; 1280 SV* must; 1281 struct reg_substr_datum *other; 1282 1283 do_other_substr: 1284 other = &prog->substrs->data[other_ix]; 1285 if (!utf8_target && !other->substr) { 1286 if (!to_byte_substr(prog)) { 1287 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 1288 } 1289 } 1290 1291 /* if "other" is anchored: 1292 * we've previously found a floating substr starting at check_at. 1293 * This means that the regex origin must lie somewhere 1294 * between min (rx_origin): HOP3(check_at, -check_offset_max) 1295 * and max: HOP3(check_at, -check_offset_min) 1296 * (except that min will be >= strpos) 1297 * So the fixed substr must lie somewhere between 1298 * HOP3(min, anchored_offset) 1299 * HOP3(max, anchored_offset) + SvCUR(substr) 1300 */ 1301 1302 /* if "other" is floating 1303 * Calculate last1, the absolute latest point where the 1304 * floating substr could start in the string, ignoring any 1305 * constraints from the earlier fixed match. It is calculated 1306 * as follows: 1307 * 1308 * strend - prog->minlen (in chars) is the absolute latest 1309 * position within the string where the origin of the regex 1310 * could appear. The latest start point for the floating 1311 * substr is float_min_offset(*) on from the start of the 1312 * regex. last1 simply combines thee two offsets. 1313 * 1314 * (*) You might think the latest start point should be 1315 * float_max_offset from the regex origin, and technically 1316 * you'd be correct. However, consider 1317 * /a\d{2,4}bcd\w/ 1318 * Here, float min, max are 3,5 and minlen is 7. 1319 * This can match either 1320 * /a\d\dbcd\w/ 1321 * /a\d\d\dbcd\w/ 1322 * /a\d\d\d\dbcd\w/ 1323 * In the first case, the regex matches minlen chars; in the 1324 * second, minlen+1, in the third, minlen+2. 1325 * In the first case, the floating offset is 3 (which equals 1326 * float_min), in the second, 4, and in the third, 5 (which 1327 * equals float_max). In all cases, the floating string bcd 1328 * can never start more than 4 chars from the end of the 1329 * string, which equals minlen - float_min. As the substring 1330 * starts to match more than float_min from the start of the 1331 * regex, it makes the regex match more than minlen chars, 1332 * and the two cancel each other out. So we can always use 1333 * float_min - minlen, rather than float_max - minlen for the 1334 * latest position in the string. 1335 * 1336 * Note that -minlen + float_min_offset is equivalent (AFAIKT) 1337 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift 1338 */ 1339 1340 assert(prog->minlen >= other->min_offset); 1341 last1 = HOP3c(strend, 1342 other->min_offset - prog->minlen, strbeg); 1343 1344 if (other_ix) {/* i.e. if (other-is-float) */ 1345 /* last is the latest point where the floating substr could 1346 * start, *given* any constraints from the earlier fixed 1347 * match. This constraint is that the floating string starts 1348 * <= float_max_offset chars from the regex origin (rx_origin). 1349 * If this value is less than last1, use it instead. 1350 */ 1351 assert(rx_origin <= last1); 1352 last = 1353 /* this condition handles the offset==infinity case, and 1354 * is a short-cut otherwise. Although it's comparing a 1355 * byte offset to a char length, it does so in a safe way, 1356 * since 1 char always occupies 1 or more bytes, 1357 * so if a string range is (last1 - rx_origin) bytes, 1358 * it will be less than or equal to (last1 - rx_origin) 1359 * chars; meaning it errs towards doing the accurate HOP3 1360 * rather than just using last1 as a short-cut */ 1361 (last1 - rx_origin) < other->max_offset 1362 ? last1 1363 : (char*)HOP3lim(rx_origin, other->max_offset, last1); 1364 } 1365 else { 1366 assert(strpos + start_shift <= check_at); 1367 last = HOP4c(check_at, other->min_offset - start_shift, 1368 strbeg, strend); 1369 } 1370 1371 s = HOP3c(rx_origin, other->min_offset, strend); 1372 if (s < other_last) /* These positions already checked */ 1373 s = other_last; 1374 1375 must = utf8_target ? other->utf8_substr : other->substr; 1376 assert(SvPOK(must)); 1377 { 1378 char *from = s; 1379 char *to = last + SvCUR(must) - (SvTAIL(must)!=0); 1380 1381 if (to > strend) 1382 to = strend; 1383 if (from > to) { 1384 s = NULL; 1385 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1386 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n", 1387 (IV)(from - strbeg), 1388 (IV)(to - strbeg) 1389 )); 1390 } 1391 else { 1392 s = fbm_instr( 1393 (unsigned char*)from, 1394 (unsigned char*)to, 1395 must, 1396 multiline ? FBMrf_MULTILINE : 0 1397 ); 1398 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1399 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1400 (IV)(from - strbeg), 1401 (IV)(to - strbeg), 1402 (IV)(s ? s - strbeg : -1) 1403 )); 1404 } 1405 } 1406 1407 DEBUG_EXECUTE_r({ 1408 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1409 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 1410 Perl_re_printf( aTHX_ " %s %s substr %s%s", 1411 s ? "Found" : "Contradicts", 1412 other_ix ? "floating" : "anchored", 1413 quoted, RE_SV_TAIL(must)); 1414 }); 1415 1416 1417 if (!s) { 1418 /* last1 is latest possible substr location. If we didn't 1419 * find it before there, we never will */ 1420 if (last >= last1) { 1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1422 "; giving up...\n")); 1423 goto fail_finish; 1424 } 1425 1426 /* try to find the check substr again at a later 1427 * position. Maybe next time we'll find the "other" substr 1428 * in range too */ 1429 other_last = HOP3c(last, 1, strend) /* highest failure */; 1430 rx_origin = 1431 other_ix /* i.e. if other-is-float */ 1432 ? HOP3c(rx_origin, 1, strend) 1433 : HOP4c(last, 1 - other->min_offset, strbeg, strend); 1434 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1435 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n", 1436 (other_ix ? "floating" : "anchored"), 1437 (long)(HOP3c(check_at, 1, strend) - strbeg), 1438 (IV)(rx_origin - strbeg) 1439 )); 1440 goto restart; 1441 } 1442 else { 1443 if (other_ix) { /* if (other-is-float) */ 1444 /* other_last is set to s, not s+1, since its possible for 1445 * a floating substr to fail first time, then succeed 1446 * second time at the same floating position; e.g.: 1447 * "-AB--AABZ" =~ /\wAB\d*Z/ 1448 * The first time round, anchored and float match at 1449 * "-(AB)--AAB(Z)" then fail on the initial \w character 1450 * class. Second time round, they match at "-AB--A(AB)(Z)". 1451 */ 1452 other_last = s; 1453 } 1454 else { 1455 rx_origin = HOP3c(s, -other->min_offset, strbeg); 1456 other_last = HOP3c(s, 1, strend); 1457 } 1458 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1459 " at offset %ld (rx_origin now %" IVdf ")...\n", 1460 (long)(s - strbeg), 1461 (IV)(rx_origin - strbeg) 1462 )); 1463 1464 } 1465 } 1466 else { 1467 DEBUG_OPTIMISE_MORE_r( 1468 Perl_re_printf( aTHX_ 1469 " Check-only match: offset min:%" IVdf " max:%" IVdf 1470 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf 1471 " strend:%" IVdf "\n", 1472 (IV)prog->check_offset_min, 1473 (IV)prog->check_offset_max, 1474 (IV)(check_at-strbeg), 1475 (IV)(rx_origin-strbeg), 1476 (IV)(rx_origin-check_at), 1477 (IV)(strend-strbeg) 1478 ) 1479 ); 1480 } 1481 1482 postprocess_substr_matches: 1483 1484 /* handle the extra constraint of /^.../m if present */ 1485 1486 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { 1487 char *s; 1488 1489 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1490 " looking for /^/m anchor")); 1491 1492 /* we have failed the constraint of a \n before rx_origin. 1493 * Find the next \n, if any, even if it's beyond the current 1494 * anchored and/or floating substrings. Whether we should be 1495 * scanning ahead for the next \n or the next substr is debatable. 1496 * On the one hand you'd expect rare substrings to appear less 1497 * often than \n's. On the other hand, searching for \n means 1498 * we're effectively flipping between check_substr and "\n" on each 1499 * iteration as the current "rarest" candidate string, which 1500 * means for example that we'll quickly reject the whole string if 1501 * hasn't got a \n, rather than trying every substr position 1502 * first 1503 */ 1504 1505 s = HOP3c(strend, - prog->minlen, strpos); 1506 if (s <= rx_origin || 1507 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) 1508 { 1509 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1510 " Did not find /%s^%s/m...\n", 1511 PL_colors[0], PL_colors[1])); 1512 goto fail_finish; 1513 } 1514 1515 /* earliest possible origin is 1 char after the \n. 1516 * (since *rx_origin == '\n', it's safe to ++ here rather than 1517 * HOP(rx_origin, 1)) */ 1518 rx_origin++; 1519 1520 if (prog->substrs->check_ix == 0 /* check is anchored */ 1521 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) 1522 { 1523 /* Position contradicts check-string; either because 1524 * check was anchored (and thus has no wiggle room), 1525 * or check was float and rx_origin is above the float range */ 1526 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1527 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n", 1528 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1529 goto restart; 1530 } 1531 1532 /* if we get here, the check substr must have been float, 1533 * is in range, and we may or may not have had an anchored 1534 * "other" substr which still contradicts */ 1535 assert(prog->substrs->check_ix); /* check is float */ 1536 1537 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { 1538 /* whoops, the anchored "other" substr exists, so we still 1539 * contradict. On the other hand, the float "check" substr 1540 * didn't contradict, so just retry the anchored "other" 1541 * substr */ 1542 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1543 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n", 1544 PL_colors[0], PL_colors[1], 1545 (IV)(rx_origin - strbeg + prog->anchored_offset), 1546 (IV)(rx_origin - strbeg) 1547 )); 1548 goto do_other_substr; 1549 } 1550 1551 /* success: we don't contradict the found floating substring 1552 * (and there's no anchored substr). */ 1553 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1554 " Found /%s^%s/m with rx_origin %ld...\n", 1555 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1556 } 1557 else { 1558 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1559 " (multiline anchor test skipped)\n")); 1560 } 1561 1562 success_at_start: 1563 1564 1565 /* if we have a starting character class, then test that extra constraint. 1566 * (trie stclasses are too expensive to use here, we are better off to 1567 * leave it to regmatch itself) */ 1568 1569 if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) { 1570 const U8* const str = (U8*)STRING(progi->regstclass); 1571 1572 /* XXX this value could be pre-computed */ 1573 const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT 1574 ? (reginfo->is_utf8_pat 1575 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str) 1576 : (SSize_t)STR_LEN(progi->regstclass)) 1577 : 1); 1578 char * endpos; 1579 char *s; 1580 /* latest pos that a matching float substr constrains rx start to */ 1581 char *rx_max_float = NULL; 1582 1583 /* if the current rx_origin is anchored, either by satisfying an 1584 * anchored substring constraint, or a /^.../m constraint, then we 1585 * can reject the current origin if the start class isn't found 1586 * at the current position. If we have a float-only match, then 1587 * rx_origin is constrained to a range; so look for the start class 1588 * in that range. if neither, then look for the start class in the 1589 * whole rest of the string */ 1590 1591 /* XXX DAPM it's not clear what the minlen test is for, and why 1592 * it's not used in the floating case. Nothing in the test suite 1593 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. 1594 * Here are some old comments, which may or may not be correct: 1595 * 1596 * minlen == 0 is possible if regstclass is \b or \B, 1597 * and the fixed substr is ''$. 1598 * Since minlen is already taken into account, rx_origin+1 is 1599 * before strend; accidentally, minlen >= 1 guaranties no false 1600 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : 1601 * 0) below assumes that regstclass does not come from lookahead... 1602 * If regstclass takes bytelength more than 1: If charlength==1, OK. 1603 * This leaves EXACTF-ish only, which are dealt with in 1604 * find_byclass(). 1605 */ 1606 1607 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 1608 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend); 1609 else if (prog->float_substr || prog->float_utf8) { 1610 rx_max_float = HOP3c(check_at, -start_shift, strbeg); 1611 endpos = HOP3clim(rx_max_float, cl_l, strend); 1612 } 1613 else 1614 endpos= strend; 1615 1616 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1617 " looking for class: start_shift: %" IVdf " check_at: %" IVdf 1618 " rx_origin: %" IVdf " endpos: %" IVdf "\n", 1619 (IV)start_shift, (IV)(check_at - strbeg), 1620 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); 1621 1622 s = find_byclass(prog, progi->regstclass, rx_origin, endpos, 1623 reginfo); 1624 if (!s) { 1625 if (endpos == strend) { 1626 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1627 " Could not match STCLASS...\n") ); 1628 goto fail; 1629 } 1630 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1631 " This position contradicts STCLASS...\n") ); 1632 if ((prog->intflags & PREGf_ANCH) && !ml_anch 1633 && !(prog->intflags & PREGf_IMPLICIT)) 1634 goto fail; 1635 1636 /* Contradict one of substrings */ 1637 if (prog->anchored_substr || prog->anchored_utf8) { 1638 if (prog->substrs->check_ix == 1) { /* check is float */ 1639 /* Have both, check_string is floating */ 1640 assert(rx_origin + start_shift <= check_at); 1641 if (rx_origin + start_shift != check_at) { 1642 /* not at latest position float substr could match: 1643 * Recheck anchored substring, but not floating. 1644 * The condition above is in bytes rather than 1645 * chars for efficiency. It's conservative, in 1646 * that it errs on the side of doing 'goto 1647 * do_other_substr'. In this case, at worst, 1648 * an extra anchored search may get done, but in 1649 * practice the extra fbm_instr() is likely to 1650 * get skipped anyway. */ 1651 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1652 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n", 1653 (long)(other_last - strbeg), 1654 (IV)(rx_origin - strbeg) 1655 )); 1656 goto do_other_substr; 1657 } 1658 } 1659 } 1660 else { 1661 /* float-only */ 1662 1663 if (ml_anch) { 1664 /* In the presence of ml_anch, we might be able to 1665 * find another \n without breaking the current float 1666 * constraint. */ 1667 1668 /* strictly speaking this should be HOP3c(..., 1, ...), 1669 * but since we goto a block of code that's going to 1670 * search for the next \n if any, its safe here */ 1671 rx_origin++; 1672 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1673 " about to look for /%s^%s/m starting at rx_origin %ld...\n", 1674 PL_colors[0], PL_colors[1], 1675 (long)(rx_origin - strbeg)) ); 1676 goto postprocess_substr_matches; 1677 } 1678 1679 /* strictly speaking this can never be true; but might 1680 * be if we ever allow intuit without substrings */ 1681 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) 1682 goto fail; 1683 1684 rx_origin = rx_max_float; 1685 } 1686 1687 /* at this point, any matching substrings have been 1688 * contradicted. Start again... */ 1689 1690 rx_origin = HOP3c(rx_origin, 1, strend); 1691 1692 /* uses bytes rather than char calculations for efficiency. 1693 * It's conservative: it errs on the side of doing 'goto restart', 1694 * where there is code that does a proper char-based test */ 1695 if (rx_origin + start_shift + end_shift > strend) { 1696 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1697 " Could not match STCLASS...\n") ); 1698 goto fail; 1699 } 1700 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1701 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n", 1702 (prog->substrs->check_ix ? "floating" : "anchored"), 1703 (long)(rx_origin + start_shift - strbeg), 1704 (IV)(rx_origin - strbeg) 1705 )); 1706 goto restart; 1707 } 1708 1709 /* Success !!! */ 1710 1711 if (rx_origin != s) { 1712 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1713 " By STCLASS: moving %ld --> %ld\n", 1714 (long)(rx_origin - strbeg), (long)(s - strbeg)) 1715 ); 1716 } 1717 else { 1718 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1719 " Does not contradict STCLASS...\n"); 1720 ); 1721 } 1722 } 1723 1724 /* Decide whether using the substrings helped */ 1725 1726 if (rx_origin != strpos) { 1727 /* Fixed substring is found far enough so that the match 1728 cannot start at strpos. */ 1729 1730 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n")); 1731 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 1732 } 1733 else { 1734 /* The found rx_origin position does not prohibit matching at 1735 * strpos, so calling intuit didn't gain us anything. Decrement 1736 * the BmUSEFUL() count on the check substring, and if we reach 1737 * zero, free it. */ 1738 if (!(prog->intflags & PREGf_NAUGHTY) 1739 && (utf8_target ? ( 1740 prog->check_utf8 /* Could be deleted already */ 1741 && --BmUSEFUL(prog->check_utf8) < 0 1742 && (prog->check_utf8 == prog->float_utf8) 1743 ) : ( 1744 prog->check_substr /* Could be deleted already */ 1745 && --BmUSEFUL(prog->check_substr) < 0 1746 && (prog->check_substr == prog->float_substr) 1747 ))) 1748 { 1749 /* If flags & SOMETHING - do not do it many times on the same match */ 1750 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n")); 1751 /* XXX Does the destruction order has to change with utf8_target? */ 1752 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); 1753 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); 1754 prog->check_substr = prog->check_utf8 = NULL; /* disable */ 1755 prog->float_substr = prog->float_utf8 = NULL; /* clear */ 1756 check = NULL; /* abort */ 1757 /* XXXX This is a remnant of the old implementation. It 1758 looks wasteful, since now INTUIT can use many 1759 other heuristics. */ 1760 prog->extflags &= ~RXf_USE_INTUIT; 1761 } 1762 } 1763 1764 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1765 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", 1766 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) ); 1767 1768 return rx_origin; 1769 1770 fail_finish: /* Substring not found */ 1771 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 1772 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 1773 fail: 1774 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n", 1775 PL_colors[4], PL_colors[5])); 1776 return NULL; 1777 } 1778 1779 1780 #define DECL_TRIE_TYPE(scan) \ 1781 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ 1782 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \ 1783 trie_utf8l, trie_flu8, trie_flu8_latin } \ 1784 trie_type = ((FLAGS(scan) == EXACT) \ 1785 ? (utf8_target ? trie_utf8 : trie_plain) \ 1786 : (FLAGS(scan) == EXACTL) \ 1787 ? (utf8_target ? trie_utf8l : trie_plain) \ 1788 : (FLAGS(scan) == EXACTFAA) \ 1789 ? (utf8_target \ 1790 ? trie_utf8_exactfa_fold \ 1791 : trie_latin_utf8_exactfa_fold) \ 1792 : (FLAGS(scan) == EXACTFLU8 \ 1793 ? (utf8_target \ 1794 ? trie_flu8 \ 1795 : trie_flu8_latin) \ 1796 : (utf8_target \ 1797 ? trie_utf8_fold \ 1798 : trie_latin_utf8_fold))) 1799 1800 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is 1801 * 'foldbuf+sizeof(foldbuf)' */ 1802 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ 1803 STMT_START { \ 1804 STRLEN skiplen; \ 1805 U8 flags = FOLD_FLAGS_FULL; \ 1806 switch (trie_type) { \ 1807 case trie_flu8: \ 1808 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 1809 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1810 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1811 } \ 1812 goto do_trie_utf8_fold; \ 1813 case trie_utf8_exactfa_fold: \ 1814 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1815 /* FALLTHROUGH */ \ 1816 case trie_utf8_fold: \ 1817 do_trie_utf8_fold: \ 1818 if ( foldlen>0 ) { \ 1819 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1820 foldlen -= len; \ 1821 uscan += len; \ 1822 len=0; \ 1823 } else { \ 1824 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \ 1825 flags); \ 1826 len = UTF8_SAFE_SKIP(uc, uc_end); \ 1827 skiplen = UVCHR_SKIP( uvc ); \ 1828 foldlen -= skiplen; \ 1829 uscan = foldbuf + skiplen; \ 1830 } \ 1831 break; \ 1832 case trie_flu8_latin: \ 1833 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 1834 goto do_trie_latin_utf8_fold; \ 1835 case trie_latin_utf8_exactfa_fold: \ 1836 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1837 /* FALLTHROUGH */ \ 1838 case trie_latin_utf8_fold: \ 1839 do_trie_latin_utf8_fold: \ 1840 if ( foldlen>0 ) { \ 1841 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1842 foldlen -= len; \ 1843 uscan += len; \ 1844 len=0; \ 1845 } else { \ 1846 len = 1; \ 1847 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ 1848 skiplen = UVCHR_SKIP( uvc ); \ 1849 foldlen -= skiplen; \ 1850 uscan = foldbuf + skiplen; \ 1851 } \ 1852 break; \ 1853 case trie_utf8l: \ 1854 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 1855 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1856 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1857 } \ 1858 /* FALLTHROUGH */ \ 1859 case trie_utf8: \ 1860 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \ 1861 break; \ 1862 case trie_plain: \ 1863 uvc = (UV)*uc; \ 1864 len = 1; \ 1865 } \ 1866 if (uvc < 256) { \ 1867 charid = trie->charmap[ uvc ]; \ 1868 } \ 1869 else { \ 1870 charid = 0; \ 1871 if (widecharmap) { \ 1872 SV** const svpp = hv_fetch(widecharmap, \ 1873 (char*)&uvc, sizeof(UV), 0); \ 1874 if (svpp) \ 1875 charid = (U16)SvIV(*svpp); \ 1876 } \ 1877 } \ 1878 } STMT_END 1879 1880 #define DUMP_EXEC_POS(li,s,doutf8,depth) \ 1881 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ 1882 startpos, doutf8, depth) 1883 1884 #define GET_ANYOFH_INVLIST(prog, n) \ 1885 GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL) 1886 1887 #define REXEC_FBC_UTF8_SCAN(CODE) \ 1888 STMT_START { \ 1889 while (s < strend) { \ 1890 CODE \ 1891 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1892 } \ 1893 } STMT_END 1894 1895 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \ 1896 STMT_START { \ 1897 while (s < strend) { \ 1898 CODE \ 1899 s++; \ 1900 } \ 1901 } STMT_END 1902 1903 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ 1904 STMT_START { \ 1905 while (s < strend) { \ 1906 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ 1907 } \ 1908 } STMT_END 1909 1910 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \ 1911 STMT_START { \ 1912 while (s < strend) { \ 1913 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ 1914 } \ 1915 } STMT_END 1916 1917 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ 1918 if (COND) { \ 1919 FBC_CHECK_AND_TRY \ 1920 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1921 previous_occurrence_end = s; \ 1922 } \ 1923 else { \ 1924 s += UTF8SKIP(s); \ 1925 } 1926 1927 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ 1928 if (COND) { \ 1929 FBC_CHECK_AND_TRY \ 1930 s++; \ 1931 previous_occurrence_end = s; \ 1932 } \ 1933 else { \ 1934 s++; \ 1935 } 1936 1937 #define LAST_REGTRY_SKIPPED_FORWARD(reginfo) (reginfo->cutpoint) 1938 1939 /* We keep track of where the next character should start after an occurrence 1940 * of the one we're looking for. Knowing that, we can see right away if the 1941 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we 1942 * don't accept the 2nd and succeeding adjacent occurrences */ 1943 #define FBC_CHECK_AND_TRY \ 1944 if ( ( doevery \ 1945 || s != previous_occurrence_end \ 1946 || LAST_REGTRY_SKIPPED_FORWARD(reginfo) ) \ 1947 && ( reginfo->intuit \ 1948 || (s <= reginfo->strend && regtry(reginfo, &s)))) \ 1949 { \ 1950 goto got_it; \ 1951 } 1952 1953 1954 /* These differ from the above macros in that they call a function which 1955 * returns the next occurrence of the thing being looked for in 's'; and 1956 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c) 1957 * */ 1958 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \ 1959 while (s < strend) { \ 1960 s = (char *) (f); \ 1961 if (s >= strend) { \ 1962 break; \ 1963 } \ 1964 \ 1965 FBC_CHECK_AND_TRY \ 1966 s += UTF8SKIP(s); \ 1967 previous_occurrence_end = s; \ 1968 } 1969 1970 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \ 1971 while (s < strend) { \ 1972 s = (char *) (f); \ 1973 if (s >= strend) { \ 1974 break; \ 1975 } \ 1976 \ 1977 FBC_CHECK_AND_TRY \ 1978 s++; \ 1979 previous_occurrence_end = s; \ 1980 } 1981 1982 /* This is like the above macro except the function returns NULL if there is no 1983 * occurrence, and there is a further condition that must be matched besides 1984 * the function */ 1985 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \ 1986 while (s < strend) { \ 1987 s = (char *) (f); \ 1988 if (s == NULL) { \ 1989 s = (char *) strend; \ 1990 break; \ 1991 } \ 1992 \ 1993 if (COND) { \ 1994 FBC_CHECK_AND_TRY \ 1995 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1996 previous_occurrence_end = s; \ 1997 } \ 1998 else { \ 1999 s += UTF8SKIP(s); \ 2000 } \ 2001 } 2002 2003 /* This differs from the above macros in that it is passed a single byte that 2004 * is known to begin the next occurrence of the thing being looked for in 's'. 2005 * It does a memchr to find the next occurrence of 'byte', before trying 'COND' 2006 * at that position. */ 2007 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ 2008 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \ 2009 COND) 2010 2011 /* This is like the function above, but takes an entire string to look for 2012 * instead of a single byte */ 2013 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \ 2014 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \ 2015 ninstr(s, strend, substr, substr_end), \ 2016 COND) 2017 2018 /* The four macros below are slightly different versions of the same logic. 2019 * 2020 * The first is for /a and /aa when the target string is UTF-8. This can only 2021 * match ascii, but it must advance based on UTF-8. The other three handle 2022 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are 2023 * looking for the boundary (or non-boundary) between a word and non-word 2024 * character. The utf8 and non-utf8 cases have the same logic, but the details 2025 * must be different. Find the "wordness" of the character just prior to this 2026 * one, and compare it with the wordness of this one. If they differ, we have 2027 * a boundary. At the beginning of the string, pretend that the previous 2028 * character was a new-line. 2029 * 2030 * All these macros uncleanly have side-effects with each other and outside 2031 * variables. So far it's been too much trouble to clean-up 2032 * 2033 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is 2034 * a word character or not. 2035 * IF_SUCCESS is code to do if it finds that we are at a boundary between 2036 * word/non-word 2037 * IF_FAIL is code to do if we aren't at a boundary between word/non-word 2038 * 2039 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we 2040 * are looking for a boundary or for a non-boundary. If we are looking for a 2041 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and 2042 * see if this tentative match actually works, and if so, to quit the loop 2043 * here. And vice-versa if we are looking for a non-boundary. 2044 * 2045 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and 2046 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of 2047 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be 2048 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal 2049 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that 2050 * complement. But in that branch we complement tmp, meaning that at the 2051 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), 2052 * which means at the top of the loop in the next iteration, it is 2053 * TEST_NON_UTF8(s-1) */ 2054 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2055 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 2056 tmp = TEST_NON_UTF8(tmp); \ 2057 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ 2058 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ 2059 tmp = !tmp; \ 2060 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ 2061 } \ 2062 else { \ 2063 IF_FAIL; \ 2064 } \ 2065 ); \ 2066 2067 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and 2068 * TEST_UTF8 is a macro that for the same input code points returns identically 2069 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an 2070 * end pointer as well) */ 2071 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ 2072 if (s == reginfo->strbeg) { \ 2073 tmp = '\n'; \ 2074 } \ 2075 else { /* Back-up to the start of the previous character */ \ 2076 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ 2077 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ 2078 0, UTF8_ALLOW_DEFAULT); \ 2079 } \ 2080 tmp = TEST_UV(tmp); \ 2081 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \ 2082 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \ 2083 tmp = !tmp; \ 2084 IF_SUCCESS; \ 2085 } \ 2086 else { \ 2087 IF_FAIL; \ 2088 } \ 2089 ); 2090 2091 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the 2092 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases, 2093 * set-up by the FBC_BOUND, etc macros below */ 2094 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2095 UTF8_CODE; \ 2096 /* Here, things have been set up by the previous code so that tmp is the \ 2097 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \ 2098 * against the EOS, which we treat as a \n */ \ 2099 if (tmp == ! TEST_NON_UTF8('\n')) { \ 2100 IF_SUCCESS; \ 2101 } \ 2102 else { \ 2103 IF_FAIL; \ 2104 } 2105 2106 /* Same as the macro above, but the target isn't UTF-8 */ 2107 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2108 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 2109 tmp = TEST_NON_UTF8(tmp); \ 2110 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \ 2111 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \ 2112 IF_SUCCESS; \ 2113 tmp = !tmp; \ 2114 } \ 2115 else { \ 2116 IF_FAIL; \ 2117 } \ 2118 ); \ 2119 /* Here, things have been set up by the previous code so that tmp is \ 2120 * the return of TEST_NON_UTF8(s-1). We also have to check if this \ 2121 * matches against the EOS, which we treat as a \n */ \ 2122 if (tmp == ! TEST_NON_UTF8('\n')) { \ 2123 IF_SUCCESS; \ 2124 } \ 2125 else { \ 2126 IF_FAIL; \ 2127 } 2128 2129 /* This is the macro to use when we want to see if something that looks like it 2130 * could match, actually does, and if so exits the loop. It needs to be used 2131 * only for bounds checking macros, as it allows for matching beyond the end of 2132 * string (which should be zero length without having to look at the string 2133 * contents) */ 2134 #define REXEC_FBC_TRYIT \ 2135 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ 2136 goto got_it 2137 2138 /* The only difference between the BOUND and NBOUND cases is that 2139 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in 2140 * NBOUND. This is accomplished by passing it as either the if or else clause, 2141 * with the other one being empty (PLACEHOLDER is defined as empty). 2142 * 2143 * The TEST_FOO parameters are for operating on different forms of input, but 2144 * all should be ones that return identically for the same underlying code 2145 * points */ 2146 2147 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 2148 FBC_BOUND_COMMON_UTF8( \ 2149 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ 2150 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2151 2152 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \ 2153 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2154 2155 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \ 2156 FBC_BOUND_COMMON_UTF8( \ 2157 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\ 2158 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2159 2160 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \ 2161 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2162 2163 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 2164 FBC_BOUND_COMMON_UTF8( \ 2165 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2166 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2167 2168 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \ 2169 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2170 2171 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \ 2172 FBC_BOUND_COMMON_UTF8( \ 2173 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2174 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2175 2176 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \ 2177 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2178 2179 #ifdef DEBUGGING 2180 static IV 2181 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) { 2182 IV cp_out = _invlist_search(invlist, cp_in); 2183 assert(cp_out >= 0); 2184 return cp_out; 2185 } 2186 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2187 invmap[S_get_break_val_cp_checked(invlist, cp)] 2188 #else 2189 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2190 invmap[_invlist_search(invlist, cp)] 2191 #endif 2192 2193 /* Takes a pointer to an inversion list, a pointer to its corresponding 2194 * inversion map, and a code point, and returns the code point's value 2195 * according to the two arrays. It assumes that all code points have a value. 2196 * This is used as the base macro for macros for particular properties */ 2197 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \ 2198 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) 2199 2200 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead 2201 * of a code point, returning the value for the first code point in the string. 2202 * And it takes the particular macro name that finds the desired value given a 2203 * code point. Merely convert the UTF-8 to code point and call the cp macro */ 2204 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \ 2205 (__ASSERT_(pos < strend) \ 2206 /* Note assumes is valid UTF-8 */ \ 2207 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL)))) 2208 2209 /* Returns the GCB value for the input code point */ 2210 #define getGCB_VAL_CP(cp) \ 2211 _generic_GET_BREAK_VAL_CP( \ 2212 PL_GCB_invlist, \ 2213 _Perl_GCB_invmap, \ 2214 (cp)) 2215 2216 /* Returns the GCB value for the first code point in the UTF-8 encoded string 2217 * bounded by pos and strend */ 2218 #define getGCB_VAL_UTF8(pos, strend) \ 2219 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend) 2220 2221 /* Returns the LB value for the input code point */ 2222 #define getLB_VAL_CP(cp) \ 2223 _generic_GET_BREAK_VAL_CP( \ 2224 PL_LB_invlist, \ 2225 _Perl_LB_invmap, \ 2226 (cp)) 2227 2228 /* Returns the LB value for the first code point in the UTF-8 encoded string 2229 * bounded by pos and strend */ 2230 #define getLB_VAL_UTF8(pos, strend) \ 2231 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend) 2232 2233 2234 /* Returns the SB value for the input code point */ 2235 #define getSB_VAL_CP(cp) \ 2236 _generic_GET_BREAK_VAL_CP( \ 2237 PL_SB_invlist, \ 2238 _Perl_SB_invmap, \ 2239 (cp)) 2240 2241 /* Returns the SB value for the first code point in the UTF-8 encoded string 2242 * bounded by pos and strend */ 2243 #define getSB_VAL_UTF8(pos, strend) \ 2244 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend) 2245 2246 /* Returns the WB value for the input code point */ 2247 #define getWB_VAL_CP(cp) \ 2248 _generic_GET_BREAK_VAL_CP( \ 2249 PL_WB_invlist, \ 2250 _Perl_WB_invmap, \ 2251 (cp)) 2252 2253 /* Returns the WB value for the first code point in the UTF-8 encoded string 2254 * bounded by pos and strend */ 2255 #define getWB_VAL_UTF8(pos, strend) \ 2256 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend) 2257 2258 /* We know what class REx starts with. Try to find this position... */ 2259 /* if reginfo->intuit, its a dryrun */ 2260 /* annoyingly all the vars in this routine have different names from their counterparts 2261 in regmatch. /grrr */ 2262 STATIC char * 2263 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 2264 const char *strend, regmatch_info *reginfo) 2265 { 2266 2267 /* TRUE if x+ need not match at just the 1st pos of run of x's */ 2268 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; 2269 2270 char *pat_string; /* The pattern's exactish string */ 2271 char *pat_end; /* ptr to end char of pat_string */ 2272 re_fold_t folder; /* Function for computing non-utf8 folds */ 2273 const U8 *fold_array; /* array for folding ords < 256 */ 2274 STRLEN ln; 2275 STRLEN lnc; 2276 U8 c1; 2277 U8 c2; 2278 char *e = NULL; 2279 2280 /* In some cases we accept only the first occurrence of 'x' in a sequence of 2281 * them. This variable points to just beyond the end of the previous 2282 * occurrence of 'x', hence we can tell if we are in a sequence. (Having 2283 * it point to beyond the 'x' allows us to work for UTF-8 without having to 2284 * hop back.) */ 2285 char * previous_occurrence_end = 0; 2286 2287 I32 tmp; /* Scratch variable */ 2288 const bool utf8_target = reginfo->is_utf8_target; 2289 UV utf8_fold_flags = 0; 2290 const bool is_utf8_pat = reginfo->is_utf8_pat; 2291 bool to_complement = FALSE; /* Invert the result? Taking the xor of this 2292 with a result inverts that result, as 0^1 = 2293 1 and 1^1 = 0 */ 2294 char_class_number_ classnum; 2295 2296 RXi_GET_DECL(prog,progi); 2297 2298 PERL_ARGS_ASSERT_FIND_BYCLASS; 2299 2300 /* We know what class it must start with. The case statements below have 2301 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb' 2302 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern 2303 * ('p8' and 'pb'. */ 2304 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) { 2305 SV * anyofh_list; 2306 2307 case ANYOFPOSIXL_t8_pb: 2308 case ANYOFPOSIXL_t8_p8: 2309 case ANYOFL_t8_pb: 2310 case ANYOFL_t8_p8: 2311 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2312 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); 2313 2314 /* FALLTHROUGH */ 2315 2316 case ANYOFD_t8_pb: 2317 case ANYOFD_t8_p8: 2318 case ANYOF_t8_pb: 2319 case ANYOF_t8_p8: 2320 REXEC_FBC_UTF8_CLASS_SCAN( 2321 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)); 2322 break; 2323 2324 case ANYOFPOSIXL_tb_pb: 2325 case ANYOFPOSIXL_tb_p8: 2326 case ANYOFL_tb_pb: 2327 case ANYOFL_tb_p8: 2328 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2329 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); 2330 2331 /* FALLTHROUGH */ 2332 2333 case ANYOFD_tb_pb: 2334 case ANYOFD_tb_p8: 2335 case ANYOF_tb_pb: 2336 case ANYOF_tb_p8: 2337 if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) { 2338 /* We know that s is in the bitmap range since the target isn't 2339 * UTF-8, so what happens for out-of-range values is not relevant, 2340 * so exclude that from the flags */ 2341 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s))); 2342 } 2343 else { 2344 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 2345 0)); 2346 } 2347 break; 2348 2349 case ANYOFM_tb_pb: /* ARG1u() is the base byte; FLAGS() the mask byte */ 2350 case ANYOFM_tb_p8: 2351 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( 2352 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c))); 2353 break; 2354 2355 case ANYOFM_t8_pb: 2356 case ANYOFM_t8_p8: 2357 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But 2358 * we do anyway for performance reasons, as otherwise we would have to 2359 * examine all the continuation characters */ 2360 REXEC_FBC_UTF8_FIND_NEXT_SCAN( 2361 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c))); 2362 break; 2363 2364 case NANYOFM_tb_pb: 2365 case NANYOFM_tb_p8: 2366 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( 2367 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c))); 2368 break; 2369 2370 case NANYOFM_t8_pb: 2371 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8 2372 variants. */ 2373 REXEC_FBC_UTF8_FIND_NEXT_SCAN( 2374 (char *) find_span_end_mask((U8 *) s, (U8 *) strend, 2375 (U8) ARG1u(c), FLAGS(c))); 2376 break; 2377 2378 /* These nodes all require at least one code point to be in UTF-8 to 2379 * match */ 2380 case ANYOFH_tb_pb: 2381 case ANYOFH_tb_p8: 2382 case ANYOFHb_tb_pb: 2383 case ANYOFHb_tb_p8: 2384 case ANYOFHbbm_tb_pb: 2385 case ANYOFHbbm_tb_p8: 2386 case ANYOFHr_tb_pb: 2387 case ANYOFHr_tb_p8: 2388 case ANYOFHs_tb_pb: 2389 case ANYOFHs_tb_p8: 2390 case EXACTFLU8_tb_pb: 2391 case EXACTFLU8_tb_p8: 2392 case EXACTFU_REQ8_tb_pb: 2393 case EXACTFU_REQ8_tb_p8: 2394 break; 2395 2396 case ANYOFH_t8_pb: 2397 case ANYOFH_t8_p8: 2398 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2399 REXEC_FBC_UTF8_CLASS_SCAN( 2400 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2401 && _invlist_contains_cp(anyofh_list, 2402 utf8_to_uvchr_buf((U8 *) s, 2403 (U8 *) strend, 2404 NULL)))); 2405 break; 2406 2407 case ANYOFHb_t8_pb: 2408 case ANYOFHb_t8_p8: 2409 { 2410 /* We know what the first byte of any matched string should be. */ 2411 U8 first_byte = FLAGS(c); 2412 2413 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2414 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2415 _invlist_contains_cp(anyofh_list, 2416 utf8_to_uvchr_buf((U8 *) s, 2417 (U8 *) strend, 2418 NULL))); 2419 } 2420 break; 2421 2422 case ANYOFHbbm_t8_pb: 2423 case ANYOFHbbm_t8_p8: 2424 { 2425 /* We know what the first byte of any matched string should be. */ 2426 U8 first_byte = FLAGS(c); 2427 2428 /* And a bitmap defines all the legal 2nd byte matches */ 2429 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2430 ( s < strend 2431 && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap, 2432 (U8) s[1] & UTF_CONTINUATION_MASK))); 2433 } 2434 break; 2435 2436 case ANYOFHr_t8_pb: 2437 case ANYOFHr_t8_p8: 2438 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2439 REXEC_FBC_UTF8_CLASS_SCAN( 2440 ( inRANGE(NATIVE_UTF8_TO_I8(*s), 2441 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), 2442 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) 2443 && _invlist_contains_cp(anyofh_list, 2444 utf8_to_uvchr_buf((U8 *) s, 2445 (U8 *) strend, 2446 NULL)))); 2447 break; 2448 2449 case ANYOFHs_t8_pb: 2450 case ANYOFHs_t8_p8: 2451 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2452 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN( 2453 ((struct regnode_anyofhs *) c)->string, 2454 /* Note FLAGS is the string length in this regnode */ 2455 ((struct regnode_anyofhs *) c)->string + FLAGS(c), 2456 _invlist_contains_cp(anyofh_list, 2457 utf8_to_uvchr_buf((U8 *) s, 2458 (U8 *) strend, 2459 NULL))); 2460 break; 2461 2462 case ANYOFR_tb_pb: 2463 case ANYOFR_tb_p8: 2464 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, 2465 ANYOFRbase(c), ANYOFRdelta(c))); 2466 break; 2467 2468 case ANYOFR_t8_pb: 2469 case ANYOFR_t8_p8: 2470 REXEC_FBC_UTF8_CLASS_SCAN( 2471 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2472 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2473 (U8 *) strend, 2474 NULL), 2475 ANYOFRbase(c), ANYOFRdelta(c)))); 2476 break; 2477 2478 case ANYOFRb_tb_pb: 2479 case ANYOFRb_tb_p8: 2480 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, 2481 ANYOFRbase(c), ANYOFRdelta(c))); 2482 break; 2483 2484 case ANYOFRb_t8_pb: 2485 case ANYOFRb_t8_p8: 2486 { /* We know what the first byte of any matched string should be */ 2487 U8 first_byte = FLAGS(c); 2488 2489 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2490 withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2491 (U8 *) strend, 2492 NULL), 2493 ANYOFRbase(c), ANYOFRdelta(c))); 2494 } 2495 break; 2496 2497 case EXACTFAA_tb_pb: 2498 2499 /* Latin1 folds are not affected by /a, except it excludes the sharp s, 2500 * which these functions don't handle anyway */ 2501 fold_array = PL_fold_latin1; 2502 folder = S_foldEQ_latin1_s2_folded; 2503 goto do_exactf_non_utf8; 2504 2505 case EXACTF_tb_pb: 2506 fold_array = PL_fold; 2507 folder = Perl_foldEQ; 2508 goto do_exactf_non_utf8; 2509 2510 case EXACTFL_tb_pb: 2511 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2512 2513 if (IN_UTF8_CTYPE_LOCALE) { 2514 utf8_fold_flags = FOLDEQ_LOCALE; 2515 goto do_exactf_utf8; 2516 } 2517 2518 fold_array = PL_fold_locale; 2519 folder = Perl_foldEQ_locale; 2520 goto do_exactf_non_utf8; 2521 2522 case EXACTFU_tb_pb: 2523 /* Any 'ss' in the pattern should have been replaced by regcomp, so we 2524 * don't have to worry here about this single special case in the 2525 * Latin1 range */ 2526 fold_array = PL_fold_latin1; 2527 folder = S_foldEQ_latin1_s2_folded; 2528 2529 /* FALLTHROUGH */ 2530 2531 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there 2532 are no glitches with fold-length differences 2533 between the target string and pattern */ 2534 2535 /* The idea in the non-utf8 EXACTF* cases is to first find the first 2536 * character of the EXACTF* node and then, if necessary, 2537 * case-insensitively compare the full text of the node. c1 is the 2538 * first character. c2 is its fold. This logic will not work for 2539 * Unicode semantics and the german sharp ss, which hence should not be 2540 * compiled into a node that gets here. */ 2541 pat_string = STRINGs(c); 2542 ln = STR_LENs(c); /* length to match in octets/bytes */ 2543 2544 /* We know that we have to match at least 'ln' bytes (which is the same 2545 * as characters, since not utf8). If we have to match 3 characters, 2546 * and there are only 2 available, we know without trying that it will 2547 * fail; so don't start a match past the required minimum number from 2548 * the far end */ 2549 e = HOP3c(strend, -((SSize_t)ln), s); 2550 if (e < s) 2551 break; 2552 2553 c1 = *pat_string; 2554 c2 = fold_array[c1]; 2555 if (c1 == c2) { /* If char and fold are the same */ 2556 while (s <= e) { 2557 s = (char *) memchr(s, c1, e + 1 - s); 2558 if (s == NULL) { 2559 break; 2560 } 2561 2562 /* Check that the rest of the node matches */ 2563 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1)) 2564 && (reginfo->intuit || regtry(reginfo, &s)) ) 2565 { 2566 goto got_it; 2567 } 2568 s++; 2569 } 2570 } 2571 else { 2572 U8 bits_differing = c1 ^ c2; 2573 2574 /* If the folds differ in one bit position only, we can mask to 2575 * match either of them, and can use this faster find method. Both 2576 * ASCII and EBCDIC tend to have their case folds differ in only 2577 * one position, so this is very likely */ 2578 if (LIKELY(PL_bitcount[bits_differing] == 1)) { 2579 bits_differing = ~ bits_differing; 2580 while (s <= e) { 2581 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1, 2582 (c1 & bits_differing), bits_differing); 2583 if (s > e) { 2584 break; 2585 } 2586 2587 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1)) 2588 && (reginfo->intuit || regtry(reginfo, &s)) ) 2589 { 2590 goto got_it; 2591 } 2592 s++; 2593 } 2594 } 2595 else { /* Otherwise, stuck with looking byte-at-a-time. This 2596 should actually happen only in EXACTFL nodes */ 2597 while (s <= e) { 2598 if ( (*(U8*)s == c1 || *(U8*)s == c2) 2599 && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1)) 2600 && (reginfo->intuit || regtry(reginfo, &s)) ) 2601 { 2602 goto got_it; 2603 } 2604 s++; 2605 } 2606 } 2607 } 2608 break; 2609 2610 case EXACTFAA_tb_p8: 2611 case EXACTFAA_t8_p8: 2612 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII 2613 |FOLDEQ_S2_ALREADY_FOLDED 2614 |FOLDEQ_S2_FOLDS_SANE; 2615 goto do_exactf_utf8; 2616 2617 case EXACTFAA_NO_TRIE_tb_pb: 2618 case EXACTFAA_NO_TRIE_t8_pb: 2619 case EXACTFAA_t8_pb: 2620 2621 /* Here, and elsewhere in this file, the reason we can't consider a 2622 * non-UTF-8 pattern already folded in the presence of a UTF-8 target 2623 * is because any MICRO SIGN in the pattern won't be folded. Since the 2624 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider 2625 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */ 2626 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 2627 goto do_exactf_utf8; 2628 2629 case EXACTFL_tb_p8: 2630 case EXACTFL_t8_pb: 2631 case EXACTFL_t8_p8: 2632 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2633 utf8_fold_flags = FOLDEQ_LOCALE; 2634 goto do_exactf_utf8; 2635 2636 case EXACTFLU8_t8_pb: 2637 case EXACTFLU8_t8_p8: 2638 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 2639 | FOLDEQ_S2_FOLDS_SANE; 2640 goto do_exactf_utf8; 2641 2642 case EXACTFU_REQ8_t8_p8: 2643 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2644 goto do_exactf_utf8; 2645 2646 case EXACTFU_tb_p8: 2647 case EXACTFU_t8_pb: 2648 case EXACTFU_t8_p8: 2649 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2650 goto do_exactf_utf8; 2651 2652 /* The following are problematic even though pattern isn't UTF-8. Use 2653 * full functionality normally not done except for UTF-8. */ 2654 case EXACTF_t8_pb: 2655 case EXACTFUP_tb_pb: 2656 case EXACTFUP_t8_pb: 2657 2658 do_exactf_utf8: 2659 { 2660 unsigned expansion; 2661 2662 /* If one of the operands is in utf8, we can't use the simpler 2663 * folding above, due to the fact that many different characters 2664 * can have the same fold, or portion of a fold, or different- 2665 * length fold */ 2666 pat_string = STRINGs(c); 2667 ln = STR_LENs(c); /* length to match in octets/bytes */ 2668 pat_end = pat_string + ln; 2669 lnc = is_utf8_pat /* length to match in characters */ 2670 ? utf8_length((U8 *) pat_string, (U8 *) pat_end) 2671 : ln; 2672 2673 /* We have 'lnc' characters to match in the pattern, but because of 2674 * multi-character folding, each character in the target can match 2675 * up to 3 characters (Unicode guarantees it will never exceed 2676 * this) if it is utf8-encoded; and up to 2 if not (based on the 2677 * fact that the Latin 1 folds are already determined, and the only 2678 * multi-char fold in that range is the sharp-s folding to 'ss'. 2679 * Thus, a pattern character can match as little as 1/3 of a string 2680 * character. Adjust lnc accordingly, rounding up, so that if we 2681 * need to match at least 4+1/3 chars, that really is 5. */ 2682 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; 2683 lnc = (lnc + expansion - 1) / expansion; 2684 2685 /* As in the non-UTF8 case, if we have to match 3 characters, and 2686 * only 2 are left, it's guaranteed to fail, so don't start a match 2687 * that would require us to go beyond the end of the string */ 2688 e = HOP3c(strend, -((SSize_t)lnc), s); 2689 2690 /* XXX Note that we could recalculate e to stop the loop earlier, 2691 * as the worst case expansion above will rarely be met, and as we 2692 * go along we would usually find that e moves further to the left. 2693 * This would happen only after we reached the point in the loop 2694 * where if there were no expansion we should fail. Unclear if 2695 * worth the expense */ 2696 2697 while (s <= e) { 2698 char *my_strend= (char *)strend; 2699 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, 2700 pat_string, NULL, ln, is_utf8_pat, 2701 utf8_fold_flags) 2702 && (reginfo->intuit || regtry(reginfo, &s)) ) 2703 { 2704 goto got_it; 2705 } 2706 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2707 } 2708 } 2709 break; 2710 2711 case BOUNDA_tb_pb: 2712 case BOUNDA_tb_p8: 2713 case BOUND_tb_pb: /* /d without utf8 target is /a */ 2714 case BOUND_tb_p8: 2715 /* regcomp.c makes sure that these only have the traditional \b 2716 * meaning. */ 2717 assert(FLAGS(c) == TRADITIONAL_BOUND); 2718 2719 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A); 2720 break; 2721 2722 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ 2723 case BOUNDA_t8_p8: 2724 /* regcomp.c makes sure that these only have the traditional \b 2725 * meaning. */ 2726 assert(FLAGS(c) == TRADITIONAL_BOUND); 2727 2728 FBC_BOUND_A_UTF8(isWORDCHAR_A); 2729 break; 2730 2731 case NBOUNDA_tb_pb: 2732 case NBOUNDA_tb_p8: 2733 case NBOUND_tb_pb: /* /d without utf8 target is /a */ 2734 case NBOUND_tb_p8: 2735 /* regcomp.c makes sure that these only have the traditional \b 2736 * meaning. */ 2737 assert(FLAGS(c) == TRADITIONAL_BOUND); 2738 2739 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A); 2740 break; 2741 2742 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ 2743 case NBOUNDA_t8_p8: 2744 /* regcomp.c makes sure that these only have the traditional \b 2745 * meaning. */ 2746 assert(FLAGS(c) == TRADITIONAL_BOUND); 2747 2748 FBC_NBOUND_A_UTF8(isWORDCHAR_A); 2749 break; 2750 2751 case NBOUNDU_tb_pb: 2752 case NBOUNDU_tb_p8: 2753 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2754 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1); 2755 break; 2756 } 2757 2758 to_complement = 1; 2759 goto do_boundu_non_utf8; 2760 2761 case NBOUNDL_tb_pb: 2762 case NBOUNDL_tb_p8: 2763 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2764 if (FLAGS(c) == TRADITIONAL_BOUND) { 2765 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC); 2766 break; 2767 } 2768 2769 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2770 2771 to_complement = 1; 2772 goto do_boundu_non_utf8; 2773 2774 case BOUNDL_tb_pb: 2775 case BOUNDL_tb_p8: 2776 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2777 if (FLAGS(c) == TRADITIONAL_BOUND) { 2778 FBC_BOUND_NON_UTF8(isWORDCHAR_LC); 2779 break; 2780 } 2781 2782 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2783 2784 goto do_boundu_non_utf8; 2785 2786 case BOUNDU_tb_pb: 2787 case BOUNDU_tb_p8: 2788 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2789 FBC_BOUND_NON_UTF8(isWORDCHAR_L1); 2790 break; 2791 } 2792 2793 do_boundu_non_utf8: 2794 if (s == reginfo->strbeg) { 2795 if (reginfo->intuit || regtry(reginfo, &s)) 2796 { 2797 goto got_it; 2798 } 2799 2800 /* Didn't match. Try at the next position (if there is one) */ 2801 s++; 2802 if (UNLIKELY(s >= reginfo->strend)) { 2803 break; 2804 } 2805 } 2806 2807 switch((bound_type) FLAGS(c)) { 2808 case TRADITIONAL_BOUND: /* Should have already been handled */ 2809 assert(0); 2810 break; 2811 2812 case GCB_BOUND: 2813 /* Not utf8. Everything is a GCB except between CR and LF */ 2814 while (s < strend) { 2815 if ((to_complement ^ ( UCHARAT(s - 1) != '\r' 2816 || UCHARAT(s) != '\n')) 2817 && (reginfo->intuit || regtry(reginfo, &s))) 2818 { 2819 goto got_it; 2820 } 2821 s++; 2822 } 2823 2824 break; 2825 2826 case LB_BOUND: 2827 { 2828 LB_enum before = getLB_VAL_CP((U8) *(s -1)); 2829 while (s < strend) { 2830 LB_enum after = getLB_VAL_CP((U8) *s); 2831 if (to_complement ^ isLB(before, 2832 after, 2833 (U8*) reginfo->strbeg, 2834 (U8*) s, 2835 (U8*) reginfo->strend, 2836 0 /* target not utf8 */ ) 2837 && (reginfo->intuit || regtry(reginfo, &s))) 2838 { 2839 goto got_it; 2840 } 2841 before = after; 2842 s++; 2843 } 2844 } 2845 2846 break; 2847 2848 case SB_BOUND: 2849 { 2850 SB_enum before = getSB_VAL_CP((U8) *(s -1)); 2851 while (s < strend) { 2852 SB_enum after = getSB_VAL_CP((U8) *s); 2853 if ((to_complement ^ isSB(before, 2854 after, 2855 (U8*) reginfo->strbeg, 2856 (U8*) s, 2857 (U8*) reginfo->strend, 2858 0 /* target not utf8 */ )) 2859 && (reginfo->intuit || regtry(reginfo, &s))) 2860 { 2861 goto got_it; 2862 } 2863 before = after; 2864 s++; 2865 } 2866 } 2867 2868 break; 2869 2870 case WB_BOUND: 2871 { 2872 WB_enum previous = WB_UNKNOWN; 2873 WB_enum before = getWB_VAL_CP((U8) *(s -1)); 2874 while (s < strend) { 2875 WB_enum after = getWB_VAL_CP((U8) *s); 2876 if ((to_complement ^ isWB(previous, 2877 before, 2878 after, 2879 (U8*) reginfo->strbeg, 2880 (U8*) s, 2881 (U8*) reginfo->strend, 2882 0 /* target not utf8 */ )) 2883 && (reginfo->intuit || regtry(reginfo, &s))) 2884 { 2885 goto got_it; 2886 } 2887 previous = before; 2888 before = after; 2889 s++; 2890 } 2891 } 2892 } 2893 2894 /* Here are at the final position in the target string, which is a 2895 * boundary by definition, so matches, depending on other constraints. 2896 * */ 2897 if ( reginfo->intuit 2898 || (s <= reginfo->strend && regtry(reginfo, &s))) 2899 { 2900 goto got_it; 2901 } 2902 2903 break; 2904 2905 case BOUNDL_t8_pb: 2906 case BOUNDL_t8_p8: 2907 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2908 if (FLAGS(c) == TRADITIONAL_BOUND) { 2909 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, 2910 isWORDCHAR_LC_utf8_safe); 2911 break; 2912 } 2913 2914 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2915 2916 to_complement = 1; 2917 goto do_boundu_utf8; 2918 2919 case NBOUNDL_t8_pb: 2920 case NBOUNDL_t8_p8: 2921 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2922 if (FLAGS(c) == TRADITIONAL_BOUND) { 2923 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, 2924 isWORDCHAR_LC_utf8_safe); 2925 break; 2926 } 2927 2928 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2929 2930 to_complement = 1; 2931 goto do_boundu_utf8; 2932 2933 case NBOUND_t8_pb: 2934 case NBOUND_t8_p8: 2935 /* regcomp.c makes sure that these only have the traditional \b 2936 * meaning. */ 2937 assert(FLAGS(c) == TRADITIONAL_BOUND); 2938 2939 /* FALLTHROUGH */ 2940 2941 case NBOUNDU_t8_pb: 2942 case NBOUNDU_t8_p8: 2943 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2944 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, 2945 isWORDCHAR_utf8_safe); 2946 break; 2947 } 2948 2949 to_complement = 1; 2950 goto do_boundu_utf8; 2951 2952 case BOUND_t8_pb: 2953 case BOUND_t8_p8: 2954 /* regcomp.c makes sure that these only have the traditional \b 2955 * meaning. */ 2956 assert(FLAGS(c) == TRADITIONAL_BOUND); 2957 2958 /* FALLTHROUGH */ 2959 2960 case BOUNDU_t8_pb: 2961 case BOUNDU_t8_p8: 2962 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2963 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); 2964 break; 2965 } 2966 2967 do_boundu_utf8: 2968 if (s == reginfo->strbeg) { 2969 if (reginfo->intuit || regtry(reginfo, &s)) 2970 { 2971 goto got_it; 2972 } 2973 2974 /* Didn't match. Try at the next position (if there is one) */ 2975 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2976 if (UNLIKELY(s >= reginfo->strend)) { 2977 break; 2978 } 2979 } 2980 2981 switch((bound_type) FLAGS(c)) { 2982 case TRADITIONAL_BOUND: /* Should have already been handled */ 2983 assert(0); 2984 break; 2985 2986 case GCB_BOUND: 2987 { 2988 GCB_enum before = getGCB_VAL_UTF8( 2989 reghop3((U8*)s, -1, 2990 (U8*)(reginfo->strbeg)), 2991 (U8*) reginfo->strend); 2992 while (s < strend) { 2993 GCB_enum after = getGCB_VAL_UTF8((U8*) s, 2994 (U8*) reginfo->strend); 2995 if ( (to_complement ^ isGCB(before, 2996 after, 2997 (U8*) reginfo->strbeg, 2998 (U8*) s, 2999 1 /* target is utf8 */ )) 3000 && (reginfo->intuit || regtry(reginfo, &s))) 3001 { 3002 goto got_it; 3003 } 3004 before = after; 3005 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3006 } 3007 } 3008 break; 3009 3010 case LB_BOUND: 3011 { 3012 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, 3013 -1, 3014 (U8*)(reginfo->strbeg)), 3015 (U8*) reginfo->strend); 3016 while (s < strend) { 3017 LB_enum after = getLB_VAL_UTF8((U8*) s, 3018 (U8*) reginfo->strend); 3019 if (to_complement ^ isLB(before, 3020 after, 3021 (U8*) reginfo->strbeg, 3022 (U8*) s, 3023 (U8*) reginfo->strend, 3024 1 /* target is utf8 */ ) 3025 && (reginfo->intuit || regtry(reginfo, &s))) 3026 { 3027 goto got_it; 3028 } 3029 before = after; 3030 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3031 } 3032 } 3033 3034 break; 3035 3036 case SB_BOUND: 3037 { 3038 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s, 3039 -1, 3040 (U8*)(reginfo->strbeg)), 3041 (U8*) reginfo->strend); 3042 while (s < strend) { 3043 SB_enum after = getSB_VAL_UTF8((U8*) s, 3044 (U8*) reginfo->strend); 3045 if ((to_complement ^ isSB(before, 3046 after, 3047 (U8*) reginfo->strbeg, 3048 (U8*) s, 3049 (U8*) reginfo->strend, 3050 1 /* target is utf8 */ )) 3051 && (reginfo->intuit || regtry(reginfo, &s))) 3052 { 3053 goto got_it; 3054 } 3055 before = after; 3056 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3057 } 3058 } 3059 3060 break; 3061 3062 case WB_BOUND: 3063 { 3064 /* We are at a boundary between char_sub_0 and char_sub_1. 3065 * We also keep track of the value for char_sub_-1 as we 3066 * loop through the line. Context may be needed to make a 3067 * determination, and if so, this can save having to 3068 * recalculate it */ 3069 WB_enum previous = WB_UNKNOWN; 3070 WB_enum before = getWB_VAL_UTF8( 3071 reghop3((U8*)s, 3072 -1, 3073 (U8*)(reginfo->strbeg)), 3074 (U8*) reginfo->strend); 3075 while (s < strend) { 3076 WB_enum after = getWB_VAL_UTF8((U8*) s, 3077 (U8*) reginfo->strend); 3078 if ((to_complement ^ isWB(previous, 3079 before, 3080 after, 3081 (U8*) reginfo->strbeg, 3082 (U8*) s, 3083 (U8*) reginfo->strend, 3084 1 /* target is utf8 */ )) 3085 && (reginfo->intuit || regtry(reginfo, &s))) 3086 { 3087 goto got_it; 3088 } 3089 previous = before; 3090 before = after; 3091 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3092 } 3093 } 3094 } 3095 3096 /* Here are at the final position in the target string, which is a 3097 * boundary by definition, so matches, depending on other constraints. 3098 * */ 3099 3100 if ( reginfo->intuit 3101 || (s <= reginfo->strend && regtry(reginfo, &s))) 3102 { 3103 goto got_it; 3104 } 3105 break; 3106 3107 case LNBREAK_t8_pb: 3108 case LNBREAK_t8_p8: 3109 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend)); 3110 break; 3111 3112 case LNBREAK_tb_pb: 3113 case LNBREAK_tb_p8: 3114 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend)); 3115 break; 3116 3117 /* The argument to all the POSIX node types is the class number to pass 3118 * to generic_isCC_() to build a mask for searching in PL_charclass[] */ 3119 3120 case NPOSIXL_t8_pb: 3121 case NPOSIXL_t8_p8: 3122 to_complement = 1; 3123 /* FALLTHROUGH */ 3124 3125 case POSIXL_t8_pb: 3126 case POSIXL_t8_p8: 3127 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 3128 REXEC_FBC_UTF8_CLASS_SCAN( 3129 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, 3130 (U8 *) strend))); 3131 break; 3132 3133 case NPOSIXL_tb_pb: 3134 case NPOSIXL_tb_p8: 3135 to_complement = 1; 3136 /* FALLTHROUGH */ 3137 3138 case POSIXL_tb_pb: 3139 case POSIXL_tb_p8: 3140 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 3141 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3142 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); 3143 break; 3144 3145 case NPOSIXA_t8_pb: 3146 case NPOSIXA_t8_p8: 3147 /* The complement of something that matches only ASCII matches all 3148 * non-ASCII, plus everything in ASCII that isn't in the class. */ 3149 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend) 3150 || ! generic_isCC_A_(*s, FLAGS(c))); 3151 break; 3152 3153 case POSIXA_t8_pb: 3154 case POSIXA_t8_p8: 3155 /* Don't need to worry about utf8, as it can match only a single 3156 * byte invariant character. But we do anyway for performance reasons, 3157 * as otherwise we would have to examine all the continuation 3158 * characters */ 3159 REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c))); 3160 break; 3161 3162 case NPOSIXD_tb_pb: 3163 case NPOSIXD_tb_p8: 3164 case NPOSIXA_tb_pb: 3165 case NPOSIXA_tb_p8: 3166 to_complement = 1; 3167 /* FALLTHROUGH */ 3168 3169 case POSIXD_tb_pb: 3170 case POSIXD_tb_p8: 3171 case POSIXA_tb_pb: 3172 case POSIXA_tb_p8: 3173 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3174 to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c)))); 3175 break; 3176 3177 case NPOSIXU_tb_pb: 3178 case NPOSIXU_tb_p8: 3179 to_complement = 1; 3180 /* FALLTHROUGH */ 3181 3182 case POSIXU_tb_pb: 3183 case POSIXU_tb_p8: 3184 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3185 to_complement ^ cBOOL(generic_isCC_(*s, 3186 FLAGS(c)))); 3187 break; 3188 3189 case NPOSIXD_t8_pb: 3190 case NPOSIXD_t8_p8: 3191 case NPOSIXU_t8_pb: 3192 case NPOSIXU_t8_p8: 3193 to_complement = 1; 3194 /* FALLTHROUGH */ 3195 3196 case POSIXD_t8_pb: 3197 case POSIXD_t8_p8: 3198 case POSIXU_t8_pb: 3199 case POSIXU_t8_p8: 3200 classnum = (char_class_number_) FLAGS(c); 3201 switch (classnum) { 3202 default: 3203 REXEC_FBC_UTF8_CLASS_SCAN( 3204 to_complement ^ cBOOL(_invlist_contains_cp( 3205 PL_XPosix_ptrs[classnum], 3206 utf8_to_uvchr_buf((U8 *) s, 3207 (U8 *) strend, 3208 NULL)))); 3209 break; 3210 3211 case CC_ENUM_SPACE_: 3212 REXEC_FBC_UTF8_CLASS_SCAN( 3213 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend))); 3214 break; 3215 3216 case CC_ENUM_BLANK_: 3217 REXEC_FBC_UTF8_CLASS_SCAN( 3218 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend))); 3219 break; 3220 3221 case CC_ENUM_XDIGIT_: 3222 REXEC_FBC_UTF8_CLASS_SCAN( 3223 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend))); 3224 break; 3225 3226 case CC_ENUM_VERTSPACE_: 3227 REXEC_FBC_UTF8_CLASS_SCAN( 3228 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend))); 3229 break; 3230 3231 case CC_ENUM_CNTRL_: 3232 REXEC_FBC_UTF8_CLASS_SCAN( 3233 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend))); 3234 break; 3235 } 3236 break; 3237 3238 case AHOCORASICKC_tb_pb: 3239 case AHOCORASICKC_tb_p8: 3240 case AHOCORASICKC_t8_pb: 3241 case AHOCORASICKC_t8_p8: 3242 case AHOCORASICK_tb_pb: 3243 case AHOCORASICK_tb_p8: 3244 case AHOCORASICK_t8_pb: 3245 case AHOCORASICK_t8_p8: 3246 { 3247 DECL_TRIE_TYPE(c); 3248 /* what trie are we using right now */ 3249 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG1u( c ) ]; 3250 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie]; 3251 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); 3252 3253 const char *last_start = strend - trie->minlen; 3254 #ifdef DEBUGGING 3255 const char *real_start = s; 3256 #endif 3257 STRLEN maxlen = trie->maxlen; 3258 SV *sv_points; 3259 U8 **points; /* map of where we were in the input string 3260 when reading a given char. For ASCII this 3261 is unnecessary overhead as the relationship 3262 is always 1:1, but for Unicode, especially 3263 case folded Unicode this is not true. */ 3264 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 3265 U8 *bitmap=NULL; 3266 3267 3268 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3269 3270 /* We can't just allocate points here. We need to wrap it in 3271 * an SV so it gets freed properly if there is a croak while 3272 * running the match */ 3273 ENTER; 3274 SAVETMPS; 3275 sv_points=newSV(maxlen * sizeof(U8 *)); 3276 SvCUR_set(sv_points, 3277 maxlen * sizeof(U8 *)); 3278 SvPOK_on(sv_points); 3279 sv_2mortal(sv_points); 3280 points=(U8**)SvPV_nolen(sv_points ); 3281 if ( trie_type != trie_utf8_fold 3282 && (trie->bitmap || OP(c)==AHOCORASICKC) ) 3283 { 3284 if (trie->bitmap) 3285 bitmap=(U8*)trie->bitmap; 3286 else 3287 bitmap=(U8*)ANYOF_BITMAP(c); 3288 } 3289 /* this is the Aho-Corasick algorithm modified a touch 3290 to include special handling for long "unknown char" sequences. 3291 The basic idea being that we use AC as long as we are dealing 3292 with a possible matching char, when we encounter an unknown char 3293 (and we have not encountered an accepting state) we scan forward 3294 until we find a legal starting char. 3295 AC matching is basically that of trie matching, except that when 3296 we encounter a failing transition, we fall back to the current 3297 states "fail state", and try the current char again, a process 3298 we repeat until we reach the root state, state 1, or a legal 3299 transition. If we fail on the root state then we can either 3300 terminate if we have reached an accepting state previously, or 3301 restart the entire process from the beginning if we have not. 3302 3303 */ 3304 while (s <= last_start) { 3305 const U32 uniflags = UTF8_ALLOW_DEFAULT; 3306 U8 *uc = (U8*)s; 3307 U16 charid = 0; 3308 U32 base = 1; 3309 U32 state = 1; 3310 UV uvc = 0; 3311 STRLEN len = 0; 3312 STRLEN foldlen = 0; 3313 U8 *uscan = (U8*)NULL; 3314 U8 *leftmost = NULL; 3315 #ifdef DEBUGGING 3316 U32 accepted_word= 0; 3317 #endif 3318 U32 pointpos = 0; 3319 3320 while ( state && uc <= (U8*)strend ) { 3321 int failed=0; 3322 U32 word = aho->states[ state ].wordnum; 3323 3324 if( state==1 ) { 3325 if ( bitmap ) { 3326 DEBUG_TRIE_EXECUTE_r( 3327 if ( uc <= (U8*)last_start 3328 && !BITMAP_TEST(bitmap,*uc) ) 3329 { 3330 dump_exec_pos( (char *)uc, c, strend, 3331 real_start, 3332 (char *)uc, utf8_target, 0 ); 3333 Perl_re_printf( aTHX_ 3334 " Scanning for legal start char...\n"); 3335 } 3336 ); 3337 if (utf8_target) { 3338 while ( uc <= (U8*)last_start 3339 && !BITMAP_TEST(bitmap,*uc) ) 3340 { 3341 uc += UTF8SKIP(uc); 3342 } 3343 } else { 3344 while ( uc <= (U8*)last_start 3345 && ! BITMAP_TEST(bitmap,*uc) ) 3346 { 3347 uc++; 3348 } 3349 } 3350 s= (char *)uc; 3351 } 3352 if (uc >(U8*)last_start) break; 3353 } 3354 3355 if ( word ) { 3356 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) 3357 % maxlen ]; 3358 if (!leftmost || lpos < leftmost) { 3359 DEBUG_r(accepted_word=word); 3360 leftmost= lpos; 3361 } 3362 if (base==0) break; 3363 3364 } 3365 points[pointpos++ % maxlen]= uc; 3366 if (foldlen || uc < (U8*)strend) { 3367 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 3368 (U8 *) strend, uscan, len, uvc, 3369 charid, foldlen, foldbuf, 3370 uniflags); 3371 DEBUG_TRIE_EXECUTE_r({ 3372 dump_exec_pos( (char *)uc, c, strend, 3373 real_start, s, utf8_target, 0); 3374 Perl_re_printf( aTHX_ 3375 " Charid:%3u CP:%4" UVxf " ", 3376 charid, uvc); 3377 }); 3378 } 3379 else { 3380 len = 0; 3381 charid = 0; 3382 } 3383 3384 3385 do { 3386 #ifdef DEBUGGING 3387 word = aho->states[ state ].wordnum; 3388 #endif 3389 base = aho->states[ state ].trans.base; 3390 3391 DEBUG_TRIE_EXECUTE_r({ 3392 if (failed) 3393 dump_exec_pos((char *)uc, c, strend, real_start, 3394 s, utf8_target, 0 ); 3395 Perl_re_printf( aTHX_ 3396 "%sState: %4" UVxf ", word=%" UVxf, 3397 failed ? " Fail transition to " : "", 3398 (UV)state, (UV)word); 3399 }); 3400 if ( base ) { 3401 U32 tmp; 3402 I32 offset; 3403 if (charid && 3404 ( ((offset = base + charid 3405 - 1 - trie->uniquecharcount)) >= 0) 3406 && ((U32)offset < trie->lasttrans) 3407 && trie->trans[offset].check == state 3408 && (tmp=trie->trans[offset].next)) 3409 { 3410 DEBUG_TRIE_EXECUTE_r( 3411 Perl_re_printf( aTHX_ " - legal\n")); 3412 state = tmp; 3413 break; 3414 } 3415 else { 3416 DEBUG_TRIE_EXECUTE_r( 3417 Perl_re_printf( aTHX_ " - fail\n")); 3418 failed = 1; 3419 state = aho->fail[state]; 3420 } 3421 } 3422 else { 3423 /* we must be accepting here */ 3424 DEBUG_TRIE_EXECUTE_r( 3425 Perl_re_printf( aTHX_ " - accepting\n")); 3426 failed = 1; 3427 break; 3428 } 3429 } while(state); 3430 uc += len; 3431 if (failed) { 3432 if (leftmost) 3433 break; 3434 if (!state) state = 1; 3435 } 3436 } 3437 if ( aho->states[ state ].wordnum ) { 3438 U8 *lpos = points[ (pointpos 3439 - trie->wordinfo[aho->states[ state ] 3440 .wordnum].len) % maxlen ]; 3441 if (!leftmost || lpos < leftmost) { 3442 DEBUG_r(accepted_word=aho->states[ state ].wordnum); 3443 leftmost = lpos; 3444 } 3445 } 3446 if (leftmost) { 3447 s = (char*)leftmost; 3448 DEBUG_TRIE_EXECUTE_r({ 3449 Perl_re_printf( aTHX_ "Matches word #%" UVxf 3450 " at position %" IVdf ". Trying full" 3451 " pattern...\n", 3452 (UV)accepted_word, (IV)(s - real_start) 3453 ); 3454 }); 3455 if (reginfo->intuit || regtry(reginfo, &s)) { 3456 FREETMPS; 3457 LEAVE; 3458 goto got_it; 3459 } 3460 if (s < reginfo->strend) { 3461 s = HOPc(s,1); 3462 } 3463 DEBUG_TRIE_EXECUTE_r({ 3464 Perl_re_printf( aTHX_ 3465 "Pattern failed. Looking for new start" 3466 " point...\n"); 3467 }); 3468 } else { 3469 DEBUG_TRIE_EXECUTE_r( 3470 Perl_re_printf( aTHX_ "No match.\n")); 3471 break; 3472 } 3473 } 3474 FREETMPS; 3475 LEAVE; 3476 } 3477 break; 3478 3479 case EXACTFU_REQ8_t8_pb: 3480 case EXACTFUP_tb_p8: 3481 case EXACTFUP_t8_p8: 3482 case EXACTF_tb_p8: 3483 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */ 3484 case EXACTFAA_NO_TRIE_tb_p8: 3485 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8 3486 patterns */ 3487 assert(0); 3488 3489 default: 3490 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 3491 } /* End of switch on node type */ 3492 3493 return 0; 3494 3495 got_it: 3496 return s; 3497 } 3498 3499 /* set RX_SAVED_COPY, RX_SUBBEG etc. 3500 * flags have same meanings as with regexec_flags() */ 3501 3502 static void 3503 S_reg_set_capture_string(pTHX_ REGEXP * const rx, 3504 char *strbeg, 3505 char *strend, 3506 SV *sv, 3507 U32 flags, 3508 bool utf8_target) 3509 { 3510 struct regexp *const prog = ReANY(rx); 3511 3512 if (flags & REXEC_COPY_STR) { 3513 #ifdef PERL_ANY_COW 3514 if (SvCANCOW(sv)) { 3515 DEBUG_C(Perl_re_printf( aTHX_ 3516 "Copy on write: regexp capture, type %d\n", 3517 (int) SvTYPE(sv))); 3518 /* Create a new COW SV to share the match string and store 3519 * in saved_copy, unless the current COW SV in saved_copy 3520 * is valid and suitable for our purpose */ 3521 if (( RXp_SAVED_COPY(prog) 3522 && SvIsCOW(RXp_SAVED_COPY(prog)) 3523 && SvPOKp(RXp_SAVED_COPY(prog)) 3524 && SvIsCOW(sv) 3525 && SvPOKp(sv) 3526 && SvPVX(sv) == SvPVX(RXp_SAVED_COPY(prog)))) 3527 { 3528 /* just reuse saved_copy SV */ 3529 if (RXp_MATCH_COPIED(prog)) { 3530 Safefree(RXp_SUBBEG(prog)); 3531 RXp_MATCH_COPIED_off(prog); 3532 } 3533 } 3534 else { 3535 /* create new COW SV to share string */ 3536 RXp_MATCH_COPY_FREE(prog); 3537 RXp_SAVED_COPY(prog) = sv_setsv_cow(RXp_SAVED_COPY(prog), sv); 3538 } 3539 RXp_SUBBEG(prog) = (char *)SvPVX_const(RXp_SAVED_COPY(prog)); 3540 assert (SvPOKp(RXp_SAVED_COPY(prog))); 3541 RXp_SUBLEN(prog) = strend - strbeg; 3542 RXp_SUBOFFSET(prog) = 0; 3543 RXp_SUBCOFFSET(prog) = 0; 3544 } else 3545 #endif 3546 { 3547 SSize_t min = 0; 3548 SSize_t max = strend - strbeg; 3549 SSize_t sublen; 3550 3551 /* NOTE: the following if block is not used or tested 3552 * in standard builds. It is only used when PERL_SAWAMPERSAND is 3553 * defined */ 3554 3555 if ( (flags & REXEC_COPY_SKIP_POST) 3556 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3557 && !(PL_sawampersand & SAWAMPERSAND_RIGHT) 3558 ) { /* don't copy $' part of string */ 3559 SSize_t offs_end; 3560 U32 n = 0; 3561 max = -1; 3562 /* calculate the right-most part of the string covered 3563 * by a capture. Due to lookahead, this may be to 3564 * the right of $&, so we have to scan all captures */ 3565 while (n <= RXp_LASTPAREN(prog)) { 3566 if ((offs_end = RXp_OFFS_END(prog,n)) > max) 3567 max = offs_end; 3568 n++; 3569 } 3570 if (max == -1) 3571 max = (PL_sawampersand & SAWAMPERSAND_LEFT) 3572 ? RXp_OFFS_START(prog,0) 3573 : 0; 3574 assert(max >= 0 && max <= strend - strbeg); 3575 } 3576 3577 /* NOTE: the following if block is not used or tested 3578 * in standard builds. It is only used when PERL_SAWAMPERSAND is 3579 * defined */ 3580 if ( (flags & REXEC_COPY_SKIP_PRE) 3581 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3582 && !(PL_sawampersand & SAWAMPERSAND_LEFT) 3583 ) { /* don't copy $` part of string */ 3584 U32 n = 0; 3585 min = max; 3586 /* calculate the left-most part of the string covered 3587 * by a capture. Due to lookbehind, this may be to 3588 * the left of $&, so we have to scan all captures */ 3589 while (min && n <= RXp_LASTPAREN(prog)) { 3590 SSize_t start = RXp_OFFS_START(prog,n); 3591 if ( start != -1 3592 && start < min) 3593 { 3594 min = start; 3595 } 3596 n++; 3597 } 3598 if (PL_sawampersand & SAWAMPERSAND_RIGHT) { 3599 SSize_t end = RXp_OFFS_END(prog,0); 3600 if ( min > end ) 3601 min = end; 3602 } 3603 } 3604 3605 assert(min >= 0 && min <= max && min <= strend - strbeg); 3606 sublen = max - min; 3607 3608 if (RXp_MATCH_COPIED(prog)) { 3609 if (sublen > RXp_SUBLEN(prog)) 3610 RXp_SUBBEG(prog) = 3611 (char*)saferealloc(RXp_SUBBEG(prog), sublen+1); 3612 } 3613 else 3614 RXp_SUBBEG(prog) = (char*)safemalloc(sublen+1); 3615 Copy(strbeg + min, RXp_SUBBEG(prog), sublen, char); 3616 RXp_SUBBEG(prog)[sublen] = '\0'; 3617 RXp_SUBOFFSET(prog) = min; 3618 RXp_SUBLEN(prog) = sublen; 3619 RXp_MATCH_COPIED_on(prog); 3620 } 3621 RXp_SUBCOFFSET(prog) = RXp_SUBOFFSET(prog); 3622 if (RXp_SUBOFFSET(prog) && utf8_target) { 3623 /* Convert byte offset to chars. 3624 * XXX ideally should only compute this if @-/@+ 3625 * has been seen, a la PL_sawampersand ??? */ 3626 3627 /* If there's a direct correspondence between the 3628 * string which we're matching and the original SV, 3629 * then we can use the utf8 len cache associated with 3630 * the SV. In particular, it means that under //g, 3631 * sv_pos_b2u() will use the previously cached 3632 * position to speed up working out the new length of 3633 * subcoffset, rather than counting from the start of 3634 * the string each time. This stops 3635 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; 3636 * from going quadratic */ 3637 if (SvPOKp(sv) && SvPVX(sv) == strbeg) 3638 RXp_SUBCOFFSET(prog) = sv_pos_b2u_flags(sv, RXp_SUBCOFFSET(prog), 3639 SV_GMAGIC|SV_CONST_RETURN); 3640 else 3641 RXp_SUBCOFFSET(prog) = utf8_length((U8*)strbeg, 3642 (U8*)(strbeg+RXp_SUBOFFSET(prog))); 3643 } 3644 } 3645 else { 3646 RXp_MATCH_COPY_FREE(prog); 3647 RXp_SUBBEG(prog) = strbeg; 3648 RXp_SUBOFFSET(prog) = 0; 3649 RXp_SUBCOFFSET(prog) = 0; 3650 RXp_SUBLEN(prog) = strend - strbeg; 3651 } 3652 } 3653 3654 3655 3656 3657 /* 3658 - regexec_flags - match a regexp against a string 3659 */ 3660 I32 3661 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, 3662 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) 3663 /* stringarg: the point in the string at which to begin matching */ 3664 /* strend: pointer to null at end of string */ 3665 /* strbeg: real beginning of string */ 3666 /* minend: end of match must be >= minend bytes after stringarg. */ 3667 /* sv: SV being matched: only used for utf8 flag, pos() etc; string 3668 * itself is accessed via the pointers above */ 3669 /* data: May be used for some additional optimizations. 3670 Currently unused. */ 3671 /* flags: For optimizations. See REXEC_* in regexp.h */ 3672 3673 { 3674 struct regexp *const prog = ReANY(rx); 3675 char *s; 3676 regnode *c; 3677 char *startpos; 3678 SSize_t minlen; /* must match at least this many chars */ 3679 SSize_t dontbother = 0; /* how many characters not to try at end */ 3680 const bool utf8_target = cBOOL(DO_UTF8(sv)); 3681 I32 multiline; 3682 RXi_GET_DECL(prog,progi); 3683 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ 3684 regmatch_info *const reginfo = ®info_buf; 3685 regexp_paren_pair *swap = NULL; 3686 I32 oldsave; 3687 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3688 3689 PERL_ARGS_ASSERT_REGEXEC_FLAGS; 3690 PERL_UNUSED_ARG(data); 3691 3692 /* Be paranoid... */ 3693 if (prog == NULL) { 3694 Perl_croak(aTHX_ "NULL regexp parameter"); 3695 } 3696 3697 DEBUG_EXECUTE_r( 3698 debug_start_match(rx, utf8_target, stringarg, strend, 3699 "Matching"); 3700 ); 3701 3702 startpos = stringarg; 3703 3704 /* set these early as they may be used by the HOP macros below */ 3705 reginfo->strbeg = strbeg; 3706 reginfo->strend = strend; 3707 reginfo->is_utf8_target = cBOOL(utf8_target); 3708 3709 if (prog->intflags & PREGf_GPOS_SEEN) { 3710 MAGIC *mg; 3711 3712 /* set reginfo->ganch, the position where \G can match */ 3713 3714 reginfo->ganch = 3715 (flags & REXEC_IGNOREPOS) 3716 ? stringarg /* use start pos rather than pos() */ 3717 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0) 3718 /* Defined pos(): */ 3719 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) 3720 : strbeg; /* pos() not defined; use start of string */ 3721 3722 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3723 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg))); 3724 3725 /* in the presence of \G, we may need to start looking earlier in 3726 * the string than the suggested start point of stringarg: 3727 * if prog->gofs is set, then that's a known, fixed minimum 3728 * offset, such as 3729 * /..\G/: gofs = 2 3730 * /ab|c\G/: gofs = 1 3731 * or if the minimum offset isn't known, then we have to go back 3732 * to the start of the string, e.g. /w+\G/ 3733 */ 3734 3735 if (prog->intflags & PREGf_ANCH_GPOS) { 3736 if (prog->gofs) { 3737 startpos = HOPBACKc(reginfo->ganch, prog->gofs); 3738 if (!startpos || 3739 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) 3740 { 3741 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3742 "fail: ganch-gofs before earliest possible start\n")); 3743 return 0; 3744 } 3745 } 3746 else 3747 startpos = reginfo->ganch; 3748 } 3749 else if (prog->gofs) { 3750 startpos = HOPBACKc(startpos, prog->gofs); 3751 if (!startpos) 3752 startpos = strbeg; 3753 } 3754 else if (prog->intflags & PREGf_GPOS_FLOAT) 3755 startpos = strbeg; 3756 } 3757 3758 minlen = prog->minlen; 3759 if ((startpos + minlen) > strend || startpos < strbeg) { 3760 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3761 "Regex match can't succeed, so not even tried\n")); 3762 return 0; 3763 } 3764 3765 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), 3766 * which will call destuctors to reset PL_regmatch_state, free higher 3767 * PL_regmatch_slabs, and clean up regmatch_info_aux and 3768 * regmatch_info_aux_eval */ 3769 3770 oldsave = PL_savestack_ix; 3771 3772 s = startpos; 3773 3774 if ((prog->extflags & RXf_USE_INTUIT) 3775 && !(flags & REXEC_CHECKED)) 3776 { 3777 s = re_intuit_start(rx, sv, strbeg, startpos, strend, 3778 flags, NULL); 3779 if (!s) 3780 return 0; 3781 3782 if (prog->extflags & RXf_CHECK_ALL) { 3783 /* we can match based purely on the result of INTUIT. 3784 * Set up captures etc just for $& and $-[0] 3785 * (an intuit-only match wont have $1,$2,..) */ 3786 assert(!prog->nparens); 3787 3788 /* s/// doesn't like it if $& is earlier than where we asked it to 3789 * start searching (which can happen on something like /.\G/) */ 3790 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 3791 && (s < stringarg)) 3792 { 3793 /* this should only be possible under \G */ 3794 assert(prog->intflags & PREGf_GPOS_SEEN); 3795 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3796 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 3797 goto phooey; 3798 } 3799 3800 /* match via INTUIT shouldn't have any captures. 3801 * Let @-, @+, $^N know */ 3802 RXp_LASTPAREN(prog) = RXp_LASTCLOSEPAREN(prog) = 0; 3803 RXp_MATCH_UTF8_set(prog, utf8_target); 3804 SSize_t match_start = s - strbeg; 3805 SSize_t match_end = utf8_target 3806 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg 3807 : s - strbeg + prog->minlenret; 3808 CLOSE_ANY_CAPTURE(prog, 0, match_start, match_end); 3809 if ( !(flags & REXEC_NOT_FIRST) ) 3810 S_reg_set_capture_string(aTHX_ rx, 3811 strbeg, strend, 3812 sv, flags, utf8_target); 3813 3814 return 1; 3815 } 3816 } 3817 3818 multiline = prog->extflags & RXf_PMf_MULTILINE; 3819 3820 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { 3821 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3822 "String too short [regexec_flags]...\n")); 3823 goto phooey; 3824 } 3825 3826 /* Check validity of program. */ 3827 if (UCHARAT(progi->program) != REG_MAGIC) { 3828 Perl_croak(aTHX_ "corrupted regexp program"); 3829 } 3830 3831 RXp_MATCH_TAINTED_off(prog); 3832 RXp_MATCH_UTF8_set(prog, utf8_target); 3833 3834 reginfo->prog = rx; /* Yes, sorry that this is confusing. */ 3835 reginfo->intuit = 0; 3836 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 3837 reginfo->warned = FALSE; 3838 reginfo->sv = sv; 3839 reginfo->poscache_maxiter = 0; /* not yet started a countdown */ 3840 /* see how far we have to get to not match where we matched before */ 3841 reginfo->till = stringarg + minend; 3842 3843 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { 3844 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after 3845 S_cleanup_regmatch_info_aux has executed (registered by 3846 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies 3847 magic belonging to this SV. 3848 Not newSVsv, either, as it does not COW. 3849 */ 3850 reginfo->sv = newSV_type(SVt_NULL); 3851 SvSetSV_nosteal(reginfo->sv, sv); 3852 SAVEFREESV(reginfo->sv); 3853 } 3854 3855 /* reserve next 2 or 3 slots in PL_regmatch_state: 3856 * slot N+0: may currently be in use: skip it 3857 * slot N+1: use for regmatch_info_aux struct 3858 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s 3859 * slot N+3: ready for use by regmatch() 3860 */ 3861 3862 { 3863 regmatch_state *old_regmatch_state; 3864 regmatch_slab *old_regmatch_slab; 3865 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; 3866 3867 /* on first ever match, allocate first slab */ 3868 if (!PL_regmatch_slab) { 3869 Newx(PL_regmatch_slab, 1, regmatch_slab); 3870 PL_regmatch_slab->prev = NULL; 3871 PL_regmatch_slab->next = NULL; 3872 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); 3873 } 3874 3875 old_regmatch_state = PL_regmatch_state; 3876 old_regmatch_slab = PL_regmatch_slab; 3877 3878 for (i=0; i <= max; i++) { 3879 if (i == 1) 3880 reginfo->info_aux = &(PL_regmatch_state->u.info_aux); 3881 else if (i ==2) 3882 reginfo->info_aux_eval = 3883 reginfo->info_aux->info_aux_eval = 3884 &(PL_regmatch_state->u.info_aux_eval); 3885 3886 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) 3887 PL_regmatch_state = S_push_slab(aTHX); 3888 } 3889 3890 /* note initial PL_regmatch_state position; at end of match we'll 3891 * pop back to there and free any higher slabs */ 3892 3893 reginfo->info_aux->old_regmatch_state = old_regmatch_state; 3894 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; 3895 reginfo->info_aux->poscache = NULL; 3896 3897 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); 3898 3899 if ((prog->extflags & RXf_EVAL_SEEN)) 3900 S_setup_eval_state(aTHX_ reginfo); 3901 else 3902 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; 3903 } 3904 3905 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { 3906 /* We have to be careful. If the previous successful match 3907 was from this regex we don't want a subsequent partially 3908 successful match to clobber the old results. 3909 So when we detect this possibility we add a swap buffer 3910 to the re, and switch the buffer each match. If we fail, 3911 we switch it back; otherwise we leave it swapped. 3912 */ 3913 swap = RXp_OFFSp(prog); 3914 /* avoid leak if we die, or clean up anyway if match completes */ 3915 SAVEFREEPV(swap); 3916 Newxz(RXp_OFFSp(prog), (prog->nparens + 1), regexp_paren_pair); 3917 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 3918 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n", 3919 0, 3920 PTR2UV(prog), 3921 PTR2UV(swap), 3922 PTR2UV(RXp_OFFSp(prog)) 3923 )); 3924 } 3925 3926 if (prog->recurse_locinput) 3927 Zero(prog->recurse_locinput,prog->nparens + 1, char *); 3928 3929 /* Simplest case: anchored match (but not \G) need be tried only once, 3930 * or with MBOL, only at the beginning of each line. 3931 * 3932 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets 3933 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't 3934 * match at the start of the string then it won't match anywhere else 3935 * either; while with /.*.../, if it doesn't match at the beginning, 3936 * the earliest it could match is at the start of the next line */ 3937 3938 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { 3939 char *end; 3940 3941 if (regtry(reginfo, &s)) 3942 goto got_it; 3943 3944 if (!(prog->intflags & PREGf_ANCH_MBOL)) 3945 goto phooey; 3946 3947 /* didn't match at start, try at other newline positions */ 3948 3949 if (minlen) 3950 dontbother = minlen - 1; 3951 end = HOP3c(strend, -dontbother, strbeg) - 1; 3952 3953 /* skip to next newline */ 3954 3955 while (s <= end) { /* note it could be possible to match at the end of the string */ 3956 /* NB: newlines are the same in unicode as they are in latin */ 3957 if (*s++ != '\n') 3958 continue; 3959 if (prog->check_substr || prog->check_utf8) { 3960 /* note that with PREGf_IMPLICIT, intuit can only fail 3961 * or return the start position, so it's of limited utility. 3962 * Nevertheless, I made the decision that the potential for 3963 * quick fail was still worth it - DAPM */ 3964 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL); 3965 if (!s) 3966 goto phooey; 3967 } 3968 if (regtry(reginfo, &s)) 3969 goto got_it; 3970 } 3971 goto phooey; 3972 } /* end anchored search */ 3973 3974 /* anchored \G match */ 3975 if (prog->intflags & PREGf_ANCH_GPOS) 3976 { 3977 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ 3978 assert(prog->intflags & PREGf_GPOS_SEEN); 3979 /* For anchored \G, the only position it can match from is 3980 * (ganch-gofs); we already set startpos to this above; if intuit 3981 * moved us on from there, we can't possibly succeed */ 3982 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs)); 3983 if (s == startpos && regtry(reginfo, &s)) 3984 goto got_it; 3985 goto phooey; 3986 } 3987 3988 /* Messy cases: unanchored match. */ 3989 3990 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { 3991 /* we have /x+whatever/ */ 3992 /* it must be a one character string (XXXX Except is_utf8_pat?) */ 3993 char ch; 3994 #ifdef DEBUGGING 3995 int did_match = 0; 3996 #endif 3997 if (utf8_target) { 3998 if (! prog->anchored_utf8) { 3999 to_utf8_substr(prog); 4000 } 4001 ch = SvPVX_const(prog->anchored_utf8)[0]; 4002 REXEC_FBC_UTF8_SCAN( 4003 if (*s == ch) { 4004 DEBUG_EXECUTE_r( did_match = 1 ); 4005 if (regtry(reginfo, &s)) goto got_it; 4006 s += UTF8_SAFE_SKIP(s, strend); 4007 while (s < strend && *s == ch) 4008 s += UTF8SKIP(s); 4009 } 4010 ); 4011 4012 } 4013 else { 4014 if (! prog->anchored_substr) { 4015 if (! to_byte_substr(prog)) { 4016 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4017 } 4018 } 4019 ch = SvPVX_const(prog->anchored_substr)[0]; 4020 REXEC_FBC_NON_UTF8_SCAN( 4021 if (*s == ch) { 4022 DEBUG_EXECUTE_r( did_match = 1 ); 4023 if (regtry(reginfo, &s)) goto got_it; 4024 s++; 4025 while (s < strend && *s == ch) 4026 s++; 4027 } 4028 ); 4029 } 4030 DEBUG_EXECUTE_r(if (!did_match) 4031 Perl_re_printf( aTHX_ 4032 "Did not find anchored character...\n") 4033 ); 4034 } 4035 else if (prog->anchored_substr != NULL 4036 || prog->anchored_utf8 != NULL 4037 || ((prog->float_substr != NULL || prog->float_utf8 != NULL) 4038 && prog->float_max_offset < strend - s)) { 4039 SV *must; 4040 SSize_t back_max; 4041 SSize_t back_min; 4042 char *last; 4043 char *last1; /* Last position checked before */ 4044 #ifdef DEBUGGING 4045 int did_match = 0; 4046 #endif 4047 if (prog->anchored_substr || prog->anchored_utf8) { 4048 if (utf8_target) { 4049 if (! prog->anchored_utf8) { 4050 to_utf8_substr(prog); 4051 } 4052 must = prog->anchored_utf8; 4053 } 4054 else { 4055 if (! prog->anchored_substr) { 4056 if (! to_byte_substr(prog)) { 4057 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4058 } 4059 } 4060 must = prog->anchored_substr; 4061 } 4062 back_max = back_min = prog->anchored_offset; 4063 } else { 4064 if (utf8_target) { 4065 if (! prog->float_utf8) { 4066 to_utf8_substr(prog); 4067 } 4068 must = prog->float_utf8; 4069 } 4070 else { 4071 if (! prog->float_substr) { 4072 if (! to_byte_substr(prog)) { 4073 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4074 } 4075 } 4076 must = prog->float_substr; 4077 } 4078 back_max = prog->float_max_offset; 4079 back_min = prog->float_min_offset; 4080 } 4081 4082 if (back_min<0) { 4083 last = strend; 4084 } else { 4085 last = HOP3c(strend, /* Cannot start after this */ 4086 -(SSize_t)(CHR_SVLEN(must) 4087 - (SvTAIL(must) != 0) + back_min), strbeg); 4088 } 4089 if (s > reginfo->strbeg) 4090 last1 = HOPc(s, -1); 4091 else 4092 last1 = s - 1; /* bogus */ 4093 4094 /* XXXX check_substr already used to find "s", can optimize if 4095 check_substr==must. */ 4096 dontbother = 0; 4097 strend = HOPc(strend, -dontbother); 4098 while ( (s <= last) && 4099 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), 4100 (unsigned char*)strend, must, 4101 multiline ? FBMrf_MULTILINE : 0)) ) { 4102 DEBUG_EXECUTE_r( did_match = 1 ); 4103 if (HOPc(s, -back_max) > last1) { 4104 last1 = HOPc(s, -back_min); 4105 s = HOPc(s, -back_max); 4106 } 4107 else { 4108 char * const t = (last1 >= reginfo->strbeg) 4109 ? HOPc(last1, 1) : last1 + 1; 4110 4111 last1 = HOPc(s, -back_min); 4112 s = t; 4113 } 4114 if (utf8_target) { 4115 while (s <= last1) { 4116 if (regtry(reginfo, &s)) 4117 goto got_it; 4118 if (s >= last1) { 4119 s++; /* to break out of outer loop */ 4120 break; 4121 } 4122 s += UTF8SKIP(s); 4123 } 4124 } 4125 else { 4126 while (s <= last1) { 4127 if (regtry(reginfo, &s)) 4128 goto got_it; 4129 s++; 4130 } 4131 } 4132 } 4133 DEBUG_EXECUTE_r(if (!did_match) { 4134 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 4135 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 4136 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n", 4137 ((must == prog->anchored_substr || must == prog->anchored_utf8) 4138 ? "anchored" : "floating"), 4139 quoted, RE_SV_TAIL(must)); 4140 }); 4141 goto phooey; 4142 } 4143 else if ( (c = progi->regstclass) ) { 4144 if (minlen) { 4145 const OPCODE op = OP(progi->regstclass); 4146 /* don't bother with what can't match */ 4147 if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE) 4148 strend = HOPc(strend, -(minlen - 1)); 4149 } 4150 DEBUG_EXECUTE_r({ 4151 SV * const prop = sv_newmortal(); 4152 regprop(prog, prop, c, reginfo, NULL); 4153 { 4154 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), 4155 s,strend-s,PL_dump_re_max_len); 4156 Perl_re_printf( aTHX_ 4157 "Matching stclass %.*s against %s (%d bytes)\n", 4158 (int)SvCUR(prop), SvPVX_const(prop), 4159 quoted, (int)(strend - s)); 4160 } 4161 }); 4162 if (find_byclass(prog, c, s, strend, reginfo)) 4163 goto got_it; 4164 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n")); 4165 } 4166 else { 4167 dontbother = 0; 4168 if (prog->float_substr != NULL || prog->float_utf8 != NULL) { 4169 /* Trim the end. */ 4170 char *last= NULL; 4171 SV* float_real; 4172 STRLEN len; 4173 const char *little; 4174 4175 if (utf8_target) { 4176 if (! prog->float_utf8) { 4177 to_utf8_substr(prog); 4178 } 4179 float_real = prog->float_utf8; 4180 } 4181 else { 4182 if (! prog->float_substr) { 4183 if (! to_byte_substr(prog)) { 4184 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4185 } 4186 } 4187 float_real = prog->float_substr; 4188 } 4189 4190 little = SvPV_const(float_real, len); 4191 if (SvTAIL(float_real)) { 4192 /* This means that float_real contains an artificial \n on 4193 * the end due to the presence of something like this: 4194 * /foo$/ where we can match both "foo" and "foo\n" at the 4195 * end of the string. So we have to compare the end of the 4196 * string first against the float_real without the \n and 4197 * then against the full float_real with the string. We 4198 * have to watch out for cases where the string might be 4199 * smaller than the float_real or the float_real without 4200 * the \n. */ 4201 char *checkpos= strend - len; 4202 DEBUG_OPTIMISE_r( 4203 Perl_re_printf( aTHX_ 4204 "%sChecking for float_real.%s\n", 4205 PL_colors[4], PL_colors[5])); 4206 if (checkpos + 1 < strbeg) { 4207 /* can't match, even if we remove the trailing \n 4208 * string is too short to match */ 4209 DEBUG_EXECUTE_r( 4210 Perl_re_printf( aTHX_ 4211 "%sString shorter than required trailing substring, cannot match.%s\n", 4212 PL_colors[4], PL_colors[5])); 4213 goto phooey; 4214 } else if (memEQ(checkpos + 1, little, len - 1)) { 4215 /* can match, the end of the string matches without the 4216 * "\n" */ 4217 last = checkpos + 1; 4218 } else if (checkpos < strbeg) { 4219 /* cant match, string is too short when the "\n" is 4220 * included */ 4221 DEBUG_EXECUTE_r( 4222 Perl_re_printf( aTHX_ 4223 "%sString does not contain required trailing substring, cannot match.%s\n", 4224 PL_colors[4], PL_colors[5])); 4225 goto phooey; 4226 } else if (!multiline) { 4227 /* non multiline match, so compare with the "\n" at the 4228 * end of the string */ 4229 if (memEQ(checkpos, little, len)) { 4230 last= checkpos; 4231 } else { 4232 DEBUG_EXECUTE_r( 4233 Perl_re_printf( aTHX_ 4234 "%sString does not contain required trailing substring, cannot match.%s\n", 4235 PL_colors[4], PL_colors[5])); 4236 goto phooey; 4237 } 4238 } else { 4239 /* multiline match, so we have to search for a place 4240 * where the full string is located */ 4241 goto find_last; 4242 } 4243 } else { 4244 find_last: 4245 if (len) 4246 last = rninstr(s, strend, little, little + len); 4247 else 4248 last = strend; /* matching "$" */ 4249 } 4250 if (!last) { 4251 /* at one point this block contained a comment which was 4252 * probably incorrect, which said that this was a "should not 4253 * happen" case. Even if it was true when it was written I am 4254 * pretty sure it is not anymore, so I have removed the comment 4255 * and replaced it with this one. Yves */ 4256 DEBUG_EXECUTE_r( 4257 Perl_re_printf( aTHX_ 4258 "%sString does not contain required substring, cannot match.%s\n", 4259 PL_colors[4], PL_colors[5] 4260 )); 4261 goto phooey; 4262 } 4263 dontbother = strend - last + prog->float_min_offset; 4264 } 4265 if (minlen && (dontbother < minlen)) 4266 dontbother = minlen - 1; 4267 strend -= dontbother; /* this one's always in bytes! */ 4268 /* We don't know much -- general case. */ 4269 if (utf8_target) { 4270 for (;;) { 4271 if (regtry(reginfo, &s)) 4272 goto got_it; 4273 if (s >= strend) 4274 break; 4275 s += UTF8SKIP(s); 4276 }; 4277 } 4278 else { 4279 do { 4280 if (regtry(reginfo, &s)) 4281 goto got_it; 4282 } while (s++ < strend); 4283 } 4284 } 4285 4286 /* Failure. */ 4287 goto phooey; 4288 4289 got_it: 4290 /* s/// doesn't like it if $& is earlier than where we asked it to 4291 * start searching (which can happen on something like /.\G/) */ 4292 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 4293 && (RXp_OFFS_START(prog,0) < stringarg - strbeg)) 4294 { 4295 /* this should only be possible under \G */ 4296 assert(prog->intflags & PREGf_GPOS_SEEN); 4297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 4298 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 4299 goto phooey; 4300 } 4301 4302 /* clean up; this will trigger destructors that will free all slabs 4303 * above the current one, and cleanup the regmatch_info_aux 4304 * and regmatch_info_aux_eval sructs */ 4305 4306 LEAVE_SCOPE(oldsave); 4307 4308 if (RXp_PAREN_NAMES(prog)) 4309 (void)hv_iterinit(RXp_PAREN_NAMES(prog)); 4310 4311 /* make sure $`, $&, $', and $digit will work later */ 4312 if ( !(flags & REXEC_NOT_FIRST) ) 4313 S_reg_set_capture_string(aTHX_ rx, 4314 strbeg, reginfo->strend, 4315 sv, flags, utf8_target); 4316 4317 return 1; 4318 4319 phooey: 4320 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n", 4321 PL_colors[4], PL_colors[5])); 4322 4323 if (swap) { 4324 /* we failed :-( roll it back. 4325 * Since the swap buffer will be freed on scope exit which follows 4326 * shortly, restore the old captures by copying 'swap's original 4327 * data to the new offs buffer 4328 */ 4329 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 4330 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n", 4331 0, 4332 PTR2UV(prog), 4333 PTR2UV(RXp_OFFSp(prog)), 4334 PTR2UV(swap) 4335 )); 4336 4337 Copy(swap, RXp_OFFSp(prog), prog->nparens + 1, regexp_paren_pair); 4338 } 4339 4340 /* clean up; this will trigger destructors that will free all slabs 4341 * above the current one, and cleanup the regmatch_info_aux 4342 * and regmatch_info_aux_eval sructs */ 4343 4344 LEAVE_SCOPE(oldsave); 4345 4346 return 0; 4347 } 4348 4349 4350 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. 4351 * Do inc before dec, in case old and new rex are the same */ 4352 #define SET_reg_curpm(Re2) \ 4353 if (reginfo->info_aux_eval) { \ 4354 (void)ReREFCNT_inc(Re2); \ 4355 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ 4356 PM_SETRE((PL_reg_curpm), (Re2)); \ 4357 } 4358 4359 4360 /* 4361 - regtry - try match at specific point 4362 4363 NOTE: *startpos may be modifed by regtry() to signal to the caller 4364 that the next match should start at a specific position in the 4365 string. The macro LAST_REGTRY_SKIPPED_FORWARD(reginfo) can be 4366 used to detect when this has happened. 4367 4368 */ 4369 STATIC bool /* 0 failure, 1 success */ 4370 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) 4371 { 4372 CHECKPOINT lastcp; 4373 REGEXP *const rx = reginfo->prog; 4374 regexp *const prog = ReANY(rx); 4375 SSize_t result; 4376 #ifdef DEBUGGING 4377 U32 depth = 0; /* used by REGCP_SET */ 4378 #endif 4379 RXi_GET_DECL(prog,progi); 4380 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4381 4382 PERL_ARGS_ASSERT_REGTRY; 4383 4384 reginfo->cutpoint=NULL; 4385 4386 RXp_OFFSp(prog)[0].start = *startposp - reginfo->strbeg; 4387 RXp_LASTPAREN(prog) = 0; 4388 RXp_LASTCLOSEPAREN(prog) = 0; 4389 4390 /* XXXX What this code is doing here?!!! There should be no need 4391 to do this again and again, RXp_LASTPAREN(prog) should take care of 4392 this! --ilya*/ 4393 4394 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 4395 * Actually, the code in regcppop() (which Ilya may be meaning by 4396 * RXp_LASTPAREN(prog)), is not needed at all by the test suite 4397 * (op/regexp, op/pat, op/split), but that code is needed otherwise 4398 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 4399 * Meanwhile, this code *is* needed for the 4400 * above-mentioned test suite tests to succeed. The common theme 4401 * on those tests seems to be returning null fields from matches. 4402 * --jhi updated by dapm */ 4403 4404 /* After encountering a variant of the issue mentioned above I think 4405 * the point Ilya was making is that if we properly unwind whenever 4406 * we set lastparen to a smaller value then we should not need to do 4407 * this every time, only when needed. So if we have tests that fail if 4408 * we remove this, then it suggests somewhere else we are improperly 4409 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and 4410 * places it is called, and related regcp() routines. - Yves */ 4411 #if 1 4412 if (prog->nparens) { 4413 regexp_paren_pair *pp = RXp_OFFSp(prog); 4414 I32 i; 4415 for (i = prog->nparens; i > (I32)RXp_LASTPAREN(prog); i--) { 4416 ++pp; 4417 pp->start = -1; 4418 pp->end = -1; 4419 } 4420 } 4421 #endif 4422 REGCP_SET(lastcp); 4423 result = regmatch(reginfo, *startposp, progi->program + 1); 4424 if (result != -1) { 4425 RXp_OFFSp(prog)[0].end = result; 4426 return 1; 4427 } 4428 if (reginfo->cutpoint) 4429 *startposp= reginfo->cutpoint; 4430 REGCP_UNWIND(lastcp); 4431 return 0; 4432 } 4433 4434 /* this is used to determine how far from the left messages like 4435 'failed...' are printed in regexec.c. It should be set such that 4436 messages are inline with the regop output that created them. 4437 */ 4438 #define REPORT_CODE_OFF 29 4439 #define INDENT_CHARS(depth) ((int)(depth) % 20) 4440 4441 #ifdef PERL_RE_BUILD_DEBUG 4442 int 4443 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...) 4444 { 4445 va_list ap; 4446 int result; 4447 PerlIO *f= Perl_debug_log; 4448 PERL_ARGS_ASSERT_RE_EXEC_INDENTF; 4449 va_start(ap, depth); 4450 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" ); 4451 result = PerlIO_vprintf(f, fmt, ap); 4452 va_end(ap); 4453 return result; 4454 } 4455 #endif /* DEBUGGING */ 4456 4457 /* grab a new slab and return the first slot in it */ 4458 4459 STATIC regmatch_state * 4460 S_push_slab(pTHX) 4461 { 4462 regmatch_slab *s = PL_regmatch_slab->next; 4463 if (!s) { 4464 Newx(s, 1, regmatch_slab); 4465 s->prev = PL_regmatch_slab; 4466 s->next = NULL; 4467 PL_regmatch_slab->next = s; 4468 } 4469 PL_regmatch_slab = s; 4470 return SLAB_FIRST(s); 4471 } 4472 4473 #ifdef DEBUGGING 4474 4475 STATIC void 4476 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, 4477 const char *start, const char *end, const char *blurb) 4478 { 4479 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; 4480 4481 PERL_ARGS_ASSERT_DEBUG_START_MATCH; 4482 4483 if (!PL_colorset) 4484 reginitcolors(); 4485 { 4486 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 4487 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len); 4488 4489 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), 4490 start, end - start, PL_dump_re_max_len); 4491 4492 Perl_re_printf( aTHX_ 4493 "%s%s REx%s %s against %s\n", 4494 PL_colors[4], blurb, PL_colors[5], s0, s1); 4495 4496 if (utf8_target||utf8_pat) 4497 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n", 4498 utf8_pat ? "pattern" : "", 4499 utf8_pat && utf8_target ? " and " : "", 4500 utf8_target ? "string" : "" 4501 ); 4502 } 4503 } 4504 4505 STATIC void 4506 S_dump_exec_pos(pTHX_ const char *locinput, 4507 const regnode *scan, 4508 const char *loc_regeol, 4509 const char *loc_bostr, 4510 const char *loc_reg_starttry, 4511 const bool utf8_target, 4512 const U32 depth 4513 ) 4514 { 4515 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; 4516 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 4517 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); 4518 /* The part of the string before starttry has one color 4519 (pref0_len chars), between starttry and current 4520 position another one (pref_len - pref0_len chars), 4521 after the current position the third one. 4522 We assume that pref0_len <= pref_len, otherwise we 4523 decrease pref0_len. */ 4524 int pref_len = (locinput - loc_bostr) > (5 + taill) - l 4525 ? (5 + taill) - l : locinput - loc_bostr; 4526 int pref0_len; 4527 4528 PERL_ARGS_ASSERT_DUMP_EXEC_POS; 4529 4530 if (utf8_target) { 4531 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) { 4532 pref_len++; 4533 } 4534 } 4535 pref0_len = pref_len - (locinput - loc_reg_starttry); 4536 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) 4537 l = ( loc_regeol - locinput > (5 + taill) - pref_len 4538 ? (5 + taill) - pref_len : loc_regeol - locinput); 4539 if (utf8_target) { 4540 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) { 4541 l--; 4542 } 4543 } 4544 if (pref0_len < 0) 4545 pref0_len = 0; 4546 if (pref0_len > pref_len) 4547 pref0_len = pref_len; 4548 { 4549 const int is_uni = utf8_target ? 1 : 0; 4550 4551 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), 4552 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5); 4553 4554 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), 4555 (locinput - pref_len + pref0_len), 4556 pref_len - pref0_len, PL_dump_re_max_len, 2, 3); 4557 4558 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), 4559 locinput, loc_regeol - locinput, 10, 0, 1); 4560 4561 const STRLEN tlen=len0+len1+len2; 4562 Perl_re_printf( aTHX_ 4563 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ", 4564 (IV)(locinput - loc_bostr), 4565 len0, s0, 4566 len1, s1, 4567 (docolor ? "" : "> <"), 4568 len2, s2, 4569 (int)(tlen > 19 ? 0 : 19 - tlen), 4570 "", 4571 (UV)depth); 4572 } 4573 } 4574 4575 #endif 4576 4577 /* reg_check_named_buff_matched() 4578 * Checks to see if a named buffer has matched. The data array of 4579 * buffer numbers corresponding to the buffer is expected to reside 4580 * in the regexp->data->data array in the slot stored in the ARG1u() of 4581 * node involved. Note that this routine doesn't actually care about the 4582 * name, that information is not preserved from compilation to execution. 4583 * Returns the index of the leftmost defined buffer with the given name 4584 * or 0 if non of the buffers matched. 4585 */ 4586 STATIC I32 4587 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) 4588 { 4589 I32 n; 4590 RXi_GET_DECL(rex,rexi); 4591 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 4592 I32 *nums=(I32*)SvPVX(sv_dat); 4593 4594 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; 4595 4596 for ( n=0; n<SvIVX(sv_dat); n++ ) { 4597 if ((I32)RXp_LASTPAREN(rex) >= nums[n] && 4598 RXp_OFFS_END(rex,nums[n]) != -1) 4599 { 4600 return nums[n]; 4601 } 4602 } 4603 return 0; 4604 } 4605 4606 static bool 4607 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, 4608 struct next_matchable_info * m, 4609 regmatch_info *reginfo) 4610 { 4611 /* This function determines various characteristics about every possible 4612 * initial match of the passed-in EXACTish <text_node>, and stores them in 4613 * <*m>. 4614 * 4615 * That includes a match string and a parallel mask, such that if you AND 4616 * the target string with the mask and compare with the match string, 4617 * you'll have a pretty good idea, perhaps even perfect, if that portion of 4618 * the target matches or not. 4619 * 4620 * The motivation behind this function is to allow the caller to set up 4621 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where 4622 * B is an arbitrary EXACTish node. To find the end of .*, we look for the 4623 * beginning oF B, which is the passed in <text_node> That's where this 4624 * function comes in. The values it returns can quickly be used to rule 4625 * out many, or all, cases of possible matches not actually being the 4626 * beginning of B, <text_node>. It is also used in regrepeat() where we 4627 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently 4628 * determine where the span of 'A's stop. 4629 * 4630 * If <text_node> is of type EXACT, there is only one possible character 4631 * that can match its first character, and so the situation is quite 4632 * simple. But things can get much more complicated if folding is 4633 * involved. It may be that the first character of an EXACTFish node 4634 * doesn't participate in any possible fold, e.g., punctuation, so it can 4635 * be matched only by itself. The vast majority of characters that are in 4636 * folds match just two things, their lower and upper-case equivalents. 4637 * But not all are like that; some have multiple possible matches, or match 4638 * sequences of more than one character. This function sorts all that out. 4639 * 4640 * It returns information about all possibilities of what the first 4641 * character(s) of <text_node> could look like. Again, if <text_node> is a 4642 * plain EXACT node, that's just the actual first bytes of the first 4643 * character; but otherwise it is the bytes, that when masked, match all 4644 * possible combinations of all the initial bytes of all the characters 4645 * that could match, folded. (Actually, this is a slight over promise. It 4646 * handles only up to the initial 5 bytes, which is enough for all Unicode 4647 * characters, but not for all non-Unicode ones.) 4648 * 4649 * Here's an example to clarify. Suppose the first character of 4650 * <text_node> is the letter 'C', and we are under /i matching. That means 4651 * 'c' also matches. The representations of these two characters differ in 4652 * just one bit, so the mask would be a zero in that position and ones in 4653 * the other 7. And the returned string would be the AND of these two 4654 * characters, and would be one byte long, since these characters are each 4655 * a single byte. ANDing the target <text_node> with this mask will yield 4656 * the returned string if and only if <text_node> begins with one of these 4657 * two characters. So, the function would also return that the definitive 4658 * length matched is 1 byte. 4659 * 4660 * Now, suppose instead of the letter 'C', <text_node> begins with the 4661 * letter 'F'. The situation is much more complicated because there are 4662 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also 4663 * begins with 'f', and hence could match. We add these into the returned 4664 * string and mask, but the result isn't definitive; the caller has to 4665 * check further if its AND and compare pass. But the failure of that 4666 * compare will quickly rule out most possible inputs. 4667 * 4668 * Much of this could be done in regcomp.c at compile time, except for 4669 * locale-dependent, and UTF-8 target dependent data. Extra data fields 4670 * could be used for one or the other eventualities. 4671 * 4672 * If this function determines that no possible character in the target 4673 * string can match, it returns FALSE; otherwise TRUE. (The FALSE 4674 * situation occurs if the first character in <text_node> requires UTF-8 to 4675 * represent, and the target string isn't in UTF-8.) 4676 * 4677 * Some analysis is in GH #18414, located at the time of this writing at: 4678 * https://github.com/Perl/perl5/issues/18414 4679 */ 4680 4681 const bool utf8_target = reginfo->is_utf8_target; 4682 bool utf8_pat = reginfo->is_utf8_pat; 4683 4684 PERL_UINT_FAST8_T i; 4685 4686 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e 4687 */ 4688 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } }; 4689 U8 lengths[MAX_MATCHES] = { 0 }; 4690 4691 U8 index_of_longest = 0; 4692 4693 U8 *pat = (U8*)STRING(text_node); 4694 Size_t pat_len = STR_LEN(text_node); 4695 U8 op = OP(text_node); 4696 4697 U8 byte_mask[5] = {0}; 4698 U8 byte_anded[5] = {0}; 4699 4700 /* There are some folds in Unicode to multiple characters. This will hold 4701 * such characters that could fold to the beginning of 'text_node' */ 4702 UV multi_fold_from = 0; 4703 4704 /* We may have to create a modified copy of the pattern */ 4705 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' }; 4706 4707 m->max_length = 0; 4708 m->min_length = 255; 4709 m->count = 0; 4710 4711 /* Even if the first character in the node can match something in Latin1, 4712 * if there is anything in the node that can't, the match must fail */ 4713 if (! utf8_target && isEXACT_REQ8(op)) { 4714 return FALSE; 4715 } 4716 4717 /* Define a temporary op for use in this function, using an existing one that 4718 * should never be a real op during execution */ 4719 #define TURKISH PSEUDO 4720 4721 /* What to do about these two nodes had to be deferred to runtime (which is 4722 * now). If the extra information we now have so indicates, turn them into 4723 * EXACTFU nodes */ 4724 if ( (op == EXACTF && utf8_target) 4725 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 4726 { 4727 if (op == EXACTFL && IN_UTF8_TURKIC_LOCALE) { 4728 op = TURKISH; 4729 } 4730 else { 4731 op = EXACTFU; 4732 } 4733 4734 /* And certain situations are better handled if we create a modified 4735 * version of the pattern */ 4736 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the 4737 specific problematic characters */ 4738 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) { 4739 4740 /* The node could start with characters that are the first ones 4741 * of a multi-character fold. */ 4742 multi_fold_from 4743 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); 4744 if (multi_fold_from) { 4745 4746 /* Here, they do form a sequence that matches the fold of a 4747 * single character. That single character then is a 4748 * possible match. Below we will look again at this, but 4749 * the code below is expecting every character in the 4750 * pattern to be folded, which the input isn't required to 4751 * be in this case. So, just fold the single character, 4752 * and the result will be in the expected form. */ 4753 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, 4754 FOLD_FLAGS_FULL); 4755 pat = mod_pat; 4756 } 4757 /* Turkish has a couple extra possibilities. */ 4758 else if ( UNLIKELY(op == TURKISH) 4759 && pat_len >= 3 4760 && isALPHA_FOLD_EQ(pat[0], 'f') 4761 && ( memBEGINs(pat + 1, pat_len - 1, 4762 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) 4763 || ( pat_len >= 4 4764 && isALPHA_FOLD_EQ(pat[1], 'f') 4765 && memBEGINs(pat + 2, pat_len - 2, 4766 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) 4767 ))) { 4768 /* The macros for finding a multi-char fold don't include 4769 * the Turkish possibilities, in which U+130 folds to 'i'. 4770 * Hard-code these. It's very unlikely that Unicode will 4771 * ever add any others. */ 4772 if (pat[1] == 'f') { 4773 pat_len = 3; 4774 Copy("ffi", mod_pat, pat_len, U8); 4775 } 4776 else { 4777 pat_len = 2; 4778 Copy("fi", mod_pat, pat_len, U8); 4779 } 4780 pat = mod_pat; 4781 } 4782 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat) 4783 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8)) 4784 && LIKELY(memNEs(pat, pat_len, 4785 LATIN_SMALL_LETTER_SHARP_S_UTF8)) 4786 && (LIKELY(op != TURKISH || *pat != 'I'))) 4787 { 4788 /* For all cases of things between 0-255, except the ones 4789 * in the conditional above, the fold is just the lower 4790 * case, which is faster than the more general case. */ 4791 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0], 4792 pat[1])); 4793 pat_len = 1; 4794 pat = mod_pat; 4795 utf8_pat = FALSE; 4796 } 4797 else { /* Code point above 255, or needs special handling */ 4798 _to_utf8_fold_flags(pat, pat + pat_len, 4799 mod_pat, &pat_len, 4800 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE); 4801 pat = mod_pat; 4802 } 4803 } 4804 } 4805 else if /* Below is not a UTF-8 pattern; there's a somewhat different 4806 set of problematic characters */ 4807 ((multi_fold_from 4808 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len))) 4809 { 4810 /* We may have to canonicalize a multi-char fold, as in the UTF-8 4811 * case */ 4812 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, 4813 FOLD_FLAGS_FULL); 4814 pat = mod_pat; 4815 } 4816 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) { 4817 mod_pat[0] = mod_pat[1] = 's'; 4818 pat_len = 2; 4819 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant 4820 chars, and speeds copying */ 4821 pat = mod_pat; 4822 } 4823 else if (LIKELY(op != TURKISH || *pat != 'I')) { 4824 mod_pat[0] = toLOWER_L1(*pat); 4825 pat_len = 1; 4826 pat = mod_pat; 4827 } 4828 } 4829 else if /* Below isn't a node that we convert to UTF-8 */ 4830 ( utf8_target 4831 && ! utf8_pat 4832 && op == EXACTFAA_NO_TRIE 4833 && *pat == LATIN_SMALL_LETTER_SHARP_S) 4834 { 4835 /* A very special case. Folding U+DF goes to U+17F under /iaa. We 4836 * did this at compile time when the pattern was UTF-8 , but otherwise 4837 * we couldn't do it earlier, because it requires a UTF-8 target for 4838 * this match to be legal. */ 4839 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1); 4840 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 4841 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8); 4842 pat = mod_pat; 4843 utf8_pat = TRUE; 4844 } 4845 4846 /* Here, we have taken care of the initial work for a few very problematic 4847 * situations, possibly creating a modified pattern. 4848 * 4849 * Now ready for the general case. We build up all the possible things 4850 * that could match the first character of the pattern into the elements of 4851 * 'matches[]' 4852 * 4853 * Everything generally matches at least itself. But if there is a 4854 * UTF8ness mismatch, we have to convert to that of the target string. */ 4855 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */ 4856 matches[0][0] = pat[0]; 4857 lengths[0] = 1; 4858 m->count++; 4859 } 4860 else if (utf8_target) { 4861 if (utf8_pat) { 4862 lengths[0] = UTF8SKIP(pat); 4863 Copy(pat, matches[0], lengths[0], U8); 4864 m->count++; 4865 } 4866 else { /* target is UTF-8, pattern isn't */ 4867 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]); 4868 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]); 4869 lengths[0] = 2; 4870 m->count++; 4871 } 4872 } 4873 else if (! utf8_pat) { /* Neither is UTF-8 */ 4874 matches[0][0] = pat[0]; 4875 lengths[0] = 1; 4876 m->count++; 4877 } 4878 else /* target isn't UTF-8; pattern is. No match possible unless the 4879 pattern's first character can fit in a byte */ 4880 if (UTF8_IS_DOWNGRADEABLE_START(*pat)) 4881 { 4882 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]); 4883 lengths[0] = 1; 4884 m->count++; 4885 } 4886 4887 /* Here we have taken care of any necessary node-type changes */ 4888 4889 if (m->count) { 4890 m->max_length = lengths[0]; 4891 m->min_length = lengths[0]; 4892 } 4893 4894 /* For non-folding nodes, there are no other possible candidate matches, 4895 * but for foldable ones, we have to look further. */ 4896 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */ 4897 UV folded; /* The first character in the pattern, folded */ 4898 U32 first_fold_from; /* A character that folds to it */ 4899 const U32 * remaining_fold_froms; /* The remaining characters that 4900 fold to it, if any */ 4901 Size_t folds_to_count; /* The total number of characters that fold to 4902 'folded' */ 4903 4904 /* If the node begins with a sequence of more than one character that 4905 * together form the fold of a single character, it is called a 4906 * 'multi-character fold', and the normal functions don't handle this 4907 * case. We set 'multi_fold_from' to the single folded-from character, 4908 * which is handled in an extra iteration below */ 4909 if (utf8_pat) { 4910 folded = valid_utf8_to_uvchr(pat, NULL); 4911 multi_fold_from 4912 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); 4913 } 4914 else { 4915 folded = *pat; 4916 4917 /* This may generate illegal combinations for things like EXACTF, 4918 * but rather than repeat the logic and exclude them here, all such 4919 * illegalities are checked for and skipped below in the loop */ 4920 multi_fold_from 4921 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len); 4922 } 4923 4924 /* Everything matches at least itself; initialize to that because the 4925 * only the branches below that set it are the ones where the number 4926 * isn't 1. */ 4927 folds_to_count = 1; 4928 4929 /* There are a few special cases for locale-dependent nodes, where the 4930 * run-time context was needed before we could know what matched */ 4931 if (UNLIKELY(op == EXACTFL) && folded < 256) { 4932 first_fold_from = PL_fold_locale[folded]; 4933 } 4934 else if ( op == EXACTFL && utf8_target && utf8_pat 4935 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8 4936 LATIN_SMALL_LETTER_LONG_S_UTF8)) 4937 { 4938 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S; 4939 } 4940 else if (UNLIKELY( op == TURKISH 4941 && ( isALPHA_FOLD_EQ(folded, 'i') 4942 || inRANGE(folded, 4943 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE, 4944 LATIN_SMALL_LETTER_DOTLESS_I)))) 4945 { /* Turkish folding requires special handling */ 4946 if (folded == 'i') 4947 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; 4948 else if (folded == 'I') 4949 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I; 4950 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) 4951 first_fold_from = 'i'; 4952 else first_fold_from = 'I'; 4953 } 4954 else { 4955 /* Here, isn't a special case: use the generic function to 4956 * calculate what folds to this */ 4957 redo_multi: 4958 /* Look up what code points (besides itself) fold to 'folded'; 4959 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */ 4960 folds_to_count = _inverse_folds(folded, &first_fold_from, 4961 &remaining_fold_froms); 4962 } 4963 4964 /* Add each character that folds to 'folded' to the list of them, 4965 * subject to limitations based on the node type and target UTF8ness. 4966 * If there was a character that folded to multiple characters, do an 4967 * extra iteration for it. (Note the extra iteration if there is a 4968 * multi-character fold) */ 4969 for (i = 0; i < folds_to_count 4970 + UNLIKELY(multi_fold_from != 0); i++) 4971 { 4972 UV fold_from = 0; 4973 4974 if (i >= folds_to_count) { /* Final iteration: handle the 4975 multi-char */ 4976 fold_from = multi_fold_from; 4977 } 4978 else if (i == 0) { 4979 fold_from = first_fold_from; 4980 } 4981 else if (i < folds_to_count) { 4982 fold_from = remaining_fold_froms[i-1]; 4983 } 4984 4985 if (folded == fold_from) { /* We already added the character 4986 itself */ 4987 continue; 4988 } 4989 4990 /* EXACTF doesn't have any non-ascii folds */ 4991 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) { 4992 continue; 4993 } 4994 4995 /* In /iaa nodes, neither or both must be ASCII to be a legal fold 4996 * */ 4997 if ( isASCII(folded) != isASCII(fold_from) 4998 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE)) 4999 5000 { 5001 continue; 5002 } 5003 5004 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8 5005 * locale, but those have been converted to EXACTFU above) */ 5006 if ( op == EXACTFL 5007 && (folded < 256) != (fold_from < 256)) 5008 { 5009 continue; 5010 } 5011 5012 /* If this triggers, it likely is because of the unlikely case 5013 * where a new Unicode standard has changed what MAX_MATCHES should 5014 * be set to */ 5015 assert(m->count < MAX_MATCHES); 5016 5017 /* Add this character to the list of possible matches */ 5018 if (utf8_target) { 5019 uvchr_to_utf8(matches[(U8) m->count], fold_from); 5020 lengths[m->count] = UVCHR_SKIP(fold_from); 5021 m->count++; 5022 } 5023 else { /* Non-UTF8 target: no code point above 255 can appear in it 5024 */ 5025 if (fold_from > 255) { 5026 continue; 5027 } 5028 5029 matches[m->count][0] = fold_from; 5030 lengths[m->count] = 1; 5031 m->count++; 5032 } 5033 5034 /* Update min and mlengths */ 5035 if (m->min_length > lengths[m->count-1]) { 5036 m->min_length = lengths[m->count-1]; 5037 } 5038 5039 if (m->max_length < lengths[m->count-1]) { 5040 index_of_longest = m->count - 1; 5041 m->max_length = lengths[index_of_longest]; 5042 } 5043 } /* looped through each potential fold */ 5044 5045 /* If there is something that folded to an initial multi-character 5046 * fold, repeat, using it. This catches some edge cases. An example 5047 * of one is /ss/i when UTF-8 encoded. The function 5048 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF 5049 * (LATIN SMALL SHARP S). If it returned a list of characters, this 5050 * code wouldn't be needed. But since it doesn't, we have to look what 5051 * folds to the U+DF. In this case, U+1E9E does, and has to be added. 5052 * */ 5053 if (multi_fold_from) { 5054 folded = multi_fold_from; 5055 multi_fold_from = 0; 5056 goto redo_multi; 5057 } 5058 } /* End of finding things that participate in this fold */ 5059 5060 if (m->count == 0) { /* If nothing found, can't match */ 5061 m->min_length = 0; 5062 return FALSE; 5063 } 5064 5065 /* Have calculated all possible matches. Now calculate the mask and AND 5066 * values */ 5067 m->initial_exact = 0; 5068 m->initial_definitive = 0; 5069 5070 { 5071 unsigned int mask_ones = 0; 5072 unsigned int possible_ones = 0; 5073 U8 j; 5074 5075 /* For each byte that is in all possible matches ... */ 5076 for (j = 0; j < MIN(m->min_length, 5); j++) { 5077 5078 /* Initialize the accumulator for this byte */ 5079 byte_mask[j] = 0xFF; 5080 byte_anded[j] = matches[0][j]; 5081 5082 /* Then the rest of the rows (folds). The mask is based on, like, 5083 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0 5084 * where they differ. */ 5085 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) { 5086 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]); 5087 byte_anded[j] &= matches[i][j]; 5088 } 5089 5090 /* Keep track of the number of initial mask bytes that are all one 5091 * bits. The code calling this can use this number to know that 5092 * a string that matches this number of bytes in the pattern is an 5093 * exact match of that pattern for this number of bytes. But also 5094 * counted are the number of initial bytes that in total have a 5095 * single zero bit. If a string matches those, masked, it must be 5096 * one of two possibilites, both of which this function has 5097 * determined are legal. (But if that single 0 is one of the 5098 * initial bits for masking a UTF-8 start byte, that could 5099 * incorrectly lead to different length strings appearing to be 5100 * equivalent, so only do this optimization when the matchables are 5101 * all the same length. This was uncovered by testing 5102 * /\x{029E}/i.) */ 5103 if (m->min_length == m->max_length) { 5104 mask_ones += PL_bitcount[byte_mask[j]]; 5105 possible_ones += 8; 5106 if (mask_ones + 1 >= possible_ones) { 5107 m->initial_definitive++; 5108 if (mask_ones >= possible_ones) { 5109 m->initial_exact++; 5110 } 5111 } 5112 } 5113 } 5114 } 5115 5116 /* The first byte is separate for speed */ 5117 m->first_byte_mask = byte_mask[0]; 5118 m->first_byte_anded = byte_anded[0]; 5119 5120 /* Then pack up to the next 4 bytes into a word */ 5121 m->mask32 = m->anded32 = 0; 5122 for (i = 1; i < MIN(m->min_length, 5); i++) { 5123 U8 which = i; 5124 U8 shift = (which - 1) * 8; 5125 m->mask32 |= (U32) byte_mask[i] << shift; 5126 m->anded32 |= (U32) byte_anded[i] << shift; 5127 } 5128 5129 /* Finally, take the match strings and place them sequentially into a 5130 * one-dimensional array. (This is done to save significant space in the 5131 * structure.) Sort so the longest (presumably the least likely) is last. 5132 * XXX When this gets moved to regcomp, may want to fully sort shortest 5133 * first, but above we generally used the folded code point first, and 5134 * those tend to be no longer than their upper case values, so this is 5135 * already pretty well sorted by size. 5136 * 5137 * If the asserts fail, it's most likely because a new version of the 5138 * Unicode standard requires more space; simply increase the declaration 5139 * size. */ 5140 { 5141 U8 cur_pos = 0; 5142 U8 output_index = 0; 5143 5144 if (m->count > 1) { /* No need to sort a single entry */ 5145 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) { 5146 5147 /* Keep the same order for all but the longest. (If the 5148 * asserts fail, it could be because m->matches is declared too 5149 * short, either because of a new Unicode release, or an 5150 * overlooked test case, or it could be a bug.) */ 5151 if (i != index_of_longest) { 5152 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches)); 5153 Copy(matches[i], m->matches + cur_pos, lengths[i], U8); 5154 cur_pos += lengths[i]; 5155 m->lengths[output_index++] = lengths[i]; 5156 } 5157 } 5158 } 5159 5160 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches)); 5161 Copy(matches[index_of_longest], m->matches + cur_pos, 5162 lengths[index_of_longest], U8); 5163 5164 /* Place the longest match last */ 5165 m->lengths[output_index] = lengths[index_of_longest]; 5166 } 5167 5168 5169 return TRUE; 5170 } 5171 5172 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */ 5173 bool 5174 S_test_EXACTISH_ST(const char * loc, 5175 struct next_matchable_info info) 5176 { 5177 /* This function uses the data set up in setup_EXACTISH_ST() to see if the 5178 * bytes starting at 'loc' can match based on 'next_matchable_info' */ 5179 5180 U32 input32 = 0; 5181 5182 /* Check the first byte */ 5183 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded) 5184 return FALSE; 5185 5186 /* Pack the next up-to-4 bytes into a 32 bit word */ 5187 switch (info.min_length) { 5188 default: 5189 input32 |= (U32) ((U8) loc[4]) << 3 * 8; 5190 /* FALLTHROUGH */ 5191 case 4: 5192 input32 |= (U8) loc[3] << 2 * 8; 5193 /* FALLTHROUGH */ 5194 case 3: 5195 input32 |= (U8) loc[2] << 1 * 8; 5196 /* FALLTHROUGH */ 5197 case 2: 5198 input32 |= (U8) loc[1]; 5199 break; 5200 case 1: 5201 return TRUE; /* We already tested and passed the 0th byte */ 5202 case 0: 5203 ASSUME(0); 5204 } 5205 5206 /* And AND that with the mask and compare that with the assembled ANDED 5207 * values */ 5208 return (input32 & info.mask32) == info.anded32; 5209 } 5210 5211 STATIC bool 5212 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target) 5213 { 5214 /* returns a boolean indicating if there is a Grapheme Cluster Boundary 5215 * between the inputs. See https://www.unicode.org/reports/tr29/. */ 5216 5217 PERL_ARGS_ASSERT_ISGCB; 5218 5219 switch (GCB_table[before][after]) { 5220 case GCB_BREAKABLE: 5221 return TRUE; 5222 5223 case GCB_NOBREAK: 5224 return FALSE; 5225 5226 case GCB_RI_then_RI: 5227 { 5228 int RI_count = 1; 5229 U8 * temp_pos = (U8 *) curpos; 5230 5231 /* Do not break within emoji flag sequences. That is, do not 5232 * break between regional indicator (RI) symbols if there is an 5233 * odd number of RI characters before the break point. 5234 * GB12 sot (RI RI)* RI × RI 5235 * GB13 [^RI] (RI RI)* RI × RI */ 5236 5237 while (backup_one_GCB(strbeg, 5238 &temp_pos, 5239 utf8_target) == GCB_Regional_Indicator) 5240 { 5241 RI_count++; 5242 } 5243 5244 return RI_count % 2 != 1; 5245 } 5246 5247 case GCB_EX_then_EM: 5248 5249 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */ 5250 { 5251 U8 * temp_pos = (U8 *) curpos; 5252 GCB_enum prev; 5253 5254 do { 5255 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 5256 } 5257 while (prev == GCB_Extend); 5258 5259 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ; 5260 } 5261 5262 case GCB_Maybe_Emoji_NonBreak: 5263 5264 { 5265 5266 /* Do not break within emoji modifier sequences or emoji zwj sequences. 5267 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic} 5268 */ 5269 U8 * temp_pos = (U8 *) curpos; 5270 GCB_enum prev; 5271 5272 do { 5273 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 5274 } 5275 while (prev == GCB_Extend); 5276 5277 return prev != GCB_ExtPict_XX; 5278 } 5279 5280 default: 5281 break; 5282 } 5283 5284 #ifdef DEBUGGING 5285 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n", 5286 before, after, GCB_table[before][after]); 5287 assert(0); 5288 #endif 5289 return TRUE; 5290 } 5291 5292 STATIC GCB_enum 5293 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5294 { 5295 GCB_enum gcb; 5296 5297 PERL_ARGS_ASSERT_BACKUP_ONE_GCB; 5298 5299 if (*curpos < strbeg) { 5300 return GCB_EDGE; 5301 } 5302 5303 if (utf8_target) { 5304 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5305 U8 * prev_prev_char_pos; 5306 5307 if (! prev_char_pos) { 5308 return GCB_EDGE; 5309 } 5310 5311 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 5312 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5313 *curpos = prev_char_pos; 5314 prev_char_pos = prev_prev_char_pos; 5315 } 5316 else { 5317 *curpos = (U8 *) strbeg; 5318 return GCB_EDGE; 5319 } 5320 } 5321 else { 5322 if (*curpos - 2 < strbeg) { 5323 *curpos = (U8 *) strbeg; 5324 return GCB_EDGE; 5325 } 5326 (*curpos)--; 5327 gcb = getGCB_VAL_CP(*(*curpos - 1)); 5328 } 5329 5330 return gcb; 5331 } 5332 5333 /* Combining marks attach to most classes that precede them, but this defines 5334 * the exceptions (from TR14) */ 5335 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \ 5336 || prev == LB_Mandatory_Break \ 5337 || prev == LB_Carriage_Return \ 5338 || prev == LB_Line_Feed \ 5339 || prev == LB_Next_Line \ 5340 || prev == LB_Space \ 5341 || prev == LB_ZWSpace)) 5342 5343 STATIC bool 5344 S_isLB(pTHX_ LB_enum before, 5345 LB_enum after, 5346 const U8 * const strbeg, 5347 const U8 * const curpos, 5348 const U8 * const strend, 5349 const bool utf8_target) 5350 { 5351 U8 * temp_pos = (U8 *) curpos; 5352 LB_enum prev = before; 5353 5354 /* Is the boundary between 'before' and 'after' line-breakable? 5355 * Most of this is just a table lookup of a generated table from Unicode 5356 * rules. But some rules require context to decide, and so have to be 5357 * implemented in code */ 5358 5359 PERL_ARGS_ASSERT_ISLB; 5360 5361 /* Rule numbers in the comments below are as of Unicode 9.0 */ 5362 5363 redo: 5364 before = prev; 5365 switch (LB_table[before][after]) { 5366 case LB_BREAKABLE: 5367 return TRUE; 5368 5369 case LB_NOBREAK: 5370 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 5371 return FALSE; 5372 5373 case LB_SP_foo + LB_BREAKABLE: 5374 case LB_SP_foo + LB_NOBREAK: 5375 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 5376 5377 /* When we have something following a SP, we have to look at the 5378 * context in order to know what to do. 5379 * 5380 * SP SP should not reach here because LB7: Do not break before 5381 * spaces. (For two spaces in a row there is nothing that 5382 * overrides that) */ 5383 assert(after != LB_Space); 5384 5385 /* Here we have a space followed by a non-space. Mostly this is a 5386 * case of LB18: "Break after spaces". But there are complications 5387 * as the handling of spaces is somewhat tricky. They are in a 5388 * number of rules, which have to be applied in priority order, but 5389 * something earlier in the string can cause a rule to be skipped 5390 * and a lower priority rule invoked. A prime example is LB7 which 5391 * says don't break before a space. But rule LB8 (lower priority) 5392 * says that the first break opportunity after a ZW is after any 5393 * span of spaces immediately after it. If a ZW comes before a SP 5394 * in the input, rule LB8 applies, and not LB7. Other such rules 5395 * involve combining marks which are rules 9 and 10, but they may 5396 * override higher priority rules if they come earlier in the 5397 * string. Since we're doing random access into the middle of the 5398 * string, we have to look for rules that should get applied based 5399 * on both string position and priority. Combining marks do not 5400 * attach to either ZW nor SP, so we don't have to consider them 5401 * until later. 5402 * 5403 * To check for LB8, we have to find the first non-space character 5404 * before this span of spaces */ 5405 do { 5406 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5407 } 5408 while (prev == LB_Space); 5409 5410 /* LB8 Break before any character following a zero-width space, 5411 * even if one or more spaces intervene. 5412 * ZW SP* ÷ 5413 * So if we have a ZW just before this span, and to get here this 5414 * is the final space in the span. */ 5415 if (prev == LB_ZWSpace) { 5416 return TRUE; 5417 } 5418 5419 /* Here, not ZW SP+. There are several rules that have higher 5420 * priority than LB18 and can be resolved now, as they don't depend 5421 * on anything earlier in the string (except ZW, which we have 5422 * already handled). One of these rules is LB11 Do not break 5423 * before Word joiner, but we have specially encoded that in the 5424 * lookup table so it is caught by the single test below which 5425 * catches the other ones. */ 5426 if (LB_table[LB_Space][after] - LB_SP_foo 5427 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN) 5428 { 5429 return FALSE; 5430 } 5431 5432 /* If we get here, we have to XXX consider combining marks. */ 5433 if (prev == LB_Combining_Mark) { 5434 5435 /* What happens with these depends on the character they 5436 * follow. */ 5437 do { 5438 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5439 } 5440 while (prev == LB_Combining_Mark); 5441 5442 /* Most times these attach to and inherit the characteristics 5443 * of that character, but not always, and when not, they are to 5444 * be treated as AL by rule LB10. */ 5445 if (! LB_CM_ATTACHES_TO(prev)) { 5446 prev = LB_Alphabetic; 5447 } 5448 } 5449 5450 /* Here, we have the character preceding the span of spaces all set 5451 * up. We follow LB18: "Break after spaces" unless the table shows 5452 * that is overridden */ 5453 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN; 5454 5455 case LB_CM_ZWJ_foo: 5456 5457 /* We don't know how to treat the CM except by looking at the first 5458 * non-CM character preceding it. ZWJ is treated as CM */ 5459 do { 5460 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5461 } 5462 while (prev == LB_Combining_Mark || prev == LB_ZWJ); 5463 5464 /* Here, 'prev' is that first earlier non-CM character. If the CM 5465 * attaches to it, then it inherits the behavior of 'prev'. If it 5466 * doesn't attach, it is to be treated as an AL */ 5467 if (! LB_CM_ATTACHES_TO(prev)) { 5468 prev = LB_Alphabetic; 5469 } 5470 5471 goto redo; 5472 5473 case LB_HY_or_BA_then_foo + LB_BREAKABLE: 5474 case LB_HY_or_BA_then_foo + LB_NOBREAK: 5475 5476 /* LB21a Don't break after Hebrew + Hyphen. 5477 * HL (HY | BA) × */ 5478 5479 if (backup_one_LB(strbeg, &temp_pos, utf8_target) 5480 == LB_Hebrew_Letter) 5481 { 5482 return FALSE; 5483 } 5484 5485 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE; 5486 5487 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE: 5488 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK: 5489 5490 /* LB25a (PR | PO) × ( OP | HY )? NU */ 5491 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) { 5492 return FALSE; 5493 } 5494 5495 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY 5496 == LB_BREAKABLE; 5497 5498 case LB_SY_or_IS_then_various + LB_BREAKABLE: 5499 case LB_SY_or_IS_then_various + LB_NOBREAK: 5500 { 5501 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */ 5502 5503 LB_enum temp = prev; 5504 do { 5505 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5506 } 5507 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric); 5508 if (temp == LB_Numeric) { 5509 return FALSE; 5510 } 5511 5512 return LB_table[prev][after] - LB_SY_or_IS_then_various 5513 == LB_BREAKABLE; 5514 } 5515 5516 case LB_various_then_PO_or_PR + LB_BREAKABLE: 5517 case LB_various_then_PO_or_PR + LB_NOBREAK: 5518 { 5519 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */ 5520 5521 LB_enum temp = prev; 5522 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis) 5523 { 5524 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5525 } 5526 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) { 5527 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5528 } 5529 if (temp == LB_Numeric) { 5530 return FALSE; 5531 } 5532 return LB_various_then_PO_or_PR; 5533 } 5534 5535 case LB_RI_then_RI + LB_NOBREAK: 5536 case LB_RI_then_RI + LB_BREAKABLE: 5537 { 5538 int RI_count = 1; 5539 5540 /* LB30a Break between two regional indicator symbols if and 5541 * only if there are an even number of regional indicators 5542 * preceding the position of the break. 5543 * 5544 * sot (RI RI)* RI × RI 5545 * [^RI] (RI RI)* RI × RI */ 5546 5547 while (backup_one_LB(strbeg, 5548 &temp_pos, 5549 utf8_target) == LB_Regional_Indicator) 5550 { 5551 RI_count++; 5552 } 5553 5554 return RI_count % 2 == 0; 5555 } 5556 5557 default: 5558 break; 5559 } 5560 5561 #ifdef DEBUGGING 5562 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n", 5563 before, after, LB_table[before][after]); 5564 assert(0); 5565 #endif 5566 return TRUE; 5567 } 5568 5569 STATIC LB_enum 5570 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 5571 { 5572 5573 LB_enum lb; 5574 5575 PERL_ARGS_ASSERT_ADVANCE_ONE_LB; 5576 5577 if (*curpos >= strend) { 5578 return LB_EDGE; 5579 } 5580 5581 if (utf8_target) { 5582 *curpos += UTF8SKIP(*curpos); 5583 if (*curpos >= strend) { 5584 return LB_EDGE; 5585 } 5586 lb = getLB_VAL_UTF8(*curpos, strend); 5587 } 5588 else { 5589 (*curpos)++; 5590 if (*curpos >= strend) { 5591 return LB_EDGE; 5592 } 5593 lb = getLB_VAL_CP(**curpos); 5594 } 5595 5596 return lb; 5597 } 5598 5599 STATIC LB_enum 5600 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5601 { 5602 LB_enum lb; 5603 5604 PERL_ARGS_ASSERT_BACKUP_ONE_LB; 5605 5606 if (*curpos < strbeg) { 5607 return LB_EDGE; 5608 } 5609 5610 if (utf8_target) { 5611 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5612 U8 * prev_prev_char_pos; 5613 5614 if (! prev_char_pos) { 5615 return LB_EDGE; 5616 } 5617 5618 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 5619 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5620 *curpos = prev_char_pos; 5621 prev_char_pos = prev_prev_char_pos; 5622 } 5623 else { 5624 *curpos = (U8 *) strbeg; 5625 return LB_EDGE; 5626 } 5627 } 5628 else { 5629 if (*curpos - 2 < strbeg) { 5630 *curpos = (U8 *) strbeg; 5631 return LB_EDGE; 5632 } 5633 (*curpos)--; 5634 lb = getLB_VAL_CP(*(*curpos - 1)); 5635 } 5636 5637 return lb; 5638 } 5639 5640 STATIC bool 5641 S_isSB(pTHX_ SB_enum before, 5642 SB_enum after, 5643 const U8 * const strbeg, 5644 const U8 * const curpos, 5645 const U8 * const strend, 5646 const bool utf8_target) 5647 { 5648 /* returns a boolean indicating if there is a Sentence Boundary Break 5649 * between the inputs. See https://www.unicode.org/reports/tr29/ */ 5650 5651 U8 * lpos = (U8 *) curpos; 5652 bool has_para_sep = FALSE; 5653 bool has_sp = FALSE; 5654 5655 PERL_ARGS_ASSERT_ISSB; 5656 5657 /* Break at the start and end of text. 5658 SB1. sot ÷ 5659 SB2. ÷ eot 5660 But unstated in Unicode is don't break if the text is empty */ 5661 if (before == SB_EDGE || after == SB_EDGE) { 5662 return before != after; 5663 } 5664 5665 /* SB 3: Do not break within CRLF. */ 5666 if (before == SB_CR && after == SB_LF) { 5667 return FALSE; 5668 } 5669 5670 /* Break after paragraph separators. CR and LF are considered 5671 * so because Unicode views text as like word processing text where there 5672 * are no newlines except between paragraphs, and the word processor takes 5673 * care of wrapping without there being hard line-breaks in the text *./ 5674 SB4. Sep | CR | LF ÷ */ 5675 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 5676 return TRUE; 5677 } 5678 5679 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF. 5680 * (See Section 6.2, Replacing Ignore Rules.) 5681 SB5. X (Extend | Format)* → X */ 5682 if (after == SB_Extend || after == SB_Format) { 5683 5684 /* Implied is that these characters attach to everything 5685 * immediately prior to them except for those separator-type 5686 * characters. And the rules earlier have already handled the case 5687 * when one of those immediately precedes the extend char */ 5688 return FALSE; 5689 } 5690 5691 if (before == SB_Extend || before == SB_Format) { 5692 U8 * temp_pos = lpos; 5693 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5694 if ( backup != SB_EDGE 5695 && backup != SB_Sep 5696 && backup != SB_CR 5697 && backup != SB_LF) 5698 { 5699 before = backup; 5700 lpos = temp_pos; 5701 } 5702 5703 /* Here, both 'before' and 'backup' are these types; implied is that we 5704 * don't break between them */ 5705 if (backup == SB_Extend || backup == SB_Format) { 5706 return FALSE; 5707 } 5708 } 5709 5710 /* Do not break after ambiguous terminators like period, if they are 5711 * immediately followed by a number or lowercase letter, if they are 5712 * between uppercase letters, if the first following letter (optionally 5713 * after certain punctuation) is lowercase, or if they are followed by 5714 * "continuation" punctuation such as comma, colon, or semicolon. For 5715 * example, a period may be an abbreviation or numeric period, and thus may 5716 * not mark the end of a sentence. 5717 5718 * SB6. ATerm × Numeric */ 5719 if (before == SB_ATerm && after == SB_Numeric) { 5720 return FALSE; 5721 } 5722 5723 /* SB7. (Upper | Lower) ATerm × Upper */ 5724 if (before == SB_ATerm && after == SB_Upper) { 5725 U8 * temp_pos = lpos; 5726 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5727 if (backup == SB_Upper || backup == SB_Lower) { 5728 return FALSE; 5729 } 5730 } 5731 5732 /* The remaining rules that aren't the final one, all require an STerm or 5733 * an ATerm after having backed up over some Close* Sp*, and in one case an 5734 * optional Paragraph separator, although one rule doesn't have any Sp's in it. 5735 * So do that backup now, setting flags if either Sp or a paragraph 5736 * separator are found */ 5737 5738 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 5739 has_para_sep = TRUE; 5740 before = backup_one_SB(strbeg, &lpos, utf8_target); 5741 } 5742 5743 if (before == SB_Sp) { 5744 has_sp = TRUE; 5745 do { 5746 before = backup_one_SB(strbeg, &lpos, utf8_target); 5747 } 5748 while (before == SB_Sp); 5749 } 5750 5751 while (before == SB_Close) { 5752 before = backup_one_SB(strbeg, &lpos, utf8_target); 5753 } 5754 5755 /* The next few rules apply only when the backed-up-to is an ATerm, and in 5756 * most cases an STerm */ 5757 if (before == SB_STerm || before == SB_ATerm) { 5758 5759 /* So, here the lhs matches 5760 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 5761 * and we have set flags if we found an Sp, or the optional Sep,CR,LF. 5762 * The rules that apply here are: 5763 * 5764 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR 5765 | LF | STerm | ATerm) )* Lower 5766 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm) 5767 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF) 5768 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF) 5769 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷ 5770 */ 5771 5772 /* And all but SB11 forbid having seen a paragraph separator */ 5773 if (! has_para_sep) { 5774 if (before == SB_ATerm) { /* SB8 */ 5775 U8 * rpos = (U8 *) curpos; 5776 SB_enum later = after; 5777 5778 while ( later != SB_OLetter 5779 && later != SB_Upper 5780 && later != SB_Lower 5781 && later != SB_Sep 5782 && later != SB_CR 5783 && later != SB_LF 5784 && later != SB_STerm 5785 && later != SB_ATerm 5786 && later != SB_EDGE) 5787 { 5788 later = advance_one_SB(&rpos, strend, utf8_target); 5789 } 5790 if (later == SB_Lower) { 5791 return FALSE; 5792 } 5793 } 5794 5795 if ( after == SB_SContinue /* SB8a */ 5796 || after == SB_STerm 5797 || after == SB_ATerm) 5798 { 5799 return FALSE; 5800 } 5801 5802 if (! has_sp) { /* SB9 applies only if there was no Sp* */ 5803 if ( after == SB_Close 5804 || after == SB_Sp 5805 || after == SB_Sep 5806 || after == SB_CR 5807 || after == SB_LF) 5808 { 5809 return FALSE; 5810 } 5811 } 5812 5813 /* SB10. This and SB9 could probably be combined some way, but khw 5814 * has decided to follow the Unicode rule book precisely for 5815 * simplified maintenance */ 5816 if ( after == SB_Sp 5817 || after == SB_Sep 5818 || after == SB_CR 5819 || after == SB_LF) 5820 { 5821 return FALSE; 5822 } 5823 } 5824 5825 /* SB11. */ 5826 return TRUE; 5827 } 5828 5829 /* Otherwise, do not break. 5830 SB12. Any × Any */ 5831 5832 return FALSE; 5833 } 5834 5835 STATIC SB_enum 5836 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 5837 { 5838 SB_enum sb; 5839 5840 PERL_ARGS_ASSERT_ADVANCE_ONE_SB; 5841 5842 if (*curpos >= strend) { 5843 return SB_EDGE; 5844 } 5845 5846 if (utf8_target) { 5847 do { 5848 *curpos += UTF8SKIP(*curpos); 5849 if (*curpos >= strend) { 5850 return SB_EDGE; 5851 } 5852 sb = getSB_VAL_UTF8(*curpos, strend); 5853 } while (sb == SB_Extend || sb == SB_Format); 5854 } 5855 else { 5856 do { 5857 (*curpos)++; 5858 if (*curpos >= strend) { 5859 return SB_EDGE; 5860 } 5861 sb = getSB_VAL_CP(**curpos); 5862 } while (sb == SB_Extend || sb == SB_Format); 5863 } 5864 5865 return sb; 5866 } 5867 5868 STATIC SB_enum 5869 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5870 { 5871 SB_enum sb; 5872 5873 PERL_ARGS_ASSERT_BACKUP_ONE_SB; 5874 5875 if (*curpos < strbeg) { 5876 return SB_EDGE; 5877 } 5878 5879 if (utf8_target) { 5880 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5881 if (! prev_char_pos) { 5882 return SB_EDGE; 5883 } 5884 5885 /* Back up over Extend and Format. curpos is always just to the right 5886 * of the character whose value we are getting */ 5887 do { 5888 U8 * prev_prev_char_pos; 5889 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, 5890 strbeg))) 5891 { 5892 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5893 *curpos = prev_char_pos; 5894 prev_char_pos = prev_prev_char_pos; 5895 } 5896 else { 5897 *curpos = (U8 *) strbeg; 5898 return SB_EDGE; 5899 } 5900 } while (sb == SB_Extend || sb == SB_Format); 5901 } 5902 else { 5903 do { 5904 if (*curpos - 2 < strbeg) { 5905 *curpos = (U8 *) strbeg; 5906 return SB_EDGE; 5907 } 5908 (*curpos)--; 5909 sb = getSB_VAL_CP(*(*curpos - 1)); 5910 } while (sb == SB_Extend || sb == SB_Format); 5911 } 5912 5913 return sb; 5914 } 5915 5916 STATIC bool 5917 S_isWB(pTHX_ WB_enum previous, 5918 WB_enum before, 5919 WB_enum after, 5920 const U8 * const strbeg, 5921 const U8 * const curpos, 5922 const U8 * const strend, 5923 const bool utf8_target) 5924 { 5925 /* Return a boolean as to if the boundary between 'before' and 'after' is 5926 * a Unicode word break, using their published algorithm, but tailored for 5927 * Perl by treating spans of white space as one unit. Context may be 5928 * needed to make this determination. If the value for the character 5929 * before 'before' is known, it is passed as 'previous'; otherwise that 5930 * should be set to WB_UNKNOWN. The other input parameters give the 5931 * boundaries and current position in the matching of the string. That 5932 * is, 'curpos' marks the position where the character whose wb value is 5933 * 'after' begins. See https://www.unicode.org/reports/tr29/ */ 5934 5935 U8 * before_pos = (U8 *) curpos; 5936 U8 * after_pos = (U8 *) curpos; 5937 WB_enum prev = before; 5938 WB_enum next; 5939 5940 PERL_ARGS_ASSERT_ISWB; 5941 5942 /* Rule numbers in the comments below are as of Unicode 9.0 */ 5943 5944 redo: 5945 before = prev; 5946 switch (WB_table[before][after]) { 5947 case WB_BREAKABLE: 5948 return TRUE; 5949 5950 case WB_NOBREAK: 5951 return FALSE; 5952 5953 case WB_hs_then_hs: /* 2 horizontal spaces in a row */ 5954 next = advance_one_WB(&after_pos, strend, utf8_target, 5955 FALSE /* Don't skip Extend nor Format */ ); 5956 /* A space immediately preceding an Extend or Format is attached 5957 * to by them, and hence gets separated from previous spaces. 5958 * Otherwise don't break between horizontal white space */ 5959 return next == WB_Extend || next == WB_Format; 5960 5961 /* WB4 Ignore Format and Extend characters, except when they appear at 5962 * the beginning of a region of text. This code currently isn't 5963 * general purpose, but it works as the rules are currently and likely 5964 * to be laid out. The reason it works is that when 'they appear at 5965 * the beginning of a region of text', the rule is to break before 5966 * them, just like any other character. Therefore, the default rule 5967 * applies and we don't have to look in more depth. Should this ever 5968 * change, we would have to have 2 'case' statements, like in the rules 5969 * below, and backup a single character (not spacing over the extend 5970 * ones) and then see if that is one of the region-end characters and 5971 * go from there */ 5972 case WB_Ex_or_FO_or_ZWJ_then_foo: 5973 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 5974 goto redo; 5975 5976 case WB_DQ_then_HL + WB_BREAKABLE: 5977 case WB_DQ_then_HL + WB_NOBREAK: 5978 5979 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */ 5980 5981 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 5982 == WB_Hebrew_Letter) 5983 { 5984 return FALSE; 5985 } 5986 5987 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE; 5988 5989 case WB_HL_then_DQ + WB_BREAKABLE: 5990 case WB_HL_then_DQ + WB_NOBREAK: 5991 5992 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */ 5993 5994 if (advance_one_WB(&after_pos, strend, utf8_target, 5995 TRUE /* Do skip Extend and Format */ ) 5996 == WB_Hebrew_Letter) 5997 { 5998 return FALSE; 5999 } 6000 6001 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE; 6002 6003 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK: 6004 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE: 6005 6006 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet 6007 * | Single_Quote) (ALetter | Hebrew_Letter) */ 6008 6009 next = advance_one_WB(&after_pos, strend, utf8_target, 6010 TRUE /* Do skip Extend and Format */ ); 6011 6012 if (next == WB_ALetter || next == WB_Hebrew_Letter) 6013 { 6014 return FALSE; 6015 } 6016 6017 return WB_table[before][after] 6018 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE; 6019 6020 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK: 6021 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE: 6022 6023 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet 6024 * | Single_Quote) × (ALetter | Hebrew_Letter) */ 6025 6026 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 6027 if (prev == WB_ALetter || prev == WB_Hebrew_Letter) 6028 { 6029 return FALSE; 6030 } 6031 6032 return WB_table[before][after] 6033 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE; 6034 6035 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK: 6036 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE: 6037 6038 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric 6039 * */ 6040 6041 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 6042 == WB_Numeric) 6043 { 6044 return FALSE; 6045 } 6046 6047 return WB_table[before][after] 6048 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE; 6049 6050 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK: 6051 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE: 6052 6053 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */ 6054 6055 if (advance_one_WB(&after_pos, strend, utf8_target, 6056 TRUE /* Do skip Extend and Format */ ) 6057 == WB_Numeric) 6058 { 6059 return FALSE; 6060 } 6061 6062 return WB_table[before][after] 6063 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE; 6064 6065 case WB_RI_then_RI + WB_NOBREAK: 6066 case WB_RI_then_RI + WB_BREAKABLE: 6067 { 6068 int RI_count = 1; 6069 6070 /* Do not break within emoji flag sequences. That is, do not 6071 * break between regional indicator (RI) symbols if there is an 6072 * odd number of RI characters before the potential break 6073 * point. 6074 * 6075 * WB15 sot (RI RI)* RI × RI 6076 * WB16 [^RI] (RI RI)* RI × RI */ 6077 6078 while (backup_one_WB(&previous, 6079 strbeg, 6080 &before_pos, 6081 utf8_target) == WB_Regional_Indicator) 6082 { 6083 RI_count++; 6084 } 6085 6086 return RI_count % 2 != 1; 6087 } 6088 6089 default: 6090 break; 6091 } 6092 6093 #ifdef DEBUGGING 6094 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n", 6095 before, after, WB_table[before][after]); 6096 assert(0); 6097 #endif 6098 return TRUE; 6099 } 6100 6101 STATIC WB_enum 6102 S_advance_one_WB(pTHX_ U8 ** curpos, 6103 const U8 * const strend, 6104 const bool utf8_target, 6105 const bool skip_Extend_Format) 6106 { 6107 WB_enum wb; 6108 6109 PERL_ARGS_ASSERT_ADVANCE_ONE_WB; 6110 6111 if (*curpos >= strend) { 6112 return WB_EDGE; 6113 } 6114 6115 if (utf8_target) { 6116 6117 /* Advance over Extend and Format */ 6118 do { 6119 *curpos += UTF8SKIP(*curpos); 6120 if (*curpos >= strend) { 6121 return WB_EDGE; 6122 } 6123 wb = getWB_VAL_UTF8(*curpos, strend); 6124 } while ( skip_Extend_Format 6125 && (wb == WB_Extend || wb == WB_Format)); 6126 } 6127 else { 6128 do { 6129 (*curpos)++; 6130 if (*curpos >= strend) { 6131 return WB_EDGE; 6132 } 6133 wb = getWB_VAL_CP(**curpos); 6134 } while ( skip_Extend_Format 6135 && (wb == WB_Extend || wb == WB_Format)); 6136 } 6137 6138 return wb; 6139 } 6140 6141 STATIC WB_enum 6142 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 6143 { 6144 WB_enum wb; 6145 6146 PERL_ARGS_ASSERT_BACKUP_ONE_WB; 6147 6148 /* If we know what the previous character's break value is, don't have 6149 * to look it up */ 6150 if (*previous != WB_UNKNOWN) { 6151 wb = *previous; 6152 6153 /* But we need to move backwards by one */ 6154 if (utf8_target) { 6155 *curpos = reghopmaybe3(*curpos, -1, strbeg); 6156 if (! *curpos) { 6157 *previous = WB_EDGE; 6158 *curpos = (U8 *) strbeg; 6159 } 6160 else { 6161 *previous = WB_UNKNOWN; 6162 } 6163 } 6164 else { 6165 (*curpos)--; 6166 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN; 6167 } 6168 6169 /* And we always back up over these three types */ 6170 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) { 6171 return wb; 6172 } 6173 } 6174 6175 if (*curpos < strbeg) { 6176 return WB_EDGE; 6177 } 6178 6179 if (utf8_target) { 6180 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 6181 if (! prev_char_pos) { 6182 return WB_EDGE; 6183 } 6184 6185 /* Back up over Extend and Format. curpos is always just to the right 6186 * of the character whose value we are getting */ 6187 do { 6188 U8 * prev_prev_char_pos; 6189 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, 6190 -1, 6191 strbeg))) 6192 { 6193 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 6194 *curpos = prev_char_pos; 6195 prev_char_pos = prev_prev_char_pos; 6196 } 6197 else { 6198 *curpos = (U8 *) strbeg; 6199 return WB_EDGE; 6200 } 6201 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ); 6202 } 6203 else { 6204 do { 6205 if (*curpos - 2 < strbeg) { 6206 *curpos = (U8 *) strbeg; 6207 return WB_EDGE; 6208 } 6209 (*curpos)--; 6210 wb = getWB_VAL_CP(*(*curpos - 1)); 6211 } while (wb == WB_Extend || wb == WB_Format); 6212 } 6213 6214 return wb; 6215 } 6216 6217 /* Macros for regmatch(), using its internal variables */ 6218 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ 6219 #define NEXTCHR_IS_EOS (nextbyte < 0) 6220 6221 #define SET_nextchr \ 6222 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) 6223 6224 #define SET_locinput(p) \ 6225 locinput = (p); \ 6226 SET_nextchr 6227 6228 #define sayYES goto yes 6229 #define sayNO goto no 6230 #define sayNO_SILENT goto no_silent 6231 6232 /* we don't use STMT_START/END here because it leads to 6233 "unreachable code" warnings, which are bogus, but distracting. */ 6234 #define CACHEsayNO \ 6235 if (ST.cache_mask) \ 6236 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ 6237 sayNO 6238 6239 #define EVAL_CLOSE_PAREN_IS(st,expr) \ 6240 ( \ 6241 ( ( st ) ) && \ 6242 ( ( st )->u.eval.close_paren ) && \ 6243 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 6244 ) 6245 6246 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \ 6247 ( \ 6248 ( ( st ) ) && \ 6249 ( ( st )->u.eval.close_paren ) && \ 6250 ( ( expr ) ) && \ 6251 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 6252 ) 6253 6254 6255 #define EVAL_CLOSE_PAREN_SET(st,expr) \ 6256 (st)->u.eval.close_paren = ( (expr) + 1 ) 6257 6258 #define EVAL_CLOSE_PAREN_CLEAR(st) \ 6259 (st)->u.eval.close_paren = 0 6260 6261 /* push a new state then goto it */ 6262 6263 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ 6264 pushinput = input; \ 6265 pusheol = eol; \ 6266 pushsr0 = sr0; \ 6267 scan = node; \ 6268 st->resume_state = state; \ 6269 goto push_state; 6270 6271 /* push a new state with success backtracking, then goto it */ 6272 6273 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ 6274 pushinput = input; \ 6275 pusheol = eol; \ 6276 pushsr0 = sr0; \ 6277 scan = node; \ 6278 st->resume_state = state; \ 6279 goto push_yes_state; 6280 6281 #define DEBUG_STATE_pp(pp) \ 6282 DEBUG_STATE_r({ \ 6283 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \ 6284 Perl_re_printf( aTHX_ \ 6285 "%*s" pp " %s%s%s%s%s\n", \ 6286 INDENT_CHARS(depth), "", \ 6287 REGNODE_NAME(st->resume_state), \ 6288 ((st==yes_state||st==mark_state) ? "[" : ""), \ 6289 ((st==yes_state) ? "Y" : ""), \ 6290 ((st==mark_state) ? "M" : ""), \ 6291 ((st==yes_state||st==mark_state) ? "]" : "") \ 6292 ); \ 6293 }); 6294 6295 /* 6296 6297 regmatch() - main matching routine 6298 6299 This is basically one big switch statement in a loop. We execute an op, 6300 set 'next' to point the next op, and continue. If we come to a point which 6301 we may need to backtrack to on failure such as (A|B|C), we push a 6302 backtrack state onto the backtrack stack. On failure, we pop the top 6303 state, and re-enter the loop at the state indicated. If there are no more 6304 states to pop, we return failure. 6305 6306 Sometimes we also need to backtrack on success; for example /A+/, where 6307 after successfully matching one A, we need to go back and try to 6308 match another one; similarly for lookahead assertions: if the assertion 6309 completes successfully, we backtrack to the state just before the assertion 6310 and then carry on. In these cases, the pushed state is marked as 6311 'backtrack on success too'. This marking is in fact done by a chain of 6312 pointers, each pointing to the previous 'yes' state. On success, we pop to 6313 the nearest yes state, discarding any intermediate failure-only states. 6314 Sometimes a yes state is pushed just to force some cleanup code to be 6315 called at the end of a successful match or submatch; e.g. (??{$re}) uses 6316 it to free the inner regex. 6317 6318 Note that failure backtracking rewinds the cursor position, while 6319 success backtracking leaves it alone. 6320 6321 A pattern is complete when the END op is executed, while a subpattern 6322 such as (?=foo) is complete when the SUCCESS op is executed. Both of these 6323 ops trigger the "pop to last yes state if any, otherwise return true" 6324 behaviour. 6325 6326 A common convention in this function is to use A and B to refer to the two 6327 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is 6328 the subpattern to be matched possibly multiple times, while B is the entire 6329 rest of the pattern. Variable and state names reflect this convention. 6330 6331 The states in the main switch are the union of ops and failure/success of 6332 substates associated with that op. For example, IFMATCH is the op 6333 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 6334 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just 6335 successfully matched A and IFMATCH_A_fail is a state saying that we have 6336 just failed to match A. Resume states always come in pairs. The backtrack 6337 state we push is marked as 'IFMATCH_A', but when that is popped, we resume 6338 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking 6339 on success or failure. 6340 6341 The struct that holds a backtracking state is actually a big union, with 6342 one variant for each major type of op. The variable st points to the 6343 top-most backtrack struct. To make the code clearer, within each 6344 block of code we #define ST to alias the relevant union. 6345 6346 Here's a concrete example of a (vastly oversimplified) IFMATCH 6347 implementation: 6348 6349 switch (state) { 6350 .... 6351 6352 #define ST st->u.ifmatch 6353 6354 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 6355 ST.foo = ...; // some state we wish to save 6356 ... 6357 // push a yes backtrack state with a resume value of 6358 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the 6359 // first node of A: 6360 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); 6361 // NOTREACHED 6362 6363 case IFMATCH_A: // we have successfully executed A; now continue with B 6364 next = B; 6365 bar = ST.foo; // do something with the preserved value 6366 break; 6367 6368 case IFMATCH_A_fail: // A failed, so the assertion failed 6369 ...; // do some housekeeping, then ... 6370 sayNO; // propagate the failure 6371 6372 #undef ST 6373 6374 ... 6375 } 6376 6377 For any old-timers reading this who are familiar with the old recursive 6378 approach, the code above is equivalent to: 6379 6380 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 6381 { 6382 int foo = ... 6383 ... 6384 if (regmatch(A)) { 6385 next = B; 6386 bar = foo; 6387 break; 6388 } 6389 ...; // do some housekeeping, then ... 6390 sayNO; // propagate the failure 6391 } 6392 6393 The topmost backtrack state, pointed to by st, is usually free. If you 6394 want to claim it, populate any ST.foo fields in it with values you wish to 6395 save, then do one of 6396 6397 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol); 6398 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol); 6399 6400 which sets that backtrack state's resume value to 'resume_state', pushes a 6401 new free entry to the top of the backtrack stack, then goes to 'node'. 6402 On backtracking, the free slot is popped, and the saved state becomes the 6403 new free state. An ST.foo field in this new top state can be temporarily 6404 accessed to retrieve values, but once the main loop is re-entered, it 6405 becomes available for reuse. 6406 6407 Note that the depth of the backtrack stack constantly increases during the 6408 left-to-right execution of the pattern, rather than going up and down with 6409 the pattern nesting. For example the stack is at its maximum at Z at the 6410 end of the pattern, rather than at X in the following: 6411 6412 /(((X)+)+)+....(Y)+....Z/ 6413 6414 The only exceptions to this are lookahead/behind assertions and the cut, 6415 (?>A), which pop all the backtrack states associated with A before 6416 continuing. 6417 6418 Backtrack state structs are allocated in slabs of about 4K in size. 6419 PL_regmatch_state and st always point to the currently active state, 6420 and PL_regmatch_slab points to the slab currently containing 6421 PL_regmatch_state. The first time regmatch() is called, the first slab is 6422 allocated, and is never freed until interpreter destruction. When the slab 6423 is full, a new one is allocated and chained to the end. At exit from 6424 regmatch(), slabs allocated since entry are freed. 6425 6426 In order to work with variable length lookbehinds, an upper limit is placed on 6427 lookbehinds which is set to where the match position is at the end of where the 6428 lookbehind would get to. Nothing in the lookbehind should match above that, 6429 except we should be able to look beyond if for things like \b, which need the 6430 next character in the string to be able to determine if this is a boundary or 6431 not. We also can't match the end of string/line unless we are also at the end 6432 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs 6433 that match a width, we have to add a condition that they are within the legal 6434 bounds of our window into the string. 6435 6436 */ 6437 6438 /* returns -1 on failure, $+[0] on success */ 6439 STATIC SSize_t 6440 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) 6441 { 6442 const bool utf8_target = reginfo->is_utf8_target; 6443 const U32 uniflags = UTF8_ALLOW_DEFAULT; 6444 REGEXP *rex_sv = reginfo->prog; 6445 regexp *rex = ReANY(rex_sv); 6446 RXi_GET_DECL(rex,rexi); 6447 /* the current state. This is a cached copy of PL_regmatch_state */ 6448 regmatch_state *st; 6449 /* cache heavy used fields of st in registers */ 6450 regnode *scan; 6451 regnode *next; 6452 U32 n = 0; /* general value; init to avoid compiler warning */ 6453 U32 utmp = 0; /* tmp variable - valid for at most one opcode */ 6454 SSize_t ln = 0; /* len or last; init to avoid compiler warning */ 6455 SSize_t endref = 0; /* offset of end of backref when ln is start */ 6456 char *locinput = startpos; 6457 char *loceol = reginfo->strend; 6458 char *pushinput; /* where to continue after a PUSH */ 6459 char *pusheol; /* where to stop matching (loceol) after a PUSH */ 6460 U8 *pushsr0; /* save starting pos of script run */ 6461 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1 6462 at EOS */ 6463 6464 bool result = 0; /* return value of S_regmatch */ 6465 U32 depth = 0; /* depth of backtrack stack */ 6466 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ 6467 const U32 max_nochange_depth = 6468 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? 6469 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; 6470 regmatch_state *yes_state = NULL; /* state to pop to on success of 6471 subpattern */ 6472 /* mark_state piggy backs on the yes_state logic so that when we unwind 6473 the stack on success we can update the mark_state as we go */ 6474 regmatch_state *mark_state = NULL; /* last mark state we have seen */ 6475 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ 6476 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ 6477 U32 state_num; 6478 bool no_final = 0; /* prevent failure from backtracking? */ 6479 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ 6480 char *startpoint = locinput; 6481 SV *popmark = NULL; /* are we looking for a mark? */ 6482 SV *sv_commit = NULL; /* last mark name seen in failure */ 6483 SV *sv_yes_mark = NULL; /* last mark name we have seen 6484 during a successful match */ 6485 U32 lastopen = 0; /* last open we saw */ 6486 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0; 6487 SV* const oreplsv = GvSVn(PL_replgv); 6488 /* these three flags are set by various ops to signal information to 6489 * the very next op. They have a useful lifetime of exactly one loop 6490 * iteration, and are not preserved or restored by state pushes/pops 6491 */ 6492 bool sw = 0; /* the condition value in (?(cond)a|b) */ 6493 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ 6494 int logical = 0; /* the following EVAL is: 6495 0: (?{...}) 6496 1: (?(?{...})X|Y) 6497 2: (??{...}) 6498 or the following IFMATCH/UNLESSM is: 6499 false: plain (?=foo) 6500 true: used as a condition: (?(?=foo)) 6501 */ 6502 PAD* last_pad = NULL; 6503 dMULTICALL; 6504 U8 gimme = G_SCALAR; 6505 CV *caller_cv = NULL; /* who called us */ 6506 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ 6507 U32 maxopenparen = 0; /* max '(' index seen so far */ 6508 int to_complement; /* Invert the result? */ 6509 char_class_number_ classnum; 6510 bool is_utf8_pat = reginfo->is_utf8_pat; 6511 bool match = FALSE; 6512 I32 orig_savestack_ix = PL_savestack_ix; 6513 U8 * script_run_begin = NULL; 6514 char *match_end= NULL; /* where a match MUST end to be considered successful */ 6515 bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */ 6516 re_fold_t folder = NULL; /* used by various EXACTish regops */ 6517 const U8 * fold_array = NULL; /* used by various EXACTish regops */ 6518 6519 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */ 6520 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL)) 6521 # define SOLARIS_BAD_OPTIMIZER 6522 const U32 *pl_charclass_dup = PL_charclass; 6523 # define PL_charclass pl_charclass_dup 6524 #endif 6525 6526 #ifdef DEBUGGING 6527 DECLARE_AND_GET_RE_DEBUG_FLAGS; 6528 #endif 6529 6530 /* protect against undef(*^R) */ 6531 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); 6532 6533 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ 6534 multicall_oldcatch = 0; 6535 PERL_UNUSED_VAR(multicall_cop); 6536 6537 PERL_ARGS_ASSERT_REGMATCH; 6538 6539 st = PL_regmatch_state; 6540 6541 /* Note that nextbyte is a byte even in UTF */ 6542 SET_nextchr; 6543 scan = prog; 6544 6545 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ 6546 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 6547 Perl_re_printf( aTHX_ "regmatch start\n" ); 6548 })); 6549 6550 while (scan != NULL) { 6551 next = scan + NEXT_OFF(scan); 6552 if (next == scan) 6553 next = NULL; 6554 state_num = OP(scan); 6555 6556 reenter_switch: 6557 DEBUG_EXECUTE_r( 6558 if (state_num <= REGNODE_MAX) { 6559 SV * const prop = sv_newmortal(); 6560 regnode *rnext = regnext(scan); 6561 6562 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 6563 regprop(rex, prop, scan, reginfo, NULL); 6564 Perl_re_printf( aTHX_ 6565 "%*s%" IVdf ":%s(%" IVdf ")\n", 6566 INDENT_CHARS(depth), "", 6567 (IV)(scan - rexi->program), 6568 SvPVX_const(prop), 6569 (REGNODE_TYPE(OP(scan)) == END || !rnext) ? 6570 0 : (IV)(rnext - rexi->program)); 6571 } 6572 ); 6573 6574 to_complement = 0; 6575 6576 SET_nextchr; 6577 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS)); 6578 6579 switch (state_num) { 6580 SV * anyofh_list; 6581 6582 case SBOL: /* /^../ and /\A../ */ 6583 if (locinput == reginfo->strbeg) 6584 break; 6585 sayNO; 6586 6587 case MBOL: /* /^../m */ 6588 if (locinput == reginfo->strbeg || 6589 (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) 6590 { 6591 break; 6592 } 6593 sayNO; 6594 6595 case GPOS: /* \G */ 6596 if (locinput == reginfo->ganch) 6597 break; 6598 sayNO; 6599 6600 case KEEPS: /* \K */ 6601 /* update the startpoint */ 6602 st->u.keeper.val = RXp_OFFS_START(rex,0); 6603 RXp_OFFSp(rex)[0].start = locinput - reginfo->strbeg; 6604 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, 6605 script_run_begin); 6606 NOT_REACHED; /* NOTREACHED */ 6607 6608 case KEEPS_next_fail: 6609 /* rollback the start point change */ 6610 RXp_OFFSp(rex)[0].start = st->u.keeper.val; 6611 sayNO_SILENT; 6612 NOT_REACHED; /* NOTREACHED */ 6613 6614 case MEOL: /* /..$/m */ 6615 if (!NEXTCHR_IS_EOS && nextbyte != '\n') 6616 sayNO; 6617 break; 6618 6619 case SEOL: /* /..$/ */ 6620 if (!NEXTCHR_IS_EOS && nextbyte != '\n') 6621 sayNO; 6622 if (reginfo->strend - locinput > 1) 6623 sayNO; 6624 break; 6625 6626 case EOS: /* \z */ 6627 if (!NEXTCHR_IS_EOS) 6628 sayNO; 6629 break; 6630 6631 case SANY: /* /./s */ 6632 if (NEXTCHR_IS_EOS || locinput >= loceol) 6633 sayNO; 6634 goto increment_locinput; 6635 6636 case REG_ANY: /* /./ */ 6637 if ( NEXTCHR_IS_EOS 6638 || locinput >= loceol 6639 || nextbyte == '\n') 6640 { 6641 sayNO; 6642 } 6643 goto increment_locinput; 6644 6645 6646 #undef ST 6647 #define ST st->u.trie 6648 case TRIEC: /* (ab|cd) with known charclass */ 6649 /* In this case the charclass data is available inline so 6650 we can fail fast without a lot of extra overhead. 6651 */ 6652 if ( ! NEXTCHR_IS_EOS 6653 && locinput < loceol 6654 && ! ANYOF_BITMAP_TEST(scan, nextbyte)) 6655 { 6656 DEBUG_EXECUTE_r( 6657 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 6658 depth, PL_colors[4], PL_colors[5]) 6659 ); 6660 sayNO_SILENT; 6661 NOT_REACHED; /* NOTREACHED */ 6662 } 6663 /* FALLTHROUGH */ 6664 case TRIE: /* (ab|cd) */ 6665 /* the basic plan of execution of the trie is: 6666 * At the beginning, run though all the states, and 6667 * find the longest-matching word. Also remember the position 6668 * of the shortest matching word. For example, this pattern: 6669 * 1 2 3 4 5 6670 * ab|a|x|abcd|abc 6671 * when matched against the string "abcde", will generate 6672 * accept states for all words except 3, with the longest 6673 * matching word being 4, and the shortest being 2 (with 6674 * the position being after char 1 of the string). 6675 * 6676 * Then for each matching word, in word order (i.e. 1,2,4,5), 6677 * we run the remainder of the pattern; on each try setting 6678 * the current position to the character following the word, 6679 * returning to try the next word on failure. 6680 * 6681 * We avoid having to build a list of words at runtime by 6682 * using a compile-time structure, wordinfo[].prev, which 6683 * gives, for each word, the previous accepting word (if any). 6684 * In the case above it would contain the mappings 1->2, 2->0, 6685 * 3->0, 4->5, 5->1. We can use this table to generate, from 6686 * the longest word (4 above), a list of all words, by 6687 * following the list of prev pointers; this gives us the 6688 * unordered list 4,5,1,2. Then given the current word we have 6689 * just tried, we can go through the list and find the 6690 * next-biggest word to try (so if we just failed on word 2, 6691 * the next in the list is 4). 6692 * 6693 * Since at runtime we don't record the matching position in 6694 * the string for each word, we have to work that out for 6695 * each word we're about to process. The wordinfo table holds 6696 * the character length of each word; given that we recorded 6697 * at the start: the position of the shortest word and its 6698 * length in chars, we just need to move the pointer the 6699 * difference between the two char lengths. Depending on 6700 * Unicode status and folding, that's cheap or expensive. 6701 * 6702 * This algorithm is optimised for the case where are only a 6703 * small number of accept states, i.e. 0,1, or maybe 2. 6704 * With lots of accepts states, and having to try all of them, 6705 * it becomes quadratic on number of accept states to find all 6706 * the next words. 6707 */ 6708 6709 { 6710 /* what type of TRIE am I? (utf8 makes this contextual) */ 6711 DECL_TRIE_TYPE(scan); 6712 6713 /* what trie are we using right now */ 6714 reg_trie_data * const trie 6715 = (reg_trie_data*)rexi->data->data[ ARG1u( scan ) ]; 6716 ST.before_paren = trie->before_paren; 6717 ST.after_paren = trie->after_paren; 6718 assert(ST.before_paren<=rex->nparens); 6719 assert(ST.after_paren<=rex->nparens); 6720 6721 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG1u( scan ) + 1 ]); 6722 U32 state = trie->startstate; 6723 6724 if (FLAGS(scan) == EXACTL || FLAGS(scan) == EXACTFLU8) { 6725 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 6726 if (utf8_target 6727 && ! NEXTCHR_IS_EOS 6728 && UTF8_IS_ABOVE_LATIN1(nextbyte) 6729 && FLAGS(scan) == EXACTL) 6730 { 6731 /* We only output for EXACTL, as we let the folder 6732 * output this message for EXACTFLU8 to avoid 6733 * duplication */ 6734 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 6735 reginfo->strend); 6736 } 6737 } 6738 if ( trie->bitmap 6739 && ( NEXTCHR_IS_EOS 6740 || locinput >= loceol 6741 || ! TRIE_BITMAP_TEST(trie, nextbyte))) 6742 { 6743 if (trie->states[ state ].wordnum) { 6744 DEBUG_EXECUTE_r( 6745 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n", 6746 depth, PL_colors[4], PL_colors[5]) 6747 ); 6748 if (!trie->jump) 6749 break; 6750 } else { 6751 DEBUG_EXECUTE_r( 6752 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 6753 depth, PL_colors[4], PL_colors[5]) 6754 ); 6755 sayNO_SILENT; 6756 } 6757 } 6758 6759 { 6760 U8 *uc = ( U8* )locinput; 6761 6762 STRLEN len = 0; 6763 STRLEN foldlen = 0; 6764 U8 *uscan = (U8*)NULL; 6765 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 6766 U32 charcount = 0; /* how many input chars we have matched */ 6767 U32 accepted = 0; /* have we seen any accepting states? */ 6768 6769 ST.jump = trie->jump; 6770 ST.j_before_paren = trie->j_before_paren; 6771 ST.j_after_paren= trie->j_after_paren; 6772 ST.me = scan; 6773 ST.firstpos = NULL; 6774 ST.longfold = FALSE; /* char longer if folded => it's harder */ 6775 ST.nextword = 0; 6776 6777 /* fully traverse the TRIE; note the position of the 6778 shortest accept state and the wordnum of the longest 6779 accept state */ 6780 6781 while ( state && uc <= (U8*)(loceol) ) { 6782 U32 base = trie->states[ state ].trans.base; 6783 UV uvc = 0; 6784 U16 charid = 0; 6785 U16 wordnum; 6786 wordnum = trie->states[ state ].wordnum; 6787 6788 if (wordnum) { /* it's an accept state */ 6789 if (!accepted) { 6790 accepted = 1; 6791 /* record first match position */ 6792 if (ST.longfold) { 6793 ST.firstpos = (U8*)locinput; 6794 ST.firstchars = 0; 6795 } 6796 else { 6797 ST.firstpos = uc; 6798 ST.firstchars = charcount; 6799 } 6800 } 6801 if (!ST.nextword || wordnum < ST.nextword) 6802 ST.nextword = wordnum; 6803 ST.topword = wordnum; 6804 } 6805 6806 DEBUG_TRIE_EXECUTE_r({ 6807 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth ); 6808 /* HERE */ 6809 PerlIO_printf( Perl_debug_log, 6810 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ", 6811 INDENT_CHARS(depth), "", PL_colors[4], 6812 (UV)state, (accepted ? 'Y' : 'N')); 6813 }); 6814 6815 /* read a char and goto next state */ 6816 if ( base && (foldlen || uc < (U8*)(loceol))) { 6817 I32 offset; 6818 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 6819 (U8 *) loceol, uscan, 6820 len, uvc, charid, foldlen, 6821 foldbuf, uniflags); 6822 charcount++; 6823 if (foldlen>0) 6824 ST.longfold = TRUE; 6825 if (charid && 6826 ( ((offset = 6827 base + charid - 1 - trie->uniquecharcount)) >= 0) 6828 6829 && ((U32)offset < trie->lasttrans) 6830 && trie->trans[offset].check == state) 6831 { 6832 state = trie->trans[offset].next; 6833 } 6834 else { 6835 state = 0; 6836 } 6837 uc += len; 6838 6839 } 6840 else { 6841 state = 0; 6842 } 6843 DEBUG_TRIE_EXECUTE_r( 6844 Perl_re_printf( aTHX_ 6845 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n", 6846 charid, uvc, (UV)state, PL_colors[5] ); 6847 ); 6848 } 6849 if (!accepted) 6850 sayNO; 6851 6852 /* calculate total number of accept states */ 6853 { 6854 U16 w = ST.topword; 6855 accepted = 0; 6856 while (w) { 6857 w = trie->wordinfo[w].prev; 6858 accepted++; 6859 } 6860 ST.accepted = accepted; 6861 } 6862 6863 DEBUG_EXECUTE_r( 6864 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n", 6865 depth, 6866 PL_colors[4], (IV)ST.accepted, PL_colors[5] ); 6867 ); 6868 goto trie_first_try; /* jump into the fail handler */ 6869 }} 6870 NOT_REACHED; /* NOTREACHED */ 6871 6872 case TRIE_next_fail: /* we failed - try next alternative */ 6873 { 6874 U8 *uc; 6875 if (RE_PESSIMISTIC_PARENS) { 6876 REGCP_UNWIND(ST.lastcp); 6877 regcppop(rex,&maxopenparen); 6878 } 6879 if ( ST.jump ) { 6880 /* undo any captures done in the tail part of a branch, 6881 * e.g. 6882 * /(?:X(.)(.)|Y(.)).../ 6883 * where the trie just matches X then calls out to do the 6884 * rest of the branch */ 6885 REGCP_UNWIND(ST.cp); 6886 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6887 if (ST.after_paren) { 6888 assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens); 6889 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "TRIE_next_fail"); 6890 } 6891 } 6892 if (!--ST.accepted) { 6893 DEBUG_EXECUTE_r({ 6894 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n", 6895 depth, 6896 PL_colors[4], 6897 PL_colors[5] ); 6898 }); 6899 sayNO_SILENT; 6900 } 6901 { 6902 /* Find next-highest word to process. Note that this code 6903 * is O(N^2) per trie run (O(N) per branch), so keep tight */ 6904 U16 min = 0; 6905 U16 word; 6906 U16 const nextword = ST.nextword; 6907 reg_trie_wordinfo * const wordinfo 6908 = ((reg_trie_data*)rexi->data->data[ARG1u(ST.me)])->wordinfo; 6909 for (word=ST.topword; word; word=wordinfo[word].prev) { 6910 if (word > nextword && (!min || word < min)) 6911 min = word; 6912 } 6913 ST.nextword = min; 6914 } 6915 6916 trie_first_try: 6917 if (do_cutgroup) { 6918 do_cutgroup = 0; 6919 no_final = 0; 6920 } 6921 6922 if ( ST.jump ) { 6923 ST.lastparen = RXp_LASTPAREN(rex); 6924 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 6925 REGCP_SET(ST.cp); 6926 } 6927 6928 /* find start char of end of current word */ 6929 { 6930 U32 chars; /* how many chars to skip */ 6931 reg_trie_data * const trie 6932 = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)]; 6933 6934 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) 6935 >= ST.firstchars); 6936 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) 6937 - ST.firstchars; 6938 uc = ST.firstpos; 6939 6940 if (ST.longfold) { 6941 /* the hard option - fold each char in turn and find 6942 * its folded length (which may be different */ 6943 U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; 6944 STRLEN foldlen; 6945 STRLEN len; 6946 UV uvc; 6947 U8 *uscan; 6948 6949 while (chars) { 6950 if (utf8_target) { 6951 /* XXX This assumes the length is well-formed, as 6952 * does the UTF8SKIP below */ 6953 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, 6954 uniflags); 6955 uc += len; 6956 } 6957 else { 6958 uvc = *uc; 6959 uc++; 6960 } 6961 uvc = to_uni_fold(uvc, foldbuf, &foldlen); 6962 uscan = foldbuf; 6963 while (foldlen) { 6964 if (!--chars) 6965 break; 6966 uvc = utf8n_to_uvchr(uscan, foldlen, &len, 6967 uniflags); 6968 uscan += len; 6969 foldlen -= len; 6970 } 6971 } 6972 } 6973 else { 6974 if (utf8_target) 6975 uc = utf8_hop(uc, chars); 6976 else 6977 uc += chars; 6978 } 6979 } 6980 if (ST.jump && ST.jump[ST.nextword]) { 6981 scan = ST.me + ST.jump[ST.nextword]; 6982 ST.before_paren = ST.j_before_paren[ST.nextword]; 6983 assert(ST.before_paren <= rex->nparens); 6984 ST.after_paren = ST.j_after_paren[ST.nextword]; 6985 assert(ST.after_paren <= rex->nparens); 6986 } else { 6987 scan = ST.me + NEXT_OFF(ST.me); 6988 } 6989 6990 6991 DEBUG_EXECUTE_r({ 6992 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n", 6993 depth, 6994 PL_colors[4], 6995 ST.nextword, 6996 PL_colors[5] 6997 ); 6998 }); 6999 7000 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { 7001 if (RE_PESSIMISTIC_PARENS) { 7002 (void)regcppush(rex, 0, maxopenparen); 7003 REGCP_SET(ST.lastcp); 7004 } 7005 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, 7006 script_run_begin); 7007 NOT_REACHED; /* NOTREACHED */ 7008 } 7009 /* only one choice left - just continue */ 7010 DEBUG_EXECUTE_r({ 7011 AV *const trie_words 7012 = MUTABLE_AV(rexi->data->data[ARG1u(ST.me)+TRIE_WORDS_OFFSET]); 7013 SV ** const tmp = trie_words 7014 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL; 7015 SV *sv= tmp ? sv_newmortal() : NULL; 7016 7017 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n", 7018 depth, PL_colors[4], 7019 ST.nextword, 7020 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 7021 PL_colors[0], PL_colors[1], 7022 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII 7023 ) 7024 : "not compiled under -Dr", 7025 PL_colors[5] ); 7026 }); 7027 7028 locinput = (char*)uc; 7029 continue; /* execute rest of RE */ 7030 /* NOTREACHED */ 7031 } 7032 #undef ST 7033 7034 case LEXACT_REQ8: 7035 if (! utf8_target) { 7036 sayNO; 7037 } 7038 /* FALLTHROUGH */ 7039 7040 case LEXACT: 7041 { 7042 char *s; 7043 7044 s = STRINGl(scan); 7045 ln = STR_LENl(scan); 7046 goto join_short_long_exact; 7047 7048 case EXACTL: /* /abc/l */ 7049 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7050 7051 /* Complete checking would involve going through every character 7052 * matched by the string to see if any is above latin1. But the 7053 * comparison otherwise might very well be a fast assembly 7054 * language routine, and I (khw) don't think slowing things down 7055 * just to check for this warning is worth it. So this just checks 7056 * the first character */ 7057 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) { 7058 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); 7059 } 7060 goto do_exact; 7061 case EXACT_REQ8: 7062 if (! utf8_target) { 7063 sayNO; 7064 } 7065 /* FALLTHROUGH */ 7066 7067 case EXACT: /* /abc/ */ 7068 do_exact: 7069 s = STRINGs(scan); 7070 ln = STR_LENs(scan); 7071 7072 join_short_long_exact: 7073 if (utf8_target != is_utf8_pat) { 7074 /* The target and the pattern have differing utf8ness. */ 7075 char *l = locinput; 7076 const char * const e = s + ln; 7077 7078 if (utf8_target) { 7079 /* The target is utf8, the pattern is not utf8. 7080 * Above-Latin1 code points can't match the pattern; 7081 * invariants match exactly, and the other Latin1 ones need 7082 * to be downgraded to a single byte in order to do the 7083 * comparison. (If we could be confident that the target 7084 * is not malformed, this could be refactored to have fewer 7085 * tests by just assuming that if the first bytes match, it 7086 * is an invariant, but there are tests in the test suite 7087 * dealing with (??{...}) which violate this) */ 7088 while (s < e) { 7089 if ( l >= loceol 7090 || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) 7091 { 7092 sayNO; 7093 } 7094 if (UTF8_IS_INVARIANT(*(U8*)l)) { 7095 if (*l != *s) { 7096 sayNO; 7097 } 7098 l++; 7099 } 7100 else { 7101 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) 7102 { 7103 sayNO; 7104 } 7105 l += 2; 7106 } 7107 s++; 7108 } 7109 } 7110 else { 7111 /* The target is not utf8, the pattern is utf8. */ 7112 while (s < e) { 7113 if ( l >= loceol 7114 || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) 7115 { 7116 sayNO; 7117 } 7118 if (UTF8_IS_INVARIANT(*(U8*)s)) { 7119 if (*s != *l) { 7120 sayNO; 7121 } 7122 s++; 7123 } 7124 else { 7125 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) 7126 { 7127 sayNO; 7128 } 7129 s += 2; 7130 } 7131 l++; 7132 } 7133 } 7134 locinput = l; 7135 } 7136 else { 7137 /* The target and the pattern have the same utf8ness. */ 7138 /* Inline the first character, for speed. */ 7139 if ( loceol - locinput < ln 7140 || UCHARAT(s) != nextbyte 7141 || (ln > 1 && memNE(s, locinput, ln))) 7142 { 7143 sayNO; 7144 } 7145 locinput += ln; 7146 } 7147 break; 7148 } 7149 7150 case EXACTFL: /* /abc/il */ 7151 { 7152 const char * s; 7153 U32 fold_utf8_flags; 7154 7155 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7156 folder = Perl_foldEQ_locale; 7157 fold_array = PL_fold_locale; 7158 fold_utf8_flags = FOLDEQ_LOCALE; 7159 goto do_exactf; 7160 7161 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so 7162 is effectively /u; hence to match, target 7163 must be UTF-8. */ 7164 if (! utf8_target) { 7165 sayNO; 7166 } 7167 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 7168 | FOLDEQ_S2_FOLDS_SANE; 7169 folder = S_foldEQ_latin1_s2_folded; 7170 fold_array = PL_fold_latin1; 7171 goto do_exactf; 7172 7173 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */ 7174 if (! utf8_target) { 7175 sayNO; 7176 } 7177 assert(is_utf8_pat); 7178 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 7179 #ifdef DEBUGGING 7180 /* this is only used in an assert check, so we restrict it to DEBUGGING mode. 7181 * In theory neither of these variables should be used in this mode. */ 7182 folder = NULL; 7183 fold_array = NULL; 7184 #endif 7185 goto do_exactf; 7186 7187 case EXACTFUP: /* /foo/iu, and something is problematic in 7188 'foo' so can't take shortcuts. */ 7189 assert(! is_utf8_pat); 7190 folder = Perl_foldEQ_latin1; 7191 fold_array = PL_fold_latin1; 7192 fold_utf8_flags = 0; 7193 goto do_exactf; 7194 7195 case EXACTFU: /* /abc/iu */ 7196 folder = S_foldEQ_latin1_s2_folded; 7197 fold_array = PL_fold_latin1; 7198 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 7199 goto do_exactf; 7200 7201 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 7202 patterns */ 7203 assert(! is_utf8_pat); 7204 /* FALLTHROUGH */ 7205 case EXACTFAA: /* /abc/iaa */ 7206 folder = S_foldEQ_latin1_s2_folded; 7207 fold_array = PL_fold_latin1; 7208 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7209 if (is_utf8_pat || ! utf8_target) { 7210 7211 /* The possible presence of a MICRO SIGN in the pattern forbids 7212 * us to view a non-UTF-8 pattern as folded when there is a 7213 * UTF-8 target */ 7214 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED 7215 |FOLDEQ_S2_FOLDS_SANE; 7216 } 7217 goto do_exactf; 7218 7219 7220 case EXACTF: /* /abc/i This node only generated for 7221 non-utf8 patterns */ 7222 assert(! is_utf8_pat); 7223 folder = Perl_foldEQ; 7224 fold_array = PL_fold; 7225 fold_utf8_flags = 0; 7226 7227 do_exactf: 7228 s = STRINGs(scan); 7229 ln = STR_LENs(scan); 7230 7231 if ( utf8_target 7232 || is_utf8_pat 7233 || state_num == EXACTFUP 7234 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 7235 { 7236 /* Either target or the pattern are utf8, or has the issue where 7237 * the fold lengths may differ. */ 7238 const char * const l = locinput; 7239 char *e = loceol; 7240 7241 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target, 7242 s, 0, ln, is_utf8_pat,fold_utf8_flags)) 7243 { 7244 sayNO; 7245 } 7246 locinput = e; 7247 break; 7248 } 7249 7250 /* Neither the target nor the pattern are utf8 */ 7251 assert(fold_array); 7252 if (UCHARAT(s) != nextbyte 7253 && !NEXTCHR_IS_EOS 7254 && UCHARAT(s) != fold_array[nextbyte]) 7255 { 7256 sayNO; 7257 } 7258 if (loceol - locinput < ln) 7259 sayNO; 7260 assert(folder); 7261 if (ln > 1 && ! folder(aTHX_ locinput, s, ln)) 7262 sayNO; 7263 locinput += ln; 7264 break; 7265 } 7266 7267 case NBOUNDL: /* /\B/l */ 7268 to_complement = 1; 7269 /* FALLTHROUGH */ 7270 7271 case BOUNDL: /* /\b/l */ 7272 { 7273 bool b1, b2; 7274 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7275 7276 if (FLAGS(scan) != TRADITIONAL_BOUND) { 7277 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 7278 goto boundu; 7279 } 7280 7281 if (utf8_target) { 7282 if (locinput == reginfo->strbeg) 7283 b1 = isWORDCHAR_LC('\n'); 7284 else { 7285 U8 *p = reghop3((U8*)locinput, -1, 7286 (U8*)(reginfo->strbeg)); 7287 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend)); 7288 } 7289 b2 = (NEXTCHR_IS_EOS) 7290 ? isWORDCHAR_LC('\n') 7291 : isWORDCHAR_LC_utf8_safe((U8*) locinput, 7292 (U8*) reginfo->strend); 7293 } 7294 else { /* Here the string isn't utf8 */ 7295 b1 = (locinput == reginfo->strbeg) 7296 ? isWORDCHAR_LC('\n') 7297 : isWORDCHAR_LC(UCHARAT(locinput - 1)); 7298 b2 = (NEXTCHR_IS_EOS) 7299 ? isWORDCHAR_LC('\n') 7300 : isWORDCHAR_LC(nextbyte); 7301 } 7302 if (to_complement ^ (b1 == b2)) { 7303 sayNO; 7304 } 7305 break; 7306 } 7307 7308 case NBOUND: /* /\B/ */ 7309 to_complement = 1; 7310 /* FALLTHROUGH */ 7311 7312 case BOUND: /* /\b/ */ 7313 if (utf8_target) { 7314 goto bound_utf8; 7315 } 7316 goto bound_ascii_match_only; 7317 7318 case NBOUNDA: /* /\B/a */ 7319 to_complement = 1; 7320 /* FALLTHROUGH */ 7321 7322 case BOUNDA: /* /\b/a */ 7323 { 7324 bool b1, b2; 7325 7326 bound_ascii_match_only: 7327 /* Here the string isn't utf8, or is utf8 and only ascii characters 7328 * are to match \w. In the latter case looking at the byte just 7329 * prior to the current one may be just the final byte of a 7330 * multi-byte character. This is ok. There are two cases: 7331 * 1) it is a single byte character, and then the test is doing 7332 * just what it's supposed to. 7333 * 2) it is a multi-byte character, in which case the final byte is 7334 * never mistakable for ASCII, and so the test will say it is 7335 * not a word character, which is the correct answer. */ 7336 b1 = (locinput == reginfo->strbeg) 7337 ? isWORDCHAR_A('\n') 7338 : isWORDCHAR_A(UCHARAT(locinput - 1)); 7339 b2 = (NEXTCHR_IS_EOS) 7340 ? isWORDCHAR_A('\n') 7341 : isWORDCHAR_A(nextbyte); 7342 if (to_complement ^ (b1 == b2)) { 7343 sayNO; 7344 } 7345 break; 7346 } 7347 7348 case NBOUNDU: /* /\B/u */ 7349 to_complement = 1; 7350 /* FALLTHROUGH */ 7351 7352 case BOUNDU: /* /\b/u */ 7353 7354 boundu: 7355 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) { 7356 match = FALSE; 7357 } 7358 else if (utf8_target) { 7359 bound_utf8: 7360 switch((bound_type) FLAGS(scan)) { 7361 case TRADITIONAL_BOUND: 7362 { 7363 bool b1, b2; 7364 if (locinput == reginfo->strbeg) { 7365 b1 = 0 /* isWORDCHAR_L1('\n') */; 7366 } 7367 else { 7368 U8 *p = reghop3((U8*)locinput, -1, 7369 (U8*)(reginfo->strbeg)); 7370 7371 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend); 7372 } 7373 b2 = (NEXTCHR_IS_EOS) 7374 ? 0 /* isWORDCHAR_L1('\n') */ 7375 : isWORDCHAR_utf8_safe((U8*)locinput, 7376 (U8*) reginfo->strend); 7377 match = cBOOL(b1 != b2); 7378 break; 7379 } 7380 case GCB_BOUND: 7381 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7382 match = TRUE; /* GCB always matches at begin and 7383 end */ 7384 } 7385 else { 7386 /* Find the gcb values of previous and current 7387 * chars, then see if is a break point */ 7388 match = isGCB(getGCB_VAL_UTF8( 7389 reghop3((U8*)locinput, 7390 -1, 7391 (U8*)(reginfo->strbeg)), 7392 (U8*) reginfo->strend), 7393 getGCB_VAL_UTF8((U8*) locinput, 7394 (U8*) reginfo->strend), 7395 (U8*) reginfo->strbeg, 7396 (U8*) locinput, 7397 utf8_target); 7398 } 7399 break; 7400 7401 case LB_BOUND: 7402 if (locinput == reginfo->strbeg) { 7403 match = FALSE; 7404 } 7405 else if (NEXTCHR_IS_EOS) { 7406 match = TRUE; 7407 } 7408 else { 7409 match = isLB(getLB_VAL_UTF8( 7410 reghop3((U8*)locinput, 7411 -1, 7412 (U8*)(reginfo->strbeg)), 7413 (U8*) reginfo->strend), 7414 getLB_VAL_UTF8((U8*) locinput, 7415 (U8*) reginfo->strend), 7416 (U8*) reginfo->strbeg, 7417 (U8*) locinput, 7418 (U8*) reginfo->strend, 7419 utf8_target); 7420 } 7421 break; 7422 7423 case SB_BOUND: /* Always matches at begin and end */ 7424 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7425 match = TRUE; 7426 } 7427 else { 7428 match = isSB(getSB_VAL_UTF8( 7429 reghop3((U8*)locinput, 7430 -1, 7431 (U8*)(reginfo->strbeg)), 7432 (U8*) reginfo->strend), 7433 getSB_VAL_UTF8((U8*) locinput, 7434 (U8*) reginfo->strend), 7435 (U8*) reginfo->strbeg, 7436 (U8*) locinput, 7437 (U8*) reginfo->strend, 7438 utf8_target); 7439 } 7440 break; 7441 7442 case WB_BOUND: 7443 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7444 match = TRUE; 7445 } 7446 else { 7447 match = isWB(WB_UNKNOWN, 7448 getWB_VAL_UTF8( 7449 reghop3((U8*)locinput, 7450 -1, 7451 (U8*)(reginfo->strbeg)), 7452 (U8*) reginfo->strend), 7453 getWB_VAL_UTF8((U8*) locinput, 7454 (U8*) reginfo->strend), 7455 (U8*) reginfo->strbeg, 7456 (U8*) locinput, 7457 (U8*) reginfo->strend, 7458 utf8_target); 7459 } 7460 break; 7461 } 7462 } 7463 else { /* Not utf8 target */ 7464 switch((bound_type) FLAGS(scan)) { 7465 case TRADITIONAL_BOUND: 7466 { 7467 bool b1, b2; 7468 b1 = (locinput == reginfo->strbeg) 7469 ? 0 /* isWORDCHAR_L1('\n') */ 7470 : isWORDCHAR_L1(UCHARAT(locinput - 1)); 7471 b2 = (NEXTCHR_IS_EOS) 7472 ? 0 /* isWORDCHAR_L1('\n') */ 7473 : isWORDCHAR_L1(nextbyte); 7474 match = cBOOL(b1 != b2); 7475 break; 7476 } 7477 7478 case GCB_BOUND: 7479 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7480 match = TRUE; /* GCB always matches at begin and 7481 end */ 7482 } 7483 else { /* Only CR-LF combo isn't a GCB in 0-255 7484 range */ 7485 match = UCHARAT(locinput - 1) != '\r' 7486 || UCHARAT(locinput) != '\n'; 7487 } 7488 break; 7489 7490 case LB_BOUND: 7491 if (locinput == reginfo->strbeg) { 7492 match = FALSE; 7493 } 7494 else if (NEXTCHR_IS_EOS) { 7495 match = TRUE; 7496 } 7497 else { 7498 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)), 7499 getLB_VAL_CP(UCHARAT(locinput)), 7500 (U8*) reginfo->strbeg, 7501 (U8*) locinput, 7502 (U8*) reginfo->strend, 7503 utf8_target); 7504 } 7505 break; 7506 7507 case SB_BOUND: /* Always matches at begin and end */ 7508 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7509 match = TRUE; 7510 } 7511 else { 7512 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)), 7513 getSB_VAL_CP(UCHARAT(locinput)), 7514 (U8*) reginfo->strbeg, 7515 (U8*) locinput, 7516 (U8*) reginfo->strend, 7517 utf8_target); 7518 } 7519 break; 7520 7521 case WB_BOUND: 7522 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7523 match = TRUE; 7524 } 7525 else { 7526 match = isWB(WB_UNKNOWN, 7527 getWB_VAL_CP(UCHARAT(locinput -1)), 7528 getWB_VAL_CP(UCHARAT(locinput)), 7529 (U8*) reginfo->strbeg, 7530 (U8*) locinput, 7531 (U8*) reginfo->strend, 7532 utf8_target); 7533 } 7534 break; 7535 } 7536 } 7537 7538 if (to_complement ^ ! match) { 7539 sayNO; 7540 } 7541 break; 7542 7543 case ANYOFPOSIXL: 7544 case ANYOFL: /* /[abc]/l */ 7545 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7546 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan); 7547 7548 /* FALLTHROUGH */ 7549 case ANYOFD: /* /[abc]/d */ 7550 case ANYOF: /* /[abc]/ */ 7551 if (NEXTCHR_IS_EOS || locinput >= loceol) 7552 sayNO; 7553 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput)) 7554 && ! ANYOF_FLAGS(scan) 7555 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan)) 7556 { 7557 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) { 7558 sayNO; 7559 } 7560 locinput++; 7561 } 7562 else { 7563 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 7564 utf8_target)) 7565 { 7566 sayNO; 7567 } 7568 goto increment_locinput; 7569 } 7570 break; 7571 7572 case ANYOFM: 7573 if ( NEXTCHR_IS_EOS 7574 || (UCHARAT(locinput) & FLAGS(scan)) != ARG1u(scan) 7575 || locinput >= loceol) 7576 { 7577 sayNO; 7578 } 7579 locinput++; /* ANYOFM is always single byte */ 7580 break; 7581 7582 case NANYOFM: 7583 if ( NEXTCHR_IS_EOS 7584 || (UCHARAT(locinput) & FLAGS(scan)) == ARG1u(scan) 7585 || locinput >= loceol) 7586 { 7587 sayNO; 7588 } 7589 goto increment_locinput; 7590 break; 7591 7592 case ANYOFH: 7593 if ( ! utf8_target 7594 || NEXTCHR_IS_EOS 7595 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 7596 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7597 || ! _invlist_contains_cp(anyofh_list, 7598 utf8_to_uvchr_buf((U8 *) locinput, 7599 (U8 *) loceol, 7600 NULL))) 7601 { 7602 sayNO; 7603 } 7604 goto increment_locinput; 7605 break; 7606 7607 case ANYOFHb: 7608 if ( ! utf8_target 7609 || NEXTCHR_IS_EOS 7610 || ANYOF_FLAGS(scan) != (U8) *locinput 7611 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7612 || ! _invlist_contains_cp(anyofh_list, 7613 utf8_to_uvchr_buf((U8 *) locinput, 7614 (U8 *) loceol, 7615 NULL))) 7616 { 7617 sayNO; 7618 } 7619 goto increment_locinput; 7620 break; 7621 7622 case ANYOFHbbm: 7623 if ( ! utf8_target 7624 || NEXTCHR_IS_EOS 7625 || ANYOF_FLAGS(scan) != (U8) locinput[0] 7626 || locinput >= reginfo->strend 7627 || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap, 7628 (U8) locinput[1] & UTF_CONTINUATION_MASK)) 7629 { 7630 sayNO; 7631 } 7632 goto increment_locinput; 7633 break; 7634 7635 case ANYOFHr: 7636 if ( ! utf8_target 7637 || NEXTCHR_IS_EOS 7638 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), 7639 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), 7640 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) 7641 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7642 || ! _invlist_contains_cp(anyofh_list, 7643 utf8_to_uvchr_buf((U8 *) locinput, 7644 (U8 *) loceol, 7645 NULL))) 7646 { 7647 sayNO; 7648 } 7649 goto increment_locinput; 7650 break; 7651 7652 case ANYOFHs: 7653 if ( ! utf8_target 7654 || NEXTCHR_IS_EOS 7655 || loceol - locinput < FLAGS(scan) 7656 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan)) 7657 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7658 || ! _invlist_contains_cp(anyofh_list, 7659 utf8_to_uvchr_buf((U8 *) locinput, 7660 (U8 *) loceol, 7661 NULL))) 7662 { 7663 sayNO; 7664 } 7665 goto increment_locinput; 7666 break; 7667 7668 case ANYOFR: 7669 if (NEXTCHR_IS_EOS) { 7670 sayNO; 7671 } 7672 7673 if (utf8_target) { 7674 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 7675 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 7676 (U8 *) reginfo->strend, 7677 NULL), 7678 ANYOFRbase(scan), ANYOFRdelta(scan))) 7679 { 7680 sayNO; 7681 } 7682 } 7683 else { 7684 if (! withinCOUNT((U8) *locinput, 7685 ANYOFRbase(scan), ANYOFRdelta(scan))) 7686 { 7687 sayNO; 7688 } 7689 } 7690 goto increment_locinput; 7691 break; 7692 7693 case ANYOFRb: 7694 if (NEXTCHR_IS_EOS) { 7695 sayNO; 7696 } 7697 7698 if (utf8_target) { 7699 if ( ANYOF_FLAGS(scan) != (U8) *locinput 7700 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 7701 (U8 *) reginfo->strend, 7702 NULL), 7703 ANYOFRbase(scan), ANYOFRdelta(scan))) 7704 { 7705 sayNO; 7706 } 7707 } 7708 else { 7709 if (! withinCOUNT((U8) *locinput, 7710 ANYOFRbase(scan), ANYOFRdelta(scan))) 7711 { 7712 sayNO; 7713 } 7714 } 7715 goto increment_locinput; 7716 break; 7717 7718 /* The argument (FLAGS) to all the POSIX node types is the class number 7719 * */ 7720 7721 case NPOSIXL: /* \W or [:^punct:] etc. under /l */ 7722 to_complement = 1; 7723 /* FALLTHROUGH */ 7724 7725 case POSIXL: /* \w or [:punct:] etc. under /l */ 7726 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7727 if (NEXTCHR_IS_EOS || locinput >= loceol) 7728 sayNO; 7729 7730 /* Use isFOO_lc() for characters within Latin1. (Note that 7731 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 7732 * wouldn't be invariant) */ 7733 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { 7734 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) { 7735 sayNO; 7736 } 7737 7738 locinput++; 7739 break; 7740 } 7741 7742 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7743 /* An above Latin-1 code point, or malformed */ 7744 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 7745 reginfo->strend); 7746 goto utf8_posix_above_latin1; 7747 } 7748 7749 /* Here is a UTF-8 variant code point below 256 and the target is 7750 * UTF-8 */ 7751 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), 7752 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, 7753 *(locinput + 1)))))) 7754 { 7755 sayNO; 7756 } 7757 7758 goto increment_locinput; 7759 7760 case NPOSIXD: /* \W or [:^punct:] etc. under /d */ 7761 to_complement = 1; 7762 /* FALLTHROUGH */ 7763 7764 case POSIXD: /* \w or [:punct:] etc. under /d */ 7765 if (utf8_target) { 7766 goto utf8_posix; 7767 } 7768 goto posixa; 7769 7770 case NPOSIXA: /* \W or [:^punct:] etc. under /a */ 7771 7772 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7773 sayNO; 7774 } 7775 7776 /* All UTF-8 variants match */ 7777 if (! UTF8_IS_INVARIANT(nextbyte)) { 7778 goto increment_locinput; 7779 } 7780 7781 to_complement = 1; 7782 goto join_nposixa; 7783 7784 case POSIXA: /* \w or [:punct:] etc. under /a */ 7785 7786 posixa: 7787 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in 7788 * UTF-8, and also from NPOSIXA even in UTF-8 when the current 7789 * character is a single byte */ 7790 7791 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7792 sayNO; 7793 } 7794 7795 join_nposixa: 7796 7797 if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte, 7798 FLAGS(scan))))) 7799 { 7800 sayNO; 7801 } 7802 7803 /* Here we are either not in utf8, or we matched a utf8-invariant, 7804 * so the next char is the next byte */ 7805 locinput++; 7806 break; 7807 7808 case NPOSIXU: /* \W or [:^punct:] etc. under /u */ 7809 to_complement = 1; 7810 /* FALLTHROUGH */ 7811 7812 case POSIXU: /* \w or [:punct:] etc. under /u */ 7813 utf8_posix: 7814 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7815 sayNO; 7816 } 7817 7818 /* Use generic_isCC_() for characters within Latin1. (Note that 7819 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 7820 * wouldn't be invariant) */ 7821 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { 7822 if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte, 7823 FLAGS(scan))))) 7824 { 7825 sayNO; 7826 } 7827 locinput++; 7828 } 7829 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7830 if (! (to_complement 7831 ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, 7832 *(locinput + 1)), 7833 FLAGS(scan))))) 7834 { 7835 sayNO; 7836 } 7837 locinput += 2; 7838 } 7839 else { /* Handle above Latin-1 code points */ 7840 utf8_posix_above_latin1: 7841 classnum = (char_class_number_) FLAGS(scan); 7842 switch (classnum) { 7843 default: 7844 if (! (to_complement 7845 ^ cBOOL(_invlist_contains_cp( 7846 PL_XPosix_ptrs[classnum], 7847 utf8_to_uvchr_buf((U8 *) locinput, 7848 (U8 *) reginfo->strend, 7849 NULL))))) 7850 { 7851 sayNO; 7852 } 7853 break; 7854 case CC_ENUM_SPACE_: 7855 if (! (to_complement 7856 ^ cBOOL(is_XPERLSPACE_high(locinput)))) 7857 { 7858 sayNO; 7859 } 7860 break; 7861 case CC_ENUM_BLANK_: 7862 if (! (to_complement 7863 ^ cBOOL(is_HORIZWS_high(locinput)))) 7864 { 7865 sayNO; 7866 } 7867 break; 7868 case CC_ENUM_XDIGIT_: 7869 if (! (to_complement 7870 ^ cBOOL(is_XDIGIT_high(locinput)))) 7871 { 7872 sayNO; 7873 } 7874 break; 7875 case CC_ENUM_VERTSPACE_: 7876 if (! (to_complement 7877 ^ cBOOL(is_VERTWS_high(locinput)))) 7878 { 7879 sayNO; 7880 } 7881 break; 7882 case CC_ENUM_CNTRL_: /* These can't match above Latin1 */ 7883 case CC_ENUM_ASCII_: 7884 if (! to_complement) { 7885 sayNO; 7886 } 7887 break; 7888 } 7889 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); 7890 } 7891 break; 7892 7893 case CLUMP: /* Match \X: logical Unicode character. This is defined as 7894 a Unicode extended Grapheme Cluster */ 7895 if (NEXTCHR_IS_EOS || locinput >= loceol) 7896 sayNO; 7897 if (! utf8_target) { 7898 7899 /* Match either CR LF or '.', as all the other possibilities 7900 * require utf8 */ 7901 locinput++; /* Match the . or CR */ 7902 if (nextbyte == '\r' /* And if it was CR, and the next is LF, 7903 match the LF */ 7904 && locinput < loceol 7905 && UCHARAT(locinput) == '\n') 7906 { 7907 locinput++; 7908 } 7909 } 7910 else { 7911 7912 /* Get the gcb type for the current character */ 7913 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput, 7914 (U8*) reginfo->strend); 7915 7916 /* Then scan through the input until we get to the first 7917 * character whose type is supposed to be a gcb with the 7918 * current character. (There is always a break at the 7919 * end-of-input) */ 7920 locinput += UTF8SKIP(locinput); 7921 while (locinput < loceol) { 7922 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput, 7923 (U8*) reginfo->strend); 7924 if (isGCB(prev_gcb, cur_gcb, 7925 (U8*) reginfo->strbeg, (U8*) locinput, 7926 utf8_target)) 7927 { 7928 break; 7929 } 7930 7931 prev_gcb = cur_gcb; 7932 locinput += UTF8SKIP(locinput); 7933 } 7934 7935 7936 } 7937 break; 7938 7939 case REFFLN: /* /\g{name}/il */ 7940 { /* The capture buffer cases. The ones beginning with N for the 7941 named buffers just convert to the equivalent numbered and 7942 pretend they were called as the corresponding numbered buffer 7943 op. */ 7944 /* don't initialize these in the declaration, it makes C++ 7945 unhappy */ 7946 const char *s; 7947 char type; 7948 re_fold_t folder; 7949 const U8 *fold_array; 7950 UV utf8_fold_flags; 7951 7952 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7953 folder = Perl_foldEQ_locale; 7954 fold_array = PL_fold_locale; 7955 type = REFFL; 7956 utf8_fold_flags = FOLDEQ_LOCALE; 7957 goto do_nref; 7958 7959 case REFFAN: /* /\g{name}/iaa */ 7960 folder = Perl_foldEQ_latin1; 7961 fold_array = PL_fold_latin1; 7962 type = REFFA; 7963 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7964 goto do_nref; 7965 7966 case REFFUN: /* /\g{name}/iu */ 7967 folder = Perl_foldEQ_latin1; 7968 fold_array = PL_fold_latin1; 7969 type = REFFU; 7970 utf8_fold_flags = 0; 7971 goto do_nref; 7972 7973 case REFFN: /* /\g{name}/i */ 7974 folder = Perl_foldEQ; 7975 fold_array = PL_fold; 7976 type = REFF; 7977 utf8_fold_flags = 0; 7978 goto do_nref; 7979 7980 case REFN: /* /\g{name}/ */ 7981 type = REF; 7982 folder = NULL; 7983 fold_array = NULL; 7984 utf8_fold_flags = 0; 7985 do_nref: 7986 7987 /* For the named back references, find the corresponding buffer 7988 * number */ 7989 n = reg_check_named_buff_matched(rex,scan); 7990 7991 if ( ! n ) { 7992 sayNO; 7993 } 7994 goto do_nref_ref_common; 7995 7996 case REFFL: /* /\1/il */ 7997 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7998 folder = Perl_foldEQ_locale; 7999 fold_array = PL_fold_locale; 8000 utf8_fold_flags = FOLDEQ_LOCALE; 8001 goto do_ref; 8002 8003 case REFFA: /* /\1/iaa */ 8004 folder = Perl_foldEQ_latin1; 8005 fold_array = PL_fold_latin1; 8006 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 8007 goto do_ref; 8008 8009 case REFFU: /* /\1/iu */ 8010 folder = Perl_foldEQ_latin1; 8011 fold_array = PL_fold_latin1; 8012 utf8_fold_flags = 0; 8013 goto do_ref; 8014 8015 case REFF: /* /\1/i */ 8016 folder = Perl_foldEQ; 8017 fold_array = PL_fold; 8018 utf8_fold_flags = 0; 8019 goto do_ref; 8020 8021 #undef ST 8022 #define ST st->u.backref 8023 case REF: /* /\1/ */ 8024 folder = NULL; 8025 fold_array = NULL; 8026 utf8_fold_flags = 0; 8027 8028 do_ref: 8029 type = OP(scan); 8030 n = ARG1u(scan); /* which paren pair */ 8031 if (rex->logical_to_parno) { 8032 n = rex->logical_to_parno[n]; 8033 do { 8034 if ( RXp_LASTPAREN(rex) < n || 8035 RXp_OFFS_START(rex,n) == -1 || 8036 RXp_OFFS_END(rex,n) == -1 8037 ) { 8038 n = rex->parno_to_logical_next[n]; 8039 } 8040 else { 8041 break; 8042 } 8043 } while(n); 8044 8045 if (!n) /* this means there is nothing that matched */ 8046 sayNO; 8047 } 8048 8049 do_nref_ref_common: 8050 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 8051 if (RXp_LASTPAREN(rex) < n) 8052 sayNO; 8053 8054 ln = RXp_OFFSp(rex)[n].start; 8055 endref = RXp_OFFSp(rex)[n].end; 8056 if (ln == -1 || endref == -1) 8057 sayNO; /* Do not match unless seen CLOSEn. */ 8058 8059 if (ln == endref) 8060 goto ref_yes; 8061 8062 s = reginfo->strbeg + ln; 8063 if (type != REF /* REF can do byte comparison */ 8064 && (utf8_target || type == REFFU || type == REFFL)) 8065 { 8066 char * limit = loceol; 8067 8068 /* This call case insensitively compares the entire buffer 8069 * at s, with the current input starting at locinput, but 8070 * not going off the end given by loceol, and 8071 * returns in <limit> upon success, how much of the 8072 * current input was matched */ 8073 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target, 8074 locinput, &limit, 0, utf8_target, utf8_fold_flags)) 8075 { 8076 sayNO; 8077 } 8078 locinput = limit; 8079 goto ref_yes; 8080 } 8081 8082 /* Not utf8: Inline the first character, for speed. */ 8083 if ( ! NEXTCHR_IS_EOS 8084 && locinput < loceol 8085 && UCHARAT(s) != nextbyte 8086 && ( type == REF 8087 || UCHARAT(s) != fold_array[nextbyte])) 8088 { 8089 sayNO; 8090 } 8091 ln = endref - ln; 8092 if (locinput + ln > loceol) 8093 sayNO; 8094 if (ln > 1 && (type == REF 8095 ? memNE(s, locinput, ln) 8096 : ! folder(aTHX_ locinput, s, ln))) 8097 sayNO; 8098 locinput += ln; 8099 } 8100 ref_yes: 8101 if (FLAGS(scan)) { /* == VOLATILE_REF but only other value is 0 */ 8102 ST.cp = regcppush(rex, ARG2u(scan) - 1, maxopenparen); 8103 REGCP_SET(ST.lastcp); 8104 PUSH_STATE_GOTO(REF_next, next, locinput, loceol, 8105 script_run_begin); 8106 } 8107 break; 8108 NOT_REACHED; /* NOTREACHED */ 8109 8110 case REF_next: 8111 sayYES; 8112 break; 8113 8114 case REF_next_fail: 8115 REGCP_UNWIND(ST.lastcp); 8116 regcppop(rex, &maxopenparen); 8117 sayNO; 8118 break; 8119 8120 case NOTHING: /* null op; e.g. the 'nothing' following 8121 * the '*' in m{(a+|b)*}' */ 8122 break; 8123 case TAIL: /* placeholder while compiling (A|B|C) */ 8124 break; 8125 8126 #undef ST 8127 #define ST st->u.eval 8128 #define CUR_EVAL cur_eval->u.eval 8129 8130 { 8131 SV *ret; 8132 REGEXP *re_sv; 8133 regexp *re; 8134 regexp_internal *rei; 8135 regnode *startpoint; 8136 U32 arg; 8137 8138 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ 8139 arg = ARG1u(scan); 8140 if (cur_eval && cur_eval->locinput == locinput) { 8141 if ( ++nochange_depth > max_nochange_depth ) 8142 Perl_croak(aTHX_ 8143 "Pattern subroutine nesting without pos change" 8144 " exceeded limit in regex"); 8145 } else { 8146 nochange_depth = 0; 8147 } 8148 re_sv = rex_sv; 8149 re = rex; 8150 rei = rexi; 8151 startpoint = scan + ARG2i(scan); 8152 EVAL_CLOSE_PAREN_SET( st, arg ); 8153 /* Detect infinite recursion 8154 * 8155 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/ 8156 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever. 8157 * So we track the position in the string we are at each time 8158 * we recurse and if we try to enter the same routine twice from 8159 * the same position we throw an error. 8160 */ 8161 if ( rex->recurse_locinput[arg] == locinput ) { 8162 /* FIXME: we should show the regop that is failing as part 8163 * of the error message. */ 8164 Perl_croak(aTHX_ "Infinite recursion in regex"); 8165 } else { 8166 ST.prev_recurse_locinput= rex->recurse_locinput[arg]; 8167 rex->recurse_locinput[arg]= locinput; 8168 8169 DEBUG_r({ 8170 DECLARE_AND_GET_RE_DEBUG_FLAGS; 8171 DEBUG_STACK_r({ 8172 Perl_re_exec_indentf( aTHX_ 8173 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n", 8174 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg] 8175 ); 8176 }); 8177 }); 8178 } 8179 8180 /* Save all the positions seen so far. */ 8181 ST.cp = regcppush(rex, 0, maxopenparen); 8182 REGCP_SET(ST.lastcp); 8183 8184 /* and then jump to the code we share with EVAL */ 8185 goto eval_recurse_doit; 8186 /* NOTREACHED */ 8187 8188 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ 8189 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) { 8190 if ( ++nochange_depth > max_nochange_depth ) 8191 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); 8192 } else { 8193 nochange_depth = 0; 8194 } 8195 { 8196 /* execute the code in the {...} */ 8197 8198 dSP; 8199 IV before; 8200 OP * const oop = PL_op; 8201 COP * const ocurcop = PL_curcop; 8202 OP *nop; 8203 CV *newcv; 8204 8205 /* save *all* paren positions */ 8206 ST.cp = regcppush(rex, 0, maxopenparen); 8207 REGCP_SET(ST.lastcp); 8208 8209 if (!caller_cv) 8210 caller_cv = find_runcv(NULL); 8211 8212 n = ARG1u(scan); 8213 8214 if (rexi->data->what[n] == 'r') { /* code from an external qr */ 8215 newcv = (ReANY( 8216 (REGEXP*)(rexi->data->data[n]) 8217 ))->qr_anoncv; 8218 nop = (OP*)rexi->data->data[n+1]; 8219 } 8220 else if (rexi->data->what[n] == 'l') { /* literal code */ 8221 newcv = caller_cv; 8222 nop = (OP*)rexi->data->data[n]; 8223 assert(CvDEPTH(newcv)); 8224 } 8225 else { 8226 /* literal with own CV */ 8227 assert(rexi->data->what[n] == 'L'); 8228 newcv = rex->qr_anoncv; 8229 nop = (OP*)rexi->data->data[n]; 8230 } 8231 8232 /* Some notes about MULTICALL and the context and save stacks. 8233 * 8234 * In something like 8235 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../ 8236 * since codeblocks don't introduce a new scope (so that 8237 * local() etc accumulate), at the end of a successful 8238 * match there will be a SAVEt_CLEARSV on the savestack 8239 * for each of $x, $y, $z. If the three code blocks above 8240 * happen to have come from different CVs (e.g. via 8241 * embedded qr//s), then we must ensure that during any 8242 * savestack unwinding, PL_comppad always points to the 8243 * right pad at each moment. We achieve this by 8244 * interleaving SAVEt_COMPPAD's on the savestack whenever 8245 * there is a change of pad. 8246 * In theory whenever we call a code block, we should 8247 * push a CXt_SUB context, then pop it on return from 8248 * that code block. This causes a bit of an issue in that 8249 * normally popping a context also clears the savestack 8250 * back to cx->blk_oldsaveix, but here we specifically 8251 * don't want to clear the save stack on exit from the 8252 * code block. 8253 * Also for efficiency we don't want to keep pushing and 8254 * popping the single SUB context as we backtrack etc. 8255 * So instead, we push a single context the first time 8256 * we need, it, then hang onto it until the end of this 8257 * function. Whenever we encounter a new code block, we 8258 * update the CV etc if that's changed. During the times 8259 * in this function where we're not executing a code 8260 * block, having the SUB context still there is a bit 8261 * naughty - but we hope that no-one notices. 8262 * When the SUB context is initially pushed, we fake up 8263 * cx->blk_oldsaveix to be as if we'd pushed this context 8264 * on first entry to S_regmatch rather than at some random 8265 * point during the regexe execution. That way if we 8266 * croak, popping the context stack will ensure that 8267 * *everything* SAVEd by this function is undone and then 8268 * the context popped, rather than e.g., popping the 8269 * context (and restoring the original PL_comppad) then 8270 * popping more of the savestack and restoring a bad 8271 * PL_comppad. 8272 */ 8273 8274 /* If this is the first EVAL, push a MULTICALL. On 8275 * subsequent calls, if we're executing a different CV, or 8276 * if PL_comppad has got messed up from backtracking 8277 * through SAVECOMPPADs, then refresh the context. 8278 */ 8279 if (newcv != last_pushed_cv || PL_comppad != last_pad) 8280 { 8281 U8 flags = (CXp_SUB_RE | 8282 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); 8283 SAVECOMPPAD(); 8284 if (last_pushed_cv) { 8285 CHANGE_MULTICALL_FLAGS(newcv, flags); 8286 } 8287 else { 8288 PUSH_MULTICALL_FLAGS(newcv, flags); 8289 } 8290 /* see notes above */ 8291 CX_CUR()->blk_oldsaveix = orig_savestack_ix; 8292 8293 last_pushed_cv = newcv; 8294 } 8295 else { 8296 /* these assignments are just to silence compiler 8297 * warnings */ 8298 multicall_cop = NULL; 8299 } 8300 last_pad = PL_comppad; 8301 8302 /* the initial nextstate you would normally execute 8303 * at the start of an eval (which would cause error 8304 * messages to come from the eval), may be optimised 8305 * away from the execution path in the regex code blocks; 8306 * so manually set PL_curcop to it initially */ 8307 { 8308 OP *o = cUNOPx(nop)->op_first; 8309 assert(o->op_type == OP_NULL); 8310 if (o->op_targ == OP_SCOPE) { 8311 o = cUNOPo->op_first; 8312 } 8313 else { 8314 assert(o->op_targ == OP_LEAVE); 8315 o = cUNOPo->op_first; 8316 assert(o->op_type == OP_ENTER); 8317 o = OpSIBLING(o); 8318 } 8319 8320 if (o->op_type != OP_STUB) { 8321 assert( o->op_type == OP_NEXTSTATE 8322 || o->op_type == OP_DBSTATE 8323 || (o->op_type == OP_NULL 8324 && ( o->op_targ == OP_NEXTSTATE 8325 || o->op_targ == OP_DBSTATE 8326 ) 8327 ) 8328 ); 8329 PL_curcop = (COP*)o; 8330 } 8331 } 8332 nop = nop->op_next; 8333 8334 DEBUG_STATE_r( Perl_re_printf( aTHX_ 8335 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) ); 8336 8337 RXp_OFFSp(rex)[0].end = locinput - reginfo->strbeg; 8338 if (reginfo->info_aux_eval->pos_magic) 8339 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, 8340 reginfo->sv, reginfo->strbeg, 8341 locinput - reginfo->strbeg); 8342 8343 if (sv_yes_mark) { 8344 SV *sv_mrk = get_sv("REGMARK", 1); 8345 sv_setsv(sv_mrk, sv_yes_mark); 8346 } 8347 8348 /* we don't use MULTICALL here as we want to call the 8349 * first op of the block of interest, rather than the 8350 * first op of the sub. Also, we don't want to free 8351 * the savestack frame */ 8352 before = (IV)(SP-PL_stack_base); 8353 PL_op = nop; 8354 CALLRUNOPS(aTHX); /* Scalar context. */ 8355 SPAGAIN; 8356 if ((IV)(SP-PL_stack_base) == before) 8357 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 8358 else { 8359 ret = POPs; 8360 PUTBACK; 8361 } 8362 8363 /* before restoring everything, evaluate the returned 8364 * value, so that 'uninit' warnings don't use the wrong 8365 * PL_op or pad. Also need to process any magic vars 8366 * (e.g. $1) *before* parentheses are restored */ 8367 8368 PL_op = NULL; 8369 8370 re_sv = NULL; 8371 if (logical == 0) { /* /(?{ ... })/ and /(*{ ... })/ */ 8372 SV *replsv = save_scalar(PL_replgv); 8373 sv_setsv(replsv, ret); /* $^R */ 8374 SvSETMAGIC(replsv); 8375 } 8376 else if (logical == 1) { /* /(?(?{...})X|Y)/ */ 8377 sw = cBOOL(SvTRUE_NN(ret)); 8378 logical = 0; 8379 } 8380 else { /* /(??{ ... }) */ 8381 /* if its overloaded, let the regex compiler handle 8382 * it; otherwise extract regex, or stringify */ 8383 if (SvGMAGICAL(ret)) 8384 ret = sv_mortalcopy(ret); 8385 if (!SvAMAGIC(ret)) { 8386 SV *sv = ret; 8387 if (SvROK(sv)) 8388 sv = SvRV(sv); 8389 if (SvTYPE(sv) == SVt_REGEXP) 8390 re_sv = (REGEXP*) sv; 8391 else if (SvSMAGICAL(ret)) { 8392 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); 8393 if (mg) 8394 re_sv = (REGEXP *) mg->mg_obj; 8395 } 8396 8397 /* force any undef warnings here */ 8398 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { 8399 ret = sv_mortalcopy(ret); 8400 (void) SvPV_force_nolen(ret); 8401 } 8402 } 8403 8404 } 8405 8406 /* *** Note that at this point we don't restore 8407 * PL_comppad, (or pop the CxSUB) on the assumption it may 8408 * be used again soon. This is safe as long as nothing 8409 * in the regexp code uses the pad ! */ 8410 PL_op = oop; 8411 PL_curcop = ocurcop; 8412 regcp_restore(rex, ST.lastcp, &maxopenparen); 8413 PL_curpm_under = PL_curpm; 8414 PL_curpm = PL_reg_curpm; 8415 8416 if (logical != 2) { 8417 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, 8418 script_run_begin); 8419 /* NOTREACHED */ 8420 } 8421 } 8422 8423 /* only /(??{ ... })/ from now on */ 8424 logical = 0; 8425 { 8426 /* extract RE object from returned value; compiling if 8427 * necessary */ 8428 8429 if (re_sv) { 8430 re_sv = reg_temp_copy(NULL, re_sv); 8431 } 8432 else { 8433 U32 pm_flags = 0; 8434 8435 if (SvUTF8(ret) && IN_BYTES) { 8436 /* In use 'bytes': make a copy of the octet 8437 * sequence, but without the flag on */ 8438 STRLEN len; 8439 const char *const p = SvPV(ret, len); 8440 ret = newSVpvn_flags(p, len, SVs_TEMP); 8441 } 8442 if (rex->intflags & PREGf_USE_RE_EVAL) 8443 pm_flags |= PMf_USE_RE_EVAL; 8444 8445 /* if we got here, it should be an engine which 8446 * supports compiling code blocks and stuff */ 8447 assert(rex->engine && rex->engine->op_comp); 8448 assert(!(FLAGS(scan) & ~RXf_PMf_COMPILETIME)); 8449 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, 8450 rex->engine, NULL, NULL, 8451 /* copy /msixn etc to inner pattern */ 8452 ARG2i(scan), 8453 pm_flags); 8454 8455 if (!(SvFLAGS(ret) 8456 & (SVs_TEMP | SVs_GMG | SVf_ROK)) 8457 && (!SvPADTMP(ret) || SvREADONLY(ret))) { 8458 /* This isn't a first class regexp. Instead, it's 8459 caching a regexp onto an existing, Perl visible 8460 scalar. */ 8461 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); 8462 } 8463 } 8464 SAVEFREESV(re_sv); 8465 re = ReANY(re_sv); 8466 } 8467 RXp_MATCH_COPIED_off(re); 8468 RXp_SUBBEG(re) = RXp_SUBBEG(rex); 8469 RXp_SUBLEN(re) = RXp_SUBLEN(rex); 8470 RXp_SUBOFFSET(re) = RXp_SUBOFFSET(rex); 8471 RXp_SUBCOFFSET(re) = RXp_SUBCOFFSET(rex); 8472 RXp_LASTPAREN(re) = 0; 8473 RXp_LASTCLOSEPAREN(re) = 0; 8474 rei = RXi_GET(re); 8475 DEBUG_EXECUTE_r( 8476 debug_start_match(re_sv, utf8_target, locinput, 8477 reginfo->strend, "EVAL/GOSUB: Matching embedded"); 8478 ); 8479 startpoint = rei->program + 1; 8480 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0; 8481 * close_paren only for GOSUB */ 8482 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */ 8483 8484 /* note we saved the paren state earlier: 8485 ST.cp = regcppush(rex, 0, maxopenparen); 8486 REGCP_SET(ST.lastcp); 8487 */ 8488 /* and set maxopenparen to 0, since we are starting a "fresh" match */ 8489 maxopenparen = 0; 8490 /* run the pattern returned from (??{...}) */ 8491 8492 eval_recurse_doit: /* Share code with GOSUB below this line 8493 * At this point we expect the stack context to be 8494 * set up correctly */ 8495 8496 /* invalidate the S-L poscache. We're now executing a 8497 * different set of WHILEM ops (and their associated 8498 * indexes) against the same string, so the bits in the 8499 * cache are meaningless. Setting maxiter to zero forces 8500 * the cache to be invalidated and zeroed before reuse. 8501 * XXX This is too dramatic a measure. Ideally we should 8502 * save the old cache and restore when running the outer 8503 * pattern again */ 8504 reginfo->poscache_maxiter = 0; 8505 8506 /* the new regexp might have a different is_utf8_pat than we do */ 8507 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); 8508 8509 ST.prev_rex = rex_sv; 8510 ST.prev_curlyx = cur_curlyx; 8511 rex_sv = re_sv; 8512 SET_reg_curpm(rex_sv); 8513 rex = re; 8514 rexi = rei; 8515 cur_curlyx = NULL; 8516 ST.B = next; 8517 ST.prev_eval = cur_eval; 8518 cur_eval = st; 8519 /* now continue from first node in postoned RE */ 8520 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, 8521 loceol, script_run_begin); 8522 NOT_REACHED; /* NOTREACHED */ 8523 } 8524 8525 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */ 8526 /* note: this is called twice; first after popping B, then A */ 8527 DEBUG_STACK_r({ 8528 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n", 8529 depth, cur_eval, ST.prev_eval); 8530 }); 8531 8532 #define SET_RECURSE_LOCINPUT(STR,VAL)\ 8533 if ( cur_eval && CUR_EVAL.close_paren ) {\ 8534 DEBUG_STACK_r({ \ 8535 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\ 8536 depth, \ 8537 CUR_EVAL.close_paren - 1,\ 8538 cur_eval, \ 8539 VAL); \ 8540 }); \ 8541 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\ 8542 } 8543 8544 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput); 8545 8546 rex_sv = ST.prev_rex; 8547 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 8548 SET_reg_curpm(rex_sv); 8549 rex = ReANY(rex_sv); 8550 rexi = RXi_GET(rex); 8551 { 8552 /* preserve $^R across LEAVE's. See Bug 121070. */ 8553 SV *save_sv= GvSV(PL_replgv); 8554 SV *replsv; 8555 SvREFCNT_inc(save_sv); 8556 regcpblow(ST.cp); /* LEAVE in disguise */ 8557 /* don't move this initialization up */ 8558 replsv = GvSV(PL_replgv); 8559 sv_setsv(replsv, save_sv); 8560 SvSETMAGIC(replsv); 8561 SvREFCNT_dec(save_sv); 8562 } 8563 cur_eval = ST.prev_eval; 8564 cur_curlyx = ST.prev_curlyx; 8565 8566 /* Invalidate cache. See "invalidate" comment above. */ 8567 reginfo->poscache_maxiter = 0; 8568 if ( nochange_depth ) 8569 nochange_depth--; 8570 8571 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput); 8572 sayYES; 8573 8574 8575 case EVAL_B_fail: /* unsuccessful B in (?{...})B */ 8576 REGCP_UNWIND(ST.lastcp); 8577 regcppop(rex, &maxopenparen); 8578 sayNO; 8579 8580 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ 8581 /* note: this is called twice; first after popping B, then A */ 8582 DEBUG_STACK_r({ 8583 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n", 8584 depth, cur_eval, ST.prev_eval); 8585 }); 8586 8587 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput); 8588 8589 rex_sv = ST.prev_rex; 8590 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 8591 SET_reg_curpm(rex_sv); 8592 rex = ReANY(rex_sv); 8593 rexi = RXi_GET(rex); 8594 8595 REGCP_UNWIND(ST.lastcp); 8596 regcppop(rex, &maxopenparen); 8597 cur_eval = ST.prev_eval; 8598 cur_curlyx = ST.prev_curlyx; 8599 8600 /* Invalidate cache. See "invalidate" comment above. */ 8601 reginfo->poscache_maxiter = 0; 8602 if ( nochange_depth ) 8603 nochange_depth--; 8604 8605 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput); 8606 sayNO_SILENT; 8607 #undef ST 8608 8609 case OPEN: /* ( */ 8610 n = PARNO(scan); /* which paren pair */ 8611 RXp_OFFSp(rex)[n].start_tmp = locinput - reginfo->strbeg; 8612 if (n > maxopenparen) 8613 maxopenparen = n; 8614 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 8615 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n", 8616 depth, 8617 PTR2UV(rex), 8618 PTR2UV(RXp_OFFSp(rex)), 8619 (UV)n, 8620 (IV)RXp_OFFSp(rex)[n].start_tmp, 8621 (UV)maxopenparen 8622 )); 8623 lastopen = n; 8624 break; 8625 8626 case SROPEN: /* (*SCRIPT_RUN: */ 8627 script_run_begin = (U8 *) locinput; 8628 break; 8629 8630 8631 case CLOSE: /* ) */ 8632 n = PARNO(scan); /* which paren pair */ 8633 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp, 8634 locinput - reginfo->strbeg); 8635 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) ) 8636 goto fake_end; 8637 8638 break; 8639 8640 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */ 8641 8642 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target)) 8643 { 8644 sayNO; 8645 } 8646 8647 break; 8648 8649 8650 case ACCEPT: /* (*ACCEPT) */ 8651 is_accepted = true; 8652 if (FLAGS(scan)) 8653 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 8654 utmp = ARG2u(scan); 8655 8656 if ( utmp ) { 8657 regnode *cursor; 8658 for ( 8659 cursor = scan; 8660 cursor && ( OP(cursor) != END ); 8661 cursor = ( 8662 REGNODE_TYPE( OP(cursor) ) == END 8663 || REGNODE_TYPE( OP(cursor) ) == WHILEM 8664 ) 8665 ? REGNODE_AFTER(cursor) 8666 : regnext(cursor) 8667 ){ 8668 if ( OP(cursor) != CLOSE ) 8669 continue; 8670 8671 n = PARNO(cursor); 8672 8673 if ( n > lastopen ) /* might be OPEN/CLOSE in the way */ 8674 continue; /* so skip this one */ 8675 8676 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp, 8677 locinput - reginfo->strbeg); 8678 8679 if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) ) 8680 break; 8681 } 8682 } 8683 goto fake_end; 8684 /* NOTREACHED */ 8685 8686 case GROUPP: /* (?(1)) */ 8687 n = ARG1u(scan); /* which paren pair */ 8688 sw = cBOOL(RXp_LASTPAREN(rex) >= n && RXp_OFFS_END(rex,n) != -1); 8689 break; 8690 8691 case GROUPPN: /* (?(<name>)) */ 8692 /* reg_check_named_buff_matched returns 0 for no match */ 8693 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); 8694 break; 8695 8696 case INSUBP: /* (?(R)) */ 8697 n = ARG1u(scan); 8698 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg 8699 * of SCAN is already set up as matches a eval.close_paren */ 8700 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n); 8701 break; 8702 8703 case DEFINEP: /* (?(DEFINE)) */ 8704 sw = 0; 8705 break; 8706 8707 case IFTHEN: /* (?(cond)A|B) */ 8708 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 8709 if (sw) 8710 next = REGNODE_AFTER_type(scan,tregnode_IFTHEN); 8711 else { 8712 next = scan + ARG1u(scan); 8713 if (OP(next) == IFTHEN) /* Fake one. */ 8714 next = REGNODE_AFTER_type(next,tregnode_IFTHEN); 8715 } 8716 break; 8717 8718 case LOGICAL: /* modifier for EVAL and IFMATCH */ 8719 logical = FLAGS(scan) & EVAL_FLAGS_MASK; /* reserve a bit for optimistic eval */ 8720 break; 8721 8722 /******************************************************************* 8723 8724 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ 8725 pattern, where A and B are subpatterns. (For simple A, CURLYM or 8726 STAR/PLUS/CURLY/CURLYN are used instead.) 8727 8728 A*B is compiled as <CURLYX><A><WHILEM><B> 8729 8730 On entry to the subpattern, CURLYX is called. This pushes a CURLYX 8731 state, which contains the current count, initialised to -1. It also sets 8732 cur_curlyx to point to this state, with any previous value saved in the 8733 state block. 8734 8735 CURLYX then jumps straight to the WHILEM op, rather than executing A, 8736 since the pattern may possibly match zero times (i.e. it's a while {} loop 8737 rather than a do {} while loop). 8738 8739 Each entry to WHILEM represents a successful match of A. The count in the 8740 CURLYX block is incremented, another WHILEM state is pushed, and execution 8741 passes to A or B depending on greediness and the current count. 8742 8743 For example, if matching against the string a1a2a3b (where the aN are 8744 substrings that match /A/), then the match progresses as follows: (the 8745 pushed states are interspersed with the bits of strings matched so far): 8746 8747 <CURLYX cnt=-1> 8748 <CURLYX cnt=0><WHILEM> 8749 <CURLYX cnt=1><WHILEM> a1 <WHILEM> 8750 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM> 8751 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> 8752 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b 8753 8754 (Contrast this with something like CURLYM, which maintains only a single 8755 backtrack state: 8756 8757 <CURLYM cnt=0> a1 8758 a1 <CURLYM cnt=1> a2 8759 a1 a2 <CURLYM cnt=2> a3 8760 a1 a2 a3 <CURLYM cnt=3> b 8761 ) 8762 8763 Each WHILEM state block marks a point to backtrack to upon partial failure 8764 of A or B, and also contains some minor state data related to that 8765 iteration. The CURLYX block, pointed to by cur_curlyx, contains the 8766 overall state, such as the count, and pointers to the A and B ops. 8767 8768 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx 8769 must always point to the *current* CURLYX block, the rules are: 8770 8771 When executing CURLYX, save the old cur_curlyx in the CURLYX state block, 8772 and set cur_curlyx to point the new block. 8773 8774 When popping the CURLYX block after a successful or unsuccessful match, 8775 restore the previous cur_curlyx. 8776 8777 When WHILEM is about to execute B, save the current cur_curlyx, and set it 8778 to the outer one saved in the CURLYX block. 8779 8780 When popping the WHILEM block after a successful or unsuccessful B match, 8781 restore the previous cur_curlyx. 8782 8783 Here's an example for the pattern (AI* BI)*BO 8784 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: 8785 8786 cur_ 8787 curlyx backtrack stack 8788 ------ --------------- 8789 NULL 8790 CO <CO prev=NULL> <WO> 8791 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 8792 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8793 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo 8794 8795 At this point the pattern succeeds, and we work back down the stack to 8796 clean up, restoring as we go: 8797 8798 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8799 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 8800 CO <CO prev=NULL> <WO> 8801 NULL 8802 8803 *******************************************************************/ 8804 8805 #define ST st->u.curlyx 8806 8807 case CURLYX: /* start of /A*B/ (for complex A) */ 8808 { 8809 /* No need to save/restore up to this paren */ 8810 I32 parenfloor = FLAGS(scan); 8811 8812 assert(next); /* keep Coverity happy */ 8813 if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */ 8814 next += ARG1u(next); 8815 8816 /* XXXX Probably it is better to teach regpush to support 8817 parenfloor > maxopenparen ... */ 8818 if (parenfloor > (I32)RXp_LASTPAREN(rex)) 8819 parenfloor = RXp_LASTPAREN(rex); /* Pessimization... */ 8820 8821 ST.prev_curlyx= cur_curlyx; 8822 cur_curlyx = st; 8823 ST.cp = PL_savestack_ix; 8824 8825 /* these fields contain the state of the current curly. 8826 * they are accessed by subsequent WHILEMs */ 8827 ST.parenfloor = parenfloor; 8828 ST.me = scan; 8829 ST.B = next; 8830 ST.minmod = minmod; 8831 minmod = 0; 8832 ST.count = -1; /* this will be updated by WHILEM */ 8833 ST.lastloc = NULL; /* this will be updated by WHILEM */ 8834 8835 PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol, 8836 script_run_begin); 8837 NOT_REACHED; /* NOTREACHED */ 8838 } 8839 8840 case CURLYX_end: /* just finished matching all of A*B */ 8841 cur_curlyx = ST.prev_curlyx; 8842 sayYES; 8843 NOT_REACHED; /* NOTREACHED */ 8844 8845 case CURLYX_end_fail: /* just failed to match all of A*B */ 8846 regcpblow(ST.cp); 8847 cur_curlyx = ST.prev_curlyx; 8848 sayNO; 8849 NOT_REACHED; /* NOTREACHED */ 8850 8851 8852 #undef ST 8853 #define ST st->u.whilem 8854 8855 case WHILEM: /* just matched an A in /A*B/ (for complex A) */ 8856 { 8857 /* see the discussion above about CURLYX/WHILEM */ 8858 I32 n; 8859 int min, max; 8860 /* U16 first_paren, last_paren; */ 8861 regnode *A; 8862 8863 assert(cur_curlyx); /* keep Coverity happy */ 8864 8865 min = ARG1i(cur_curlyx->u.curlyx.me); 8866 max = ARG2i(cur_curlyx->u.curlyx.me); 8867 /* first_paren = ARG3a(cur_curlyx->u.curlyx.me); */ 8868 /* last_paren = ARG3b(cur_curlyx->u.curlyx.me); */ 8869 A = REGNODE_AFTER(cur_curlyx->u.curlyx.me); 8870 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ 8871 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; 8872 ST.cache_offset = 0; 8873 ST.cache_mask = 0; 8874 8875 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n", 8876 depth, (long)n, min, max) 8877 ); 8878 8879 /* First just match a string of min A's. */ 8880 if (n < min) { 8881 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen); 8882 cur_curlyx->u.curlyx.lastloc = locinput; 8883 REGCP_SET(ST.lastcp); 8884 8885 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, 8886 script_run_begin); 8887 NOT_REACHED; /* NOTREACHED */ 8888 } 8889 8890 /* If degenerate A matches "", assume A done. */ 8891 8892 if (locinput == cur_curlyx->u.curlyx.lastloc) { 8893 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n", 8894 depth) 8895 ); 8896 goto do_whilem_B_max; 8897 } 8898 8899 /* super-linear cache processing. 8900 * 8901 * The idea here is that for certain types of CURLYX/WHILEM - 8902 * principally those whose upper bound is infinity (and 8903 * excluding regexes that have things like \1 and other very 8904 * non-regular expressiony things), then if a pattern like 8905 * /....A*.../ fails and we backtrack to the WHILEM, then we 8906 * make a note that this particular WHILEM op was at string 8907 * position 47 (say) when the rest of pattern failed. Then, if 8908 * we ever find ourselves back at that WHILEM, and at string 8909 * position 47 again, we can just fail immediately rather than 8910 * running the rest of the pattern again. 8911 * 8912 * This is very handy when patterns start to go 8913 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up 8914 * with a combinatorial explosion of backtracking. 8915 * 8916 * The cache is implemented as a bit array, with one bit per 8917 * string byte position per WHILEM op (up to 16) - so its 8918 * between 0.25 and 2x the string size. 8919 * 8920 * To avoid allocating a poscache buffer every time, we do an 8921 * initially countdown; only after we have executed a WHILEM 8922 * op (string-length x #WHILEMs) times do we allocate the 8923 * cache. 8924 * 8925 * The top 4 bits of FLAGS(scan) byte say how many different 8926 * relevant CURLLYX/WHILEM op pairs there are, while the 8927 * bottom 4-bits is the identifying index number of this 8928 * WHILEM. 8929 */ 8930 8931 if (FLAGS(scan)) { 8932 8933 if (!reginfo->poscache_maxiter) { 8934 /* start the countdown: Postpone detection until we 8935 * know the match is not *that* much linear. */ 8936 reginfo->poscache_maxiter 8937 = (reginfo->strend - reginfo->strbeg + 1) 8938 * (FLAGS(scan)>>4); 8939 /* possible overflow for long strings and many CURLYX's */ 8940 if (reginfo->poscache_maxiter < 0) 8941 reginfo->poscache_maxiter = I32_MAX; 8942 reginfo->poscache_iter = reginfo->poscache_maxiter; 8943 } 8944 8945 if (reginfo->poscache_iter-- == 0) { 8946 /* initialise cache */ 8947 const SSize_t size = (reginfo->poscache_maxiter + 7)/8; 8948 regmatch_info_aux *const aux = reginfo->info_aux; 8949 if (aux->poscache) { 8950 if ((SSize_t)reginfo->poscache_size < size) { 8951 Renew(aux->poscache, size, char); 8952 reginfo->poscache_size = size; 8953 } 8954 Zero(aux->poscache, size, char); 8955 } 8956 else { 8957 reginfo->poscache_size = size; 8958 Newxz(aux->poscache, size, char); 8959 } 8960 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 8961 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n", 8962 PL_colors[4], PL_colors[5]) 8963 ); 8964 } 8965 8966 if (reginfo->poscache_iter < 0) { 8967 /* have we already failed at this position? */ 8968 SSize_t offset, mask; 8969 8970 reginfo->poscache_iter = -1; /* stop eventual underflow */ 8971 offset = (FLAGS(scan) & 0xf) - 1 8972 + (locinput - reginfo->strbeg) 8973 * (FLAGS(scan)>>4); 8974 mask = 1 << (offset % 8); 8975 offset /= 8; 8976 if (reginfo->info_aux->poscache[offset] & mask) { 8977 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n", 8978 depth) 8979 ); 8980 cur_curlyx->u.curlyx.count--; 8981 sayNO; /* cache records failure */ 8982 } 8983 ST.cache_offset = offset; 8984 ST.cache_mask = mask; 8985 } 8986 } 8987 8988 /* Prefer B over A for minimal matching. */ 8989 8990 if (cur_curlyx->u.curlyx.minmod) { 8991 ST.save_curlyx = cur_curlyx; 8992 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 8993 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, 8994 locinput, loceol, script_run_begin); 8995 NOT_REACHED; /* NOTREACHED */ 8996 } 8997 8998 /* Prefer A over B for maximal matching. */ 8999 9000 if (n < max) { /* More greed allowed? */ 9001 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 9002 maxopenparen); 9003 cur_curlyx->u.curlyx.lastloc = locinput; 9004 REGCP_SET(ST.lastcp); 9005 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, 9006 script_run_begin); 9007 NOT_REACHED; /* NOTREACHED */ 9008 } 9009 goto do_whilem_B_max; 9010 } 9011 NOT_REACHED; /* NOTREACHED */ 9012 9013 case WHILEM_B_min: /* just matched B in a minimal match */ 9014 case WHILEM_B_max: /* just matched B in a maximal match */ 9015 cur_curlyx = ST.save_curlyx; 9016 sayYES; 9017 NOT_REACHED; /* NOTREACHED */ 9018 9019 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ 9020 cur_curlyx = ST.save_curlyx; 9021 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 9022 cur_curlyx->u.curlyx.count--; 9023 CACHEsayNO; 9024 NOT_REACHED; /* NOTREACHED */ 9025 9026 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ 9027 /* FALLTHROUGH */ 9028 case WHILEM_A_pre_fail: /* just failed to match even minimal A */ 9029 REGCP_UNWIND(ST.lastcp); 9030 regcppop(rex, &maxopenparen); 9031 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 9032 cur_curlyx->u.curlyx.count--; 9033 CACHEsayNO; 9034 NOT_REACHED; /* NOTREACHED */ 9035 9036 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ 9037 REGCP_UNWIND(ST.lastcp); 9038 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 9039 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n", 9040 depth) 9041 ); 9042 9043 do_whilem_B_max: 9044 /* now try B */ 9045 ST.save_curlyx = cur_curlyx; 9046 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 9047 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, 9048 locinput, loceol, script_run_begin); 9049 NOT_REACHED; /* NOTREACHED */ 9050 9051 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ 9052 cur_curlyx = ST.save_curlyx; 9053 9054 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2i(cur_curlyx->u.curlyx.me)) { 9055 /* Maximum greed exceeded */ 9056 cur_curlyx->u.curlyx.count--; 9057 CACHEsayNO; 9058 } 9059 9060 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth) 9061 ); 9062 /* Try grabbing another A and see if it helps. */ 9063 cur_curlyx->u.curlyx.lastloc = locinput; 9064 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen); 9065 REGCP_SET(ST.lastcp); 9066 PUSH_STATE_GOTO(WHILEM_A_min, 9067 /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me), 9068 locinput, loceol, script_run_begin); 9069 NOT_REACHED; /* NOTREACHED */ 9070 9071 #undef ST 9072 #define ST st->u.branch 9073 9074 case BRANCHJ: /* /(...|A|...)/ with long next pointer */ 9075 next = scan + ARG1u(scan); 9076 if (next == scan) 9077 next = NULL; 9078 ST.before_paren = ARG2a(scan); 9079 ST.after_paren = ARG2b(scan); 9080 goto branch_logic; 9081 NOT_REACHED; /* NOTREACHED */ 9082 9083 case BRANCH: /* /(...|A|...)/ */ 9084 ST.before_paren = ARG1a(scan); 9085 ST.after_paren = ARG1b(scan); 9086 branch_logic: 9087 scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */ 9088 assert(scan); 9089 ST.lastparen = RXp_LASTPAREN(rex); 9090 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 9091 ST.next_branch = next; 9092 REGCP_SET(ST.cp); 9093 if (RE_PESSIMISTIC_PARENS) { 9094 regcppush(rex, 0, maxopenparen); 9095 REGCP_SET(ST.lastcp); 9096 } 9097 9098 /* Now go into the branch */ 9099 if (has_cutgroup) { 9100 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 9101 script_run_begin); 9102 } else { 9103 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 9104 script_run_begin); 9105 } 9106 NOT_REACHED; /* NOTREACHED */ 9107 9108 case CUTGROUP: /* /(*THEN)/ */ 9109 sv_yes_mark = st->u.mark.mark_name = FLAGS(scan) 9110 ? MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]) 9111 : NULL; 9112 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, 9113 script_run_begin); 9114 NOT_REACHED; /* NOTREACHED */ 9115 9116 case CUTGROUP_next_fail: 9117 do_cutgroup = 1; 9118 no_final = 1; 9119 if (st->u.mark.mark_name) 9120 sv_commit = st->u.mark.mark_name; 9121 sayNO; 9122 NOT_REACHED; /* NOTREACHED */ 9123 9124 case BRANCH_next: 9125 sayYES; 9126 NOT_REACHED; /* NOTREACHED */ 9127 9128 case BRANCH_next_fail: /* that branch failed; try the next, if any */ 9129 if (do_cutgroup) { 9130 do_cutgroup = 0; 9131 no_final = 0; 9132 } 9133 if (RE_PESSIMISTIC_PARENS) { 9134 REGCP_UNWIND(ST.lastcp); 9135 regcppop(rex,&maxopenparen); 9136 } 9137 REGCP_UNWIND(ST.cp); 9138 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9139 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "BRANCH_next_fail"); 9140 scan = ST.next_branch; 9141 /* no more branches? */ 9142 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { 9143 DEBUG_EXECUTE_r({ 9144 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n", 9145 depth, 9146 PL_colors[4], 9147 PL_colors[5] ); 9148 }); 9149 sayNO_SILENT; 9150 } 9151 continue; /* execute next BRANCH[J] op */ 9152 /* NOTREACHED */ 9153 9154 case MINMOD: /* next op will be non-greedy, e.g. A*? */ 9155 minmod = 1; 9156 break; 9157 9158 #undef ST 9159 #define ST st->u.curlym 9160 9161 case CURLYM: /* /A{m,n}B/ where A is fixed-length */ 9162 9163 /* This is an optimisation of CURLYX that enables us to push 9164 * only a single backtracking state, no matter how many matches 9165 * there are in {m,n}. It relies on the pattern being constant 9166 * length, with no parens to influence future backrefs 9167 */ 9168 9169 ST.me = scan; 9170 scan = REGNODE_AFTER_type(scan, tregnode_CURLYM); 9171 9172 ST.lastparen = RXp_LASTPAREN(rex); 9173 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 9174 9175 /* if paren positive, emulate an OPEN/CLOSE around A */ 9176 if (FLAGS(ST.me)) { 9177 U32 paren = FLAGS(ST.me); 9178 lastopen = paren; 9179 if (paren > maxopenparen) 9180 maxopenparen = paren; 9181 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 9182 } 9183 ST.A = scan; 9184 ST.B = next; 9185 ST.alen = 0; 9186 ST.count = 0; 9187 ST.minmod = minmod; 9188 minmod = 0; 9189 ST.Binfo.count = -1; 9190 REGCP_SET(ST.cp); 9191 9192 if (!(ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me))) /* min/max */ 9193 goto curlym_do_B; 9194 9195 curlym_do_A: /* execute the A in /A{m,n}B/ */ 9196 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ 9197 script_run_begin); 9198 NOT_REACHED; /* NOTREACHED */ 9199 9200 case CURLYM_A: /* we've just matched an A */ 9201 ST.count++; 9202 /* after first match, determine A's length: u.curlym.alen */ 9203 if (ST.count == 1) { 9204 if (reginfo->is_utf8_target) { 9205 char *s = st->locinput; 9206 while (s < locinput) { 9207 ST.alen++; 9208 s += UTF8SKIP(s); 9209 } 9210 } 9211 else { 9212 ST.alen = locinput - st->locinput; 9213 } 9214 if (ST.alen == 0) 9215 ST.count = ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me); 9216 } 9217 DEBUG_EXECUTE_r( 9218 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n", 9219 depth, (IV) ST.count, (IV)ST.alen) 9220 ); 9221 9222 if (FLAGS(ST.me)) { 9223 /* emulate CLOSE: mark current A as captured */ 9224 U32 paren = (U32)FLAGS(ST.me); 9225 CLOSE_CAPTURE(rex, paren, 9226 HOPc(locinput, -ST.alen) - reginfo->strbeg, 9227 locinput - reginfo->strbeg); 9228 } 9229 9230 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me))) 9231 goto fake_end; 9232 9233 9234 if (!is_accepted) { 9235 I32 max = (ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me)); 9236 if ( max == REG_INFTY || ST.count < max ) 9237 goto curlym_do_A; /* try to match another A */ 9238 } 9239 goto curlym_do_B; /* try to match B */ 9240 9241 case CURLYM_A_fail: /* just failed to match an A */ 9242 REGCP_UNWIND(ST.cp); 9243 9244 9245 if (ST.minmod || ST.count < ARG1i(ST.me) /* min*/ 9246 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me))) 9247 sayNO; 9248 9249 curlym_do_B: /* execute the B in /A{m,n}B/ */ 9250 if (is_accepted) 9251 goto curlym_close_B; 9252 9253 if (ST.Binfo.count < 0) { 9254 /* calculate possible match of 1st char following curly */ 9255 assert(ST.B); 9256 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { 9257 regnode *text_node = ST.B; 9258 if (! HAS_TEXT(text_node)) 9259 FIND_NEXT_IMPT(text_node); 9260 if (REGNODE_TYPE(OP(text_node)) == EXACT) { 9261 if (! S_setup_EXACTISH_ST(aTHX_ text_node, 9262 &ST.Binfo, reginfo)) 9263 { 9264 sayNO; 9265 } 9266 } 9267 } 9268 } 9269 9270 DEBUG_EXECUTE_r( 9271 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n", 9272 depth, (IV)ST.count) 9273 ); 9274 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) { 9275 assert(ST.Binfo.count > 0); 9276 9277 /* Do a quick test to hopefully rule out most non-matches */ 9278 if ( locinput + ST.Binfo.min_length > loceol 9279 || ! S_test_EXACTISH_ST(locinput, ST.Binfo)) 9280 { 9281 DEBUG_OPTIMISE_r( 9282 Perl_re_exec_indentf( aTHX_ 9283 "CURLYM Fast bail next target=0x%X anded==0x%X" 9284 " mask=0x%X\n", 9285 depth, 9286 (int) nextbyte, ST.Binfo.first_byte_anded, 9287 ST.Binfo.first_byte_mask) 9288 ); 9289 state_num = CURLYM_B_fail; 9290 goto reenter_switch; 9291 } 9292 } 9293 9294 curlym_close_B: 9295 if (FLAGS(ST.me)) { 9296 /* emulate CLOSE: mark current A as captured */ 9297 U32 paren = (U32)FLAGS(ST.me); 9298 if (ST.count || is_accepted) { 9299 CLOSE_CAPTURE(rex, paren, 9300 HOPc(locinput, -ST.alen) - reginfo->strbeg, 9301 locinput - reginfo->strbeg); 9302 } 9303 else 9304 RXp_OFFSp(rex)[paren].end = -1; 9305 9306 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me))) 9307 { 9308 if (ST.count || is_accepted) 9309 goto fake_end; 9310 else 9311 sayNO; 9312 } 9313 } 9314 9315 if (is_accepted) 9316 goto fake_end; 9317 9318 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ 9319 script_run_begin); 9320 NOT_REACHED; /* NOTREACHED */ 9321 9322 case CURLYM_B_fail: /* just failed to match a B */ 9323 REGCP_UNWIND(ST.cp); 9324 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9325 if (ST.minmod) { 9326 I32 max = ARG2i(ST.me); 9327 if (max != REG_INFTY && ST.count == max) 9328 sayNO; 9329 goto curlym_do_A; /* try to match a further A */ 9330 } 9331 /* backtrack one A */ 9332 if (ST.count == ARG1i(ST.me) /* min */) 9333 sayNO; 9334 ST.count--; 9335 SET_locinput(HOPc(locinput, -ST.alen)); 9336 goto curlym_do_B; /* try to match B */ 9337 9338 #undef ST 9339 #define ST st->u.curly 9340 9341 #define CURLY_SETPAREN(paren, success) \ 9342 if (paren) { \ 9343 if (success) { \ 9344 CLOSE_CAPTURE(rex, paren, HOPc(locinput, -1) - reginfo->strbeg, \ 9345 locinput - reginfo->strbeg); \ 9346 } \ 9347 else { \ 9348 RXp_OFFSp(rex)[paren].end = -1; \ 9349 RXp_LASTPAREN(rex) = ST.lastparen; \ 9350 RXp_LASTCLOSEPAREN(rex) = ST.lastcloseparen; \ 9351 } \ 9352 } 9353 9354 case STAR: /* /A*B/ where A is width 1 char */ 9355 ST.paren = 0; 9356 ST.min = 0; 9357 ST.max = REG_INFTY; 9358 scan = REGNODE_AFTER_type(scan,tregnode_STAR); 9359 goto repeat; 9360 9361 case PLUS: /* /A+B/ where A is width 1 char */ 9362 ST.paren = 0; 9363 ST.min = 1; 9364 ST.max = REG_INFTY; 9365 scan = REGNODE_AFTER_type(scan,tregnode_PLUS); 9366 goto repeat; 9367 9368 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ 9369 ST.paren = FLAGS(scan); /* Which paren to set */ 9370 ST.lastparen = RXp_LASTPAREN(rex); 9371 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 9372 if (ST.paren > maxopenparen) 9373 maxopenparen = ST.paren; 9374 ST.min = ARG1i(scan); /* min to match */ 9375 ST.max = ARG2i(scan); /* max to match */ 9376 scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN)); 9377 9378 /* handle the single-char capture called as a GOSUB etc */ 9379 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) 9380 { 9381 char *li = locinput; 9382 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1)) 9383 sayNO; 9384 SET_locinput(li); 9385 goto fake_end; 9386 } 9387 9388 goto repeat; 9389 9390 case CURLY: /* /A{m,n}B/ where A is width 1 char */ 9391 ST.paren = 0; 9392 ST.min = ARG1i(scan); /* min to match */ 9393 ST.max = ARG2i(scan); /* max to match */ 9394 scan = REGNODE_AFTER_type(scan, tregnode_CURLY); 9395 repeat: 9396 /* 9397 * Lookahead to avoid useless match attempts 9398 * when we know what character comes next. 9399 * 9400 * Used to only do .*x and .*?x, but now it allows 9401 * for )'s, ('s and (?{ ... })'s to be in the way 9402 * of the quantifier and the EXACT-like node. -- japhy 9403 */ 9404 9405 assert(ST.min <= ST.max); 9406 if (! HAS_TEXT(next) && ! JUMPABLE(next)) { 9407 ST.Binfo.count = 0; 9408 } 9409 else { 9410 regnode *text_node = next; 9411 9412 if (! HAS_TEXT(text_node)) 9413 FIND_NEXT_IMPT(text_node); 9414 9415 if (! HAS_TEXT(text_node)) 9416 ST.Binfo.count = 0; 9417 else { 9418 if ( REGNODE_TYPE(OP(text_node)) != EXACT ) { 9419 ST.Binfo.count = 0; 9420 } 9421 else { 9422 if (! S_setup_EXACTISH_ST(aTHX_ text_node, 9423 &ST.Binfo, reginfo)) 9424 { 9425 sayNO; 9426 } 9427 } 9428 } 9429 } 9430 9431 ST.A = scan; 9432 ST.B = next; 9433 if (minmod) { 9434 char *li = locinput; 9435 minmod = 0; 9436 if (ST.min && 9437 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min) 9438 < ST.min) 9439 sayNO; 9440 SET_locinput(li); 9441 ST.count = ST.min; 9442 REGCP_SET(ST.cp); 9443 9444 if (ST.Binfo.count <= 0) 9445 goto curly_try_B_min; 9446 9447 ST.oldloc = locinput; 9448 9449 /* set ST.maxpos to the furthest point along the 9450 * string that could possibly match, i.e., that a match could 9451 * start at. */ 9452 if (ST.max == REG_INFTY) { 9453 ST.maxpos = loceol - 1; 9454 if (utf8_target) 9455 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) 9456 ST.maxpos--; 9457 } 9458 else if (utf8_target) { 9459 int m = ST.max - ST.min; 9460 for (ST.maxpos = locinput; 9461 m >0 && ST.maxpos < loceol; m--) 9462 ST.maxpos += UTF8SKIP(ST.maxpos); 9463 } 9464 else { 9465 ST.maxpos = locinput + ST.max - ST.min; 9466 if (ST.maxpos >= loceol) 9467 ST.maxpos = loceol - 1; 9468 } 9469 goto curly_try_B_min_known; 9470 9471 } 9472 else { 9473 /* avoid taking address of locinput, so it can remain 9474 * a register var */ 9475 char *li = locinput; 9476 if (ST.max) 9477 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max); 9478 else 9479 ST.count = 0; 9480 if (ST.count < ST.min) 9481 sayNO; 9482 SET_locinput(li); 9483 if ((ST.count > ST.min) 9484 && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL)) 9485 { 9486 /* A{m,n} must come at the end of the string, there's 9487 * no point in backing off ... */ 9488 ST.min = ST.count; 9489 /* ...except that $ and \Z can match before *and* after 9490 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 9491 We may back off by one in this case. */ 9492 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) 9493 ST.min--; 9494 } 9495 REGCP_SET(ST.cp); 9496 goto curly_try_B_max; 9497 } 9498 NOT_REACHED; /* NOTREACHED */ 9499 9500 case CURLY_B_min_fail: 9501 /* failed to find B in a non-greedy match. */ 9502 if (RE_PESSIMISTIC_PARENS) { 9503 REGCP_UNWIND(ST.lastcp); 9504 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 9505 } 9506 REGCP_UNWIND(ST.cp); 9507 if (ST.paren) { 9508 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9509 } 9510 9511 if (ST.Binfo.count == 0) { 9512 /* failed -- move forward one */ 9513 char *li = locinput; 9514 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) { 9515 sayNO; 9516 } 9517 locinput = li; 9518 ST.count++; 9519 if (!( ST.count <= ST.max 9520 /* count overflow ? */ 9521 || (ST.max == REG_INFTY && ST.count > 0)) 9522 ) 9523 sayNO; 9524 } 9525 else { 9526 int n; 9527 /* Couldn't or didn't -- move forward. */ 9528 ST.oldloc = locinput; 9529 if (utf8_target) 9530 locinput += UTF8SKIP(locinput); 9531 else 9532 locinput++; 9533 ST.count++; 9534 9535 curly_try_B_min_known: 9536 /* find the next place where 'B' could work, then call B */ 9537 if (locinput + ST.Binfo.initial_exact < loceol) { 9538 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) { 9539 9540 /* Here, the mask is all 1's for the entire length of 9541 * any possible match. (That actually means that there 9542 * is only one possible match.) Look for the next 9543 * occurrence */ 9544 locinput = ninstr(locinput, loceol, 9545 (char *) ST.Binfo.matches, 9546 (char *) ST.Binfo.matches 9547 + ST.Binfo.initial_exact); 9548 if (locinput == NULL) { 9549 sayNO; 9550 } 9551 } 9552 else do { 9553 /* If the first byte(s) of the mask are all ones, it 9554 * means those bytes must match identically, so can use 9555 * ninstr() to find the next possible matchpoint */ 9556 if (ST.Binfo.initial_exact > 0) { 9557 locinput = ninstr(locinput, loceol, 9558 (char *) ST.Binfo.matches, 9559 (char *) ST.Binfo.matches 9560 + ST.Binfo.initial_exact); 9561 } 9562 else { /* Otherwise find the next byte that matches, 9563 masked */ 9564 locinput = (char *) find_next_masked( 9565 (U8 *) locinput, (U8 *) loceol, 9566 ST.Binfo.first_byte_anded, 9567 ST.Binfo.first_byte_mask); 9568 /* Advance to the end of a multi-byte character */ 9569 if (utf8_target) { 9570 while ( locinput < loceol 9571 && UTF8_IS_CONTINUATION(*locinput)) 9572 { 9573 locinput++; 9574 } 9575 } 9576 } 9577 if ( locinput == NULL 9578 || locinput + ST.Binfo.min_length > loceol) 9579 { 9580 sayNO; 9581 } 9582 9583 /* Here, we have found a possible match point; if can't 9584 * rule it out, quit the loop so can check fully */ 9585 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) { 9586 break; 9587 } 9588 9589 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1; 9590 9591 } while (locinput <= ST.maxpos); 9592 } 9593 9594 if (locinput > ST.maxpos) 9595 sayNO; 9596 9597 n = (utf8_target) 9598 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput) 9599 : (STRLEN) (locinput - ST.oldloc); 9600 9601 9602 /* Here is at the beginning of a character that meets the mask 9603 * criteria. Need to make sure that some real possibility */ 9604 9605 if (n) { 9606 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is 9607 * at what may be the beginning of b; check that everything 9608 * between oldloc and locinput matches */ 9609 char *li = ST.oldloc; 9610 ST.count += n; 9611 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n) 9612 sayNO; 9613 assert(n == REG_INFTY || locinput == li); 9614 } 9615 } 9616 9617 curly_try_B_min: 9618 if (RE_PESSIMISTIC_PARENS) { 9619 (void)regcppush(rex, 0, maxopenparen); 9620 REGCP_SET(ST.lastcp); 9621 } 9622 CURLY_SETPAREN(ST.paren, ST.count); 9623 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, 9624 script_run_begin); 9625 NOT_REACHED; /* NOTREACHED */ 9626 9627 9628 curly_try_B_max: 9629 /* a successful greedy match: now try to match B */ 9630 if ( ST.Binfo.count <= 0 9631 || ( ST.Binfo.count > 0 9632 && locinput + ST.Binfo.min_length <= loceol 9633 && S_test_EXACTISH_ST(locinput, ST.Binfo))) 9634 { 9635 if (RE_PESSIMISTIC_PARENS) { 9636 (void)regcppush(rex, 0, maxopenparen); 9637 REGCP_SET(ST.lastcp); 9638 } 9639 CURLY_SETPAREN(ST.paren, ST.count); 9640 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, 9641 script_run_begin); 9642 NOT_REACHED; /* NOTREACHED */ 9643 } 9644 goto CURLY_B_all_failed; 9645 NOT_REACHED; /* NOTREACHED */ 9646 9647 case CURLY_B_max_fail: 9648 /* failed to find B in a greedy match */ 9649 9650 if (RE_PESSIMISTIC_PARENS) { 9651 REGCP_UNWIND(ST.lastcp); 9652 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 9653 } 9654 CURLY_B_all_failed: 9655 REGCP_UNWIND(ST.cp); 9656 if (ST.paren) { 9657 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9658 } 9659 /* back up. */ 9660 if (--ST.count < ST.min) 9661 sayNO; 9662 locinput = HOPc(locinput, -1); 9663 goto curly_try_B_max; 9664 9665 #undef ST 9666 9667 case END: /* last op of main pattern */ 9668 fake_end: 9669 if (cur_eval) { 9670 /* we've just finished A in /(??{A})B/; now continue with B */ 9671 is_accepted= false; 9672 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput); 9673 st->u.eval.prev_rex = rex_sv; /* inner */ 9674 9675 /* Save *all* the positions. */ 9676 st->u.eval.cp = regcppush(rex, 0, maxopenparen); 9677 rex_sv = CUR_EVAL.prev_rex; 9678 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 9679 SET_reg_curpm(rex_sv); 9680 rex = ReANY(rex_sv); 9681 rexi = RXi_GET(rex); 9682 9683 st->u.eval.prev_curlyx = cur_curlyx; 9684 cur_curlyx = CUR_EVAL.prev_curlyx; 9685 9686 REGCP_SET(st->u.eval.lastcp); 9687 9688 /* Restore parens of the outer rex without popping the 9689 * savestack */ 9690 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen); 9691 9692 st->u.eval.prev_eval = cur_eval; 9693 cur_eval = CUR_EVAL.prev_eval; 9694 DEBUG_EXECUTE_r( 9695 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n", 9696 depth, cur_eval);); 9697 if ( nochange_depth ) 9698 nochange_depth--; 9699 9700 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); 9701 9702 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ 9703 st->u.eval.prev_eval->u.eval.B, 9704 locinput, loceol, script_run_begin); 9705 } 9706 9707 if (locinput < reginfo->till) { 9708 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 9709 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 9710 PL_colors[4], 9711 (long)(locinput - startpos), 9712 (long)(reginfo->till - startpos), 9713 PL_colors[5])); 9714 9715 sayNO_SILENT; /* Cannot match: too short. */ 9716 } 9717 sayYES; /* Success! */ 9718 9719 case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH 9720 matches end at the right spot, required for 9721 variable length matches. */ 9722 if (match_end && locinput != match_end) 9723 { 9724 DEBUG_EXECUTE_r( 9725 Perl_re_exec_indentf( aTHX_ 9726 "%sLOOKBEHIND_END: subpattern failed...%s\n", 9727 depth, PL_colors[4], PL_colors[5])); 9728 sayNO; /* Variable length match didn't line up */ 9729 } 9730 /* FALLTHROUGH */ 9731 9732 case SUCCEED: /* successful SUSPEND/CURLYM and 9733 *lookahead* IFMATCH/UNLESSM*/ 9734 DEBUG_EXECUTE_r( 9735 Perl_re_exec_indentf( aTHX_ 9736 "%sSUCCEED: subpattern success...%s\n", 9737 depth, PL_colors[4], PL_colors[5])); 9738 sayYES; /* Success! */ 9739 9740 #undef ST 9741 #define ST st->u.ifmatch 9742 9743 case SUSPEND: /* (?>A) */ 9744 ST.wanted = 1; 9745 ST.start = locinput; 9746 ST.end = loceol; 9747 ST.count = 1; 9748 goto do_ifmatch; 9749 9750 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */ 9751 ST.wanted = 0; 9752 goto ifmatch_trivial_fail_test; 9753 9754 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */ 9755 ST.wanted = 1; 9756 ifmatch_trivial_fail_test: 9757 ST.prev_match_end= match_end; 9758 ST.count = NEXT_OFF(scan) + 1; /* next_off repurposed to be 9759 lookbehind count, requires 9760 non-zero flags */ 9761 if (! FLAGS(scan)) { /* 'flags' zero means lookahed */ 9762 9763 /* Lookahead starts here and ends at the normal place */ 9764 ST.start = locinput; 9765 ST.end = loceol; 9766 match_end = NULL; 9767 } 9768 else { 9769 PERL_UINT_FAST8_T back_count = FLAGS(scan); 9770 char * s; 9771 match_end = locinput; 9772 9773 /* Lookbehind can look beyond the current position */ 9774 ST.end = loceol; 9775 9776 /* ... and starts at the first place in the input that is in 9777 * the range of the possible start positions */ 9778 for (; ST.count > 0; ST.count--, back_count--) { 9779 s = HOPBACKc(locinput, back_count); 9780 if (s) { 9781 ST.start = s; 9782 goto do_ifmatch; 9783 } 9784 } 9785 9786 /* If the lookbehind doesn't start in the actual string, is a 9787 * trivial match failure */ 9788 match_end = ST.prev_match_end; 9789 if (logical) { 9790 logical = 0; 9791 sw = 1 - cBOOL(ST.wanted); 9792 } 9793 else if (ST.wanted) 9794 sayNO; 9795 9796 /* Here, we didn't want it to match, so is actually success */ 9797 next = scan + ARG1u(scan); 9798 if (next == scan) 9799 next = NULL; 9800 break; 9801 } 9802 9803 do_ifmatch: 9804 ST.me = scan; 9805 ST.logical = logical; 9806 logical = 0; /* XXX: reset state of logical once it has been saved into ST */ 9807 9808 /* execute body of (?...A) */ 9809 PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start, 9810 ST.end, script_run_begin); 9811 NOT_REACHED; /* NOTREACHED */ 9812 9813 { 9814 bool matched; 9815 9816 case IFMATCH_A_fail: /* body of (?...A) failed */ 9817 if (! ST.logical && ST.count > 1) { 9818 9819 /* It isn't a real failure until we've tried all starting 9820 * positions. Move to the next starting position and retry */ 9821 ST.count--; 9822 ST.start = HOPc(ST.start, 1); 9823 scan = ST.me; 9824 logical = ST.logical; 9825 goto do_ifmatch; 9826 } 9827 9828 /* Here, all starting positions have been tried. */ 9829 matched = FALSE; 9830 goto ifmatch_done; 9831 9832 case IFMATCH_A: /* body of (?...A) succeeded */ 9833 matched = TRUE; 9834 ifmatch_done: 9835 sw = matched == ST.wanted; 9836 match_end = ST.prev_match_end; 9837 if (! ST.logical && !sw) { 9838 sayNO; 9839 } 9840 9841 if (OP(ST.me) != SUSPEND) { 9842 /* restore old position except for (?>...) */ 9843 locinput = st->locinput; 9844 loceol = st->loceol; 9845 script_run_begin = st->sr0; 9846 } 9847 scan = ST.me + ARG1u(ST.me); 9848 if (scan == ST.me) 9849 scan = NULL; 9850 continue; /* execute B */ 9851 } 9852 9853 #undef ST 9854 9855 case LONGJMP: /* alternative with many branches compiles to 9856 * (BRANCHJ; EXACT ...; LONGJMP ) x N */ 9857 next = scan + ARG1u(scan); 9858 if (next == scan) 9859 next = NULL; 9860 break; 9861 9862 case COMMIT: /* (*COMMIT) */ 9863 reginfo->cutpoint = loceol; 9864 /* FALLTHROUGH */ 9865 9866 case PRUNE: /* (*PRUNE) */ 9867 if (FLAGS(scan)) 9868 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9869 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, 9870 script_run_begin); 9871 NOT_REACHED; /* NOTREACHED */ 9872 9873 case COMMIT_next_fail: 9874 no_final = 1; 9875 /* FALLTHROUGH */ 9876 sayNO; 9877 NOT_REACHED; /* NOTREACHED */ 9878 9879 case OPFAIL: /* (*FAIL) */ 9880 if (FLAGS(scan)) 9881 sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9882 if (logical) { 9883 /* deal with (?(?!)X|Y) properly, 9884 * make sure we trigger the no branch 9885 * of the trailing IFTHEN structure*/ 9886 sw= 0; 9887 break; 9888 } else { 9889 sayNO; 9890 } 9891 NOT_REACHED; /* NOTREACHED */ 9892 9893 #define ST st->u.mark 9894 case MARKPOINT: /* (*MARK:foo) */ 9895 ST.prev_mark = mark_state; 9896 ST.mark_name = sv_commit = sv_yes_mark 9897 = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9898 mark_state = st; 9899 ST.mark_loc = locinput; 9900 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, 9901 script_run_begin); 9902 NOT_REACHED; /* NOTREACHED */ 9903 9904 case MARKPOINT_next: 9905 mark_state = ST.prev_mark; 9906 sayYES; 9907 NOT_REACHED; /* NOTREACHED */ 9908 9909 case MARKPOINT_next_fail: 9910 if (popmark && sv_eq(ST.mark_name,popmark)) 9911 { 9912 if (ST.mark_loc > startpoint) 9913 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9914 popmark = NULL; /* we found our mark */ 9915 sv_commit = ST.mark_name; 9916 9917 DEBUG_EXECUTE_r({ 9918 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n", 9919 depth, 9920 PL_colors[4], SVfARG(sv_commit), PL_colors[5]); 9921 }); 9922 } 9923 mark_state = ST.prev_mark; 9924 sv_yes_mark = mark_state ? 9925 mark_state->u.mark.mark_name : NULL; 9926 sayNO; 9927 NOT_REACHED; /* NOTREACHED */ 9928 9929 case SKIP: /* (*SKIP) */ 9930 if (!FLAGS(scan)) { 9931 /* (*SKIP) : if we fail we cut here*/ 9932 ST.mark_name = NULL; 9933 ST.mark_loc = locinput; 9934 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, 9935 script_run_begin); 9936 } else { 9937 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 9938 otherwise do nothing. Meaning we need to scan 9939 */ 9940 regmatch_state *cur = mark_state; 9941 SV *find = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9942 9943 while (cur) { 9944 if ( sv_eq( cur->u.mark.mark_name, 9945 find ) ) 9946 { 9947 ST.mark_name = find; 9948 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, 9949 script_run_begin); 9950 } 9951 cur = cur->u.mark.prev_mark; 9952 } 9953 } 9954 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ 9955 break; 9956 9957 case SKIP_next_fail: 9958 if (ST.mark_name) { 9959 /* (*CUT:NAME) - Set up to search for the name as we 9960 collapse the stack*/ 9961 popmark = ST.mark_name; 9962 } else { 9963 /* (*CUT) - No name, we cut here.*/ 9964 if (ST.mark_loc > startpoint) 9965 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9966 /* but we set sv_commit to latest mark_name if there 9967 is one so they can test to see how things lead to this 9968 cut */ 9969 if (mark_state) 9970 sv_commit=mark_state->u.mark.mark_name; 9971 } 9972 no_final = 1; 9973 sayNO; 9974 NOT_REACHED; /* NOTREACHED */ 9975 #undef ST 9976 9977 case LNBREAK: /* \R */ 9978 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) { 9979 locinput += n; 9980 } else 9981 sayNO; 9982 break; 9983 9984 default: 9985 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n", 9986 PTR2UV(scan), OP(scan)); 9987 Perl_croak(aTHX_ "regexp memory corruption"); 9988 9989 /* this is a point to jump to in order to increment 9990 * locinput by one character */ 9991 increment_locinput: 9992 assert(!NEXTCHR_IS_EOS); 9993 if (utf8_target) { 9994 locinput += PL_utf8skip[nextbyte]; 9995 /* locinput is allowed to go 1 char off the end (signifying 9996 * EOS), but not 2+ */ 9997 if (locinput > loceol) 9998 sayNO; 9999 } 10000 else 10001 locinput++; 10002 break; 10003 10004 } /* end switch */ 10005 10006 /* switch break jumps here */ 10007 scan = next; /* prepare to execute the next op and ... */ 10008 continue; /* ... jump back to the top, reusing st */ 10009 /* NOTREACHED */ 10010 10011 push_yes_state: 10012 /* push a state that backtracks on success */ 10013 st->u.yes.prev_yes_state = yes_state; 10014 yes_state = st; 10015 /* FALLTHROUGH */ 10016 push_state: 10017 /* push a new regex state, then continue at scan */ 10018 { 10019 regmatch_state *newst; 10020 DECLARE_AND_GET_RE_DEBUG_FLAGS; 10021 10022 DEBUG_r( /* DEBUG_STACK_r */ 10023 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) { 10024 regmatch_state *cur = st; 10025 regmatch_state *curyes = yes_state; 10026 U32 i; 10027 regmatch_slab *slab = PL_regmatch_slab; 10028 for (i = 0; i < 3 && i <= depth; cur--,i++) { 10029 if (cur < SLAB_FIRST(slab)) { 10030 slab = slab->prev; 10031 cur = SLAB_LAST(slab); 10032 } 10033 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n", 10034 depth, 10035 i ? " " : "push", 10036 depth - i, REGNODE_NAME(cur->resume_state), 10037 (curyes == cur) ? "yes" : "" 10038 ); 10039 if (curyes == cur) 10040 curyes = cur->u.yes.prev_yes_state; 10041 } 10042 } else { 10043 DEBUG_STATE_pp("push") 10044 }); 10045 depth++; 10046 st->locinput = locinput; 10047 st->loceol = loceol; 10048 st->sr0 = script_run_begin; 10049 newst = st+1; 10050 if (newst > SLAB_LAST(PL_regmatch_slab)) 10051 newst = S_push_slab(aTHX); 10052 PL_regmatch_state = newst; 10053 10054 locinput = pushinput; 10055 loceol = pusheol; 10056 script_run_begin = pushsr0; 10057 st = newst; 10058 continue; 10059 /* NOTREACHED */ 10060 } 10061 } 10062 #ifdef SOLARIS_BAD_OPTIMIZER 10063 # undef PL_charclass 10064 #endif 10065 10066 /* 10067 * We get here only if there's trouble -- normally "case END" is 10068 * the terminating point. 10069 */ 10070 Perl_croak(aTHX_ "corrupted regexp pointers"); 10071 NOT_REACHED; /* NOTREACHED */ 10072 10073 yes: 10074 if (yes_state) { 10075 /* we have successfully completed a subexpression, but we must now 10076 * pop to the state marked by yes_state and continue from there */ 10077 assert(st != yes_state); 10078 #ifdef DEBUGGING 10079 while (st != yes_state) { 10080 st--; 10081 if (st < SLAB_FIRST(PL_regmatch_slab)) { 10082 PL_regmatch_slab = PL_regmatch_slab->prev; 10083 st = SLAB_LAST(PL_regmatch_slab); 10084 } 10085 DEBUG_STATE_r({ 10086 if (no_final) { 10087 DEBUG_STATE_pp("pop (no final)"); 10088 } else { 10089 DEBUG_STATE_pp("pop (yes)"); 10090 } 10091 }); 10092 depth--; 10093 } 10094 #else 10095 while (yes_state < SLAB_FIRST(PL_regmatch_slab) 10096 || yes_state > SLAB_LAST(PL_regmatch_slab)) 10097 { 10098 /* not in this slab, pop slab */ 10099 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); 10100 PL_regmatch_slab = PL_regmatch_slab->prev; 10101 st = SLAB_LAST(PL_regmatch_slab); 10102 } 10103 depth -= (st - yes_state); 10104 #endif 10105 st = yes_state; 10106 yes_state = st->u.yes.prev_yes_state; 10107 PL_regmatch_state = st; 10108 10109 if (no_final) { 10110 locinput= st->locinput; 10111 loceol= st->loceol; 10112 script_run_begin = st->sr0; 10113 } 10114 state_num = st->resume_state + no_final; 10115 goto reenter_switch; 10116 } 10117 10118 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n", 10119 PL_colors[4], PL_colors[5])); 10120 10121 if (reginfo->info_aux_eval) { 10122 /* each successfully executed (?{...}) block does the equivalent of 10123 * local $^R = do {...} 10124 * When popping the save stack, all these locals would be undone; 10125 * bypass this by setting the outermost saved $^R to the latest 10126 * value */ 10127 /* I don't know if this is needed or works properly now. 10128 * see code related to PL_replgv elsewhere in this file. 10129 * Yves 10130 */ 10131 if (oreplsv != GvSV(PL_replgv)) { 10132 sv_setsv(oreplsv, GvSV(PL_replgv)); 10133 SvSETMAGIC(oreplsv); 10134 } 10135 } 10136 result = 1; 10137 goto final_exit; 10138 10139 no: 10140 DEBUG_EXECUTE_r( 10141 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n", 10142 depth, 10143 PL_colors[4], PL_colors[5]) 10144 ); 10145 10146 no_silent: 10147 if (no_final) { 10148 if (yes_state) { 10149 goto yes; 10150 } else { 10151 goto final_exit; 10152 } 10153 } 10154 if (depth) { 10155 /* there's a previous state to backtrack to */ 10156 st--; 10157 if (st < SLAB_FIRST(PL_regmatch_slab)) { 10158 PL_regmatch_slab = PL_regmatch_slab->prev; 10159 st = SLAB_LAST(PL_regmatch_slab); 10160 } 10161 PL_regmatch_state = st; 10162 locinput= st->locinput; 10163 loceol= st->loceol; 10164 script_run_begin = st->sr0; 10165 10166 DEBUG_STATE_pp("pop"); 10167 depth--; 10168 if (yes_state == st) 10169 yes_state = st->u.yes.prev_yes_state; 10170 10171 state_num = st->resume_state + 1; /* failure = success + 1 */ 10172 PERL_ASYNC_CHECK(); 10173 goto reenter_switch; 10174 } 10175 result = 0; 10176 10177 final_exit: 10178 if (rex->intflags & PREGf_VERBARG_SEEN) { 10179 SV *sv_err = get_sv("REGERROR", 1); 10180 SV *sv_mrk = get_sv("REGMARK", 1); 10181 if (result) { 10182 sv_commit = &PL_sv_no; 10183 if (!sv_yes_mark) 10184 sv_yes_mark = &PL_sv_yes; 10185 } else { 10186 if (!sv_commit) 10187 sv_commit = &PL_sv_yes; 10188 sv_yes_mark = &PL_sv_no; 10189 } 10190 assert(sv_err); 10191 assert(sv_mrk); 10192 sv_setsv(sv_err, sv_commit); 10193 sv_setsv(sv_mrk, sv_yes_mark); 10194 } 10195 10196 10197 if (last_pushed_cv) { 10198 dSP; 10199 /* see "Some notes about MULTICALL" above */ 10200 POP_MULTICALL; 10201 PERL_UNUSED_VAR(SP); 10202 } 10203 else 10204 LEAVE_SCOPE(orig_savestack_ix); 10205 10206 assert(!result || locinput - reginfo->strbeg >= 0); 10207 return result ? locinput - reginfo->strbeg : -1; 10208 } 10209 10210 /* 10211 - regrepeat - repeatedly match something simple, report how many 10212 * 10213 * What 'simple' means is a node which can be the operand of a quantifier like 10214 * '+', or {1,3} 10215 * 10216 * startposp - pointer to a pointer to the start position. This is updated 10217 * to point to the byte following the highest successful 10218 * match. 10219 * p - the regnode to be repeatedly matched against. 10220 * loceol - pointer to the end position beyond which we aren't supposed to 10221 * look. 10222 * reginfo - struct holding match state, such as utf8_target 10223 * max - maximum number of things to match. 10224 * depth - (for debugging) backtracking depth. 10225 */ 10226 STATIC I32 10227 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, 10228 char * loceol, regmatch_info *const reginfo, I32 max comma_pDEPTH) 10229 { 10230 char *scan; /* Pointer to current position in target string */ 10231 I32 c; 10232 char *this_eol = loceol; /* potentially adjusted version. */ 10233 I32 hardcount = 0; /* How many matches so far */ 10234 bool utf8_target = reginfo->is_utf8_target; 10235 unsigned int to_complement = 0; /* Invert the result? */ 10236 char_class_number_ classnum; 10237 10238 PERL_ARGS_ASSERT_REGREPEAT; 10239 10240 /* This routine is structured so that we switch on the input OP. Each OP 10241 * case: statement contains a loop to repeatedly apply the OP, advancing 10242 * the input until it fails, or reaches the end of the input, or until it 10243 * reaches the upper limit of matches. */ 10244 10245 scan = *startposp; 10246 if (max == REG_INFTY) /* This is a special marker to go to the platform's 10247 max */ 10248 max = I32_MAX; 10249 else if (! utf8_target && this_eol - scan > max) 10250 this_eol = scan + max; 10251 10252 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> 10253 * down to the maximum of how far we should go in it (but leaving it set to 10254 * the real end if the maximum permissible would take us beyond that). 10255 * This allows us to make the loop exit condition that we haven't gone past 10256 * <this_eol> to also mean that we haven't exceeded the max permissible 10257 * count, saving a test each time through the loop. But it assumes that 10258 * the OP matches a single byte, which is true for most of the OPs below 10259 * when applied to a non-UTF-8 target. Those relatively few OPs that don't 10260 * have this characteristic have to compensate. 10261 * 10262 * There is no such adjustment for UTF-8 targets, since the number of bytes 10263 * per character can vary. OPs will have to test both that the count is 10264 * less than the max permissible (using <hardcount> to keep track), and 10265 * that we are still within the bounds of the string (using <this_eol>. A 10266 * few OPs match a single byte no matter what the encoding. They can omit 10267 * the max test if, for the UTF-8 case, they do the adjustment that was 10268 * skipped above. 10269 * 10270 * Thus, the code above sets things up for the common case; and exceptional 10271 * cases need extra work; the common case is to make sure <scan> doesn't go 10272 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the 10273 * count doesn't exceed the maximum permissible */ 10274 10275 switch (with_t_UTF8ness(OP(p), utf8_target)) { 10276 SV * anyofh_list; 10277 10278 case REG_ANY_t8: 10279 while (scan < this_eol && hardcount < max && *scan != '\n') { 10280 scan += UTF8SKIP(scan); 10281 hardcount++; 10282 } 10283 break; 10284 10285 case REG_ANY_tb: 10286 scan = (char *) memchr(scan, '\n', this_eol - scan); 10287 if (! scan) { 10288 scan = this_eol; 10289 } 10290 break; 10291 10292 case SANY_t8: 10293 while (scan < this_eol && hardcount < max) { 10294 scan += UTF8SKIP(scan); 10295 hardcount++; 10296 } 10297 break; 10298 10299 case SANY_tb: 10300 scan = this_eol; 10301 break; 10302 10303 case EXACT_REQ8_tb: 10304 case LEXACT_REQ8_tb: 10305 case EXACTFU_REQ8_tb: 10306 break; 10307 10308 case EXACTL_t8: 10309 if (UTF8_IS_ABOVE_LATIN1(*scan)) { 10310 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol); 10311 } 10312 /* FALLTHROUGH */ 10313 10314 case EXACTL_tb: 10315 case EXACTFL_t8: 10316 case EXACTFL_tb: 10317 case EXACTFLU8_t8: 10318 case EXACTFLU8_tb: 10319 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10320 /* FALLTHROUGH */ 10321 10322 case EXACT_REQ8_t8: 10323 case LEXACT_REQ8_t8: 10324 case EXACTFU_REQ8_t8: 10325 case LEXACT_t8: 10326 case LEXACT_tb: 10327 case EXACT_t8: 10328 case EXACT_tb: 10329 case EXACTF_t8: 10330 case EXACTF_tb: 10331 case EXACTFAA_NO_TRIE_t8: 10332 case EXACTFAA_NO_TRIE_tb: 10333 case EXACTFAA_t8: 10334 case EXACTFAA_tb: 10335 case EXACTFU_t8: 10336 case EXACTFU_tb: 10337 case EXACTFUP_t8: 10338 case EXACTFUP_tb: 10339 10340 { 10341 struct next_matchable_info Binfo; 10342 PERL_UINT_FAST8_T definitive_len; 10343 10344 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); 10345 10346 /* Set up termination info, and quit if we can rule out that we've 10347 * gotten a match of the termination criteria */ 10348 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo) 10349 || scan + Binfo.min_length > this_eol 10350 || ! S_test_EXACTISH_ST(scan, Binfo)) 10351 { 10352 break; 10353 } 10354 10355 definitive_len = Binfo.initial_definitive; 10356 10357 /* Here there are potential matches, and the first byte(s) matched our 10358 * filter 10359 * 10360 * If we got a definitive match of some initial bytes, there is no 10361 * possibility of false positives as far as it got */ 10362 if (definitive_len > 0) { 10363 10364 /* If as far as it got is the maximum possible, there were no false 10365 * positives at all. Since we have everything set up, see how many 10366 * repeats there are. */ 10367 if (definitive_len >= Binfo.max_length) { 10368 10369 /* We've already found one match */ 10370 scan += definitive_len; 10371 hardcount++; 10372 10373 /* If want more than the one match, and there is room for more, 10374 * see if there are any */ 10375 if (hardcount < max && scan + definitive_len <= this_eol) { 10376 10377 /* If the character is only a single byte long, just span 10378 * all such bytes. */ 10379 if (definitive_len == 1) { 10380 const char * orig_scan = scan; 10381 10382 if (this_eol - (scan - hardcount) > max) { 10383 this_eol = scan - hardcount + max; 10384 } 10385 10386 /* Use different routines depending on whether it's an 10387 * exact match or matches with a mask */ 10388 if (Binfo.initial_exact == 1) { 10389 scan = (char *) find_span_end((U8 *) scan, 10390 (U8 *) this_eol, 10391 Binfo.matches[0]); 10392 } 10393 else { 10394 scan = (char *) find_span_end_mask( 10395 (U8 *) scan, 10396 (U8 *) this_eol, 10397 Binfo.first_byte_anded, 10398 Binfo.first_byte_mask); 10399 } 10400 10401 hardcount += scan - orig_scan; 10402 } 10403 else { /* Here, the full character definitive match is more 10404 than one byte */ 10405 while ( hardcount < max 10406 && scan + definitive_len <= this_eol 10407 && S_test_EXACTISH_ST(scan, Binfo)) 10408 { 10409 scan += definitive_len; 10410 hardcount++; 10411 } 10412 } 10413 } 10414 10415 break; 10416 } /* End of a full character is definitively matched */ 10417 10418 /* Here, an initial portion of the character matched definitively, 10419 * and the rest matched as well, but could have false positives */ 10420 10421 do { 10422 int i; 10423 U8 * matches = Binfo.matches; 10424 10425 /* The first bytes were definitive. Look at the remaining */ 10426 for (i = 0; i < Binfo.count; i++) { 10427 if (memEQ(scan + definitive_len, 10428 matches + definitive_len, 10429 Binfo.lengths[i] - definitive_len)) 10430 { 10431 goto found_a_completion; 10432 } 10433 10434 matches += Binfo.lengths[i]; 10435 } 10436 10437 /* Didn't find anything to complete our initial match. Stop 10438 * here */ 10439 break; 10440 10441 found_a_completion: 10442 10443 /* Here, matched a full character, Include it in the result, 10444 * and then look to see if the next char matches */ 10445 hardcount++; 10446 scan += Binfo.lengths[i]; 10447 10448 } while ( hardcount < max 10449 && scan + definitive_len < this_eol 10450 && S_test_EXACTISH_ST(scan, Binfo)); 10451 10452 /* Here, have advanced as far as possible */ 10453 break; 10454 } /* End of found some initial bytes that definitively matched */ 10455 10456 /* Here, we can't rule out that we have found the beginning of 'B', but 10457 * there were no initial bytes that could rule out anything 10458 * definitively. Use brute force to examine all the possibilities */ 10459 while (scan < this_eol && hardcount < max) { 10460 int i; 10461 U8 * matches = Binfo.matches; 10462 10463 for (i = 0; i < Binfo.count; i++) { 10464 if (memEQ(scan, matches, Binfo.lengths[i])) { 10465 goto found1; 10466 } 10467 10468 matches += Binfo.lengths[i]; 10469 } 10470 10471 break; 10472 10473 found1: 10474 hardcount++; 10475 scan += Binfo.lengths[i]; 10476 } 10477 10478 break; 10479 } 10480 10481 case ANYOFPOSIXL_t8: 10482 case ANYOFL_t8: 10483 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10484 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p); 10485 10486 /* FALLTHROUGH */ 10487 case ANYOFD_t8: 10488 case ANYOF_t8: 10489 while ( hardcount < max 10490 && scan < this_eol 10491 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 10492 { 10493 scan += UTF8SKIP(scan); 10494 hardcount++; 10495 } 10496 break; 10497 10498 case ANYOFPOSIXL_tb: 10499 case ANYOFL_tb: 10500 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10501 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p); 10502 /* FALLTHROUGH */ 10503 10504 case ANYOFD_tb: 10505 case ANYOF_tb: 10506 if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) { 10507 while ( scan < this_eol 10508 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0)) 10509 scan++; 10510 } 10511 else { 10512 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan))) 10513 scan++; 10514 } 10515 break; 10516 10517 case ANYOFM_t8: 10518 if (this_eol - scan > max) { 10519 10520 /* We didn't adjust <this_eol> at the beginning of this routine 10521 * because is UTF-8, but it is actually ok to do so, since here, to 10522 * match, 1 char == 1 byte. */ 10523 this_eol = scan + max; 10524 } 10525 /* FALLTHROUGH */ 10526 10527 case ANYOFM_tb: 10528 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, 10529 (U8) ARG1u(p), FLAGS(p)); 10530 break; 10531 10532 case NANYOFM_t8: 10533 while ( hardcount < max 10534 && scan < this_eol 10535 && (*scan & FLAGS(p)) != ARG1u(p)) 10536 { 10537 scan += UTF8SKIP(scan); 10538 hardcount++; 10539 } 10540 break; 10541 10542 case NANYOFM_tb: 10543 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, 10544 (U8) ARG1u(p), FLAGS(p)); 10545 break; 10546 10547 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */ 10548 case ANYOFHb_tb: 10549 case ANYOFHbbm_tb: 10550 case ANYOFHr_tb: 10551 case ANYOFHs_tb: 10552 break; 10553 10554 case ANYOFH_t8: 10555 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10556 while ( hardcount < max 10557 && scan < this_eol 10558 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10559 && _invlist_contains_cp(anyofh_list, 10560 utf8_to_uvchr_buf((U8 *) scan, 10561 (U8 *) this_eol, 10562 NULL))) 10563 { 10564 scan += UTF8SKIP(scan); 10565 hardcount++; 10566 } 10567 break; 10568 10569 case ANYOFHb_t8: 10570 /* we know the first byte must be the FLAGS field */ 10571 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10572 while ( hardcount < max 10573 && scan < this_eol 10574 && (U8) *scan == ANYOF_FLAGS(p) 10575 && _invlist_contains_cp(anyofh_list, 10576 utf8_to_uvchr_buf((U8 *) scan, 10577 (U8 *) this_eol, 10578 NULL))) 10579 { 10580 scan += UTF8SKIP(scan); 10581 hardcount++; 10582 } 10583 break; 10584 10585 case ANYOFHbbm_t8: 10586 while ( hardcount < max 10587 && scan + 1 < this_eol 10588 && (U8) *scan == ANYOF_FLAGS(p) 10589 && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap, 10590 (U8) scan[1] & UTF_CONTINUATION_MASK)) 10591 { 10592 scan += 2; /* This node only matces 2-byte UTF-8 */ 10593 hardcount++; 10594 } 10595 break; 10596 10597 case ANYOFHr_t8: 10598 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10599 while ( hardcount < max 10600 && scan < this_eol 10601 && inRANGE(NATIVE_UTF8_TO_I8(*scan), 10602 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), 10603 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) 10604 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10605 && _invlist_contains_cp(anyofh_list, 10606 utf8_to_uvchr_buf((U8 *) scan, 10607 (U8 *) this_eol, 10608 NULL))) 10609 { 10610 scan += UTF8SKIP(scan); 10611 hardcount++; 10612 } 10613 break; 10614 10615 case ANYOFHs_t8: 10616 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10617 while ( hardcount < max 10618 && scan + FLAGS(p) < this_eol 10619 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p)) 10620 && _invlist_contains_cp(anyofh_list, 10621 utf8_to_uvchr_buf((U8 *) scan, 10622 (U8 *) this_eol, 10623 NULL))) 10624 { 10625 scan += UTF8SKIP(scan); 10626 hardcount++; 10627 } 10628 break; 10629 10630 case ANYOFR_t8: 10631 while ( hardcount < max 10632 && scan < this_eol 10633 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10634 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 10635 (U8 *) this_eol, 10636 NULL), 10637 ANYOFRbase(p), ANYOFRdelta(p))) 10638 { 10639 scan += UTF8SKIP(scan); 10640 hardcount++; 10641 } 10642 break; 10643 10644 case ANYOFR_tb: 10645 while ( hardcount < max 10646 && scan < this_eol 10647 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 10648 { 10649 scan++; 10650 hardcount++; 10651 } 10652 break; 10653 10654 case ANYOFRb_t8: 10655 while ( hardcount < max 10656 && scan < this_eol 10657 && (U8) *scan == ANYOF_FLAGS(p) 10658 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 10659 (U8 *) this_eol, 10660 NULL), 10661 ANYOFRbase(p), ANYOFRdelta(p))) 10662 { 10663 scan += UTF8SKIP(scan); 10664 hardcount++; 10665 } 10666 break; 10667 10668 case ANYOFRb_tb: 10669 while ( hardcount < max 10670 && scan < this_eol 10671 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 10672 { 10673 scan++; 10674 hardcount++; 10675 } 10676 break; 10677 10678 /* The argument (FLAGS) to all the POSIX node types is the class number */ 10679 10680 case NPOSIXL_tb: 10681 to_complement = 1; 10682 /* FALLTHROUGH */ 10683 10684 case POSIXL_tb: 10685 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10686 while ( scan < this_eol 10687 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) 10688 { 10689 scan++; 10690 } 10691 break; 10692 10693 case NPOSIXL_t8: 10694 to_complement = 1; 10695 /* FALLTHROUGH */ 10696 10697 case POSIXL_t8: 10698 while ( hardcount < max && scan < this_eol 10699 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), 10700 (U8 *) scan, 10701 (U8 *) this_eol))) 10702 { 10703 scan += UTF8SKIP(scan); 10704 hardcount++; 10705 } 10706 break; 10707 10708 case POSIXD_tb: 10709 /* FALLTHROUGH */ 10710 10711 case POSIXA_t8: 10712 if (this_eol - scan > max) { 10713 10714 /* We didn't adjust <this_eol> at the beginning of this routine 10715 * because is UTF-8, but it is actually ok to do so, since here, to 10716 * match, 1 char == 1 byte. */ 10717 this_eol = scan + max; 10718 } 10719 /* FALLTHROUGH */ 10720 10721 case POSIXA_tb: 10722 while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) { 10723 scan++; 10724 } 10725 break; 10726 10727 case NPOSIXD_tb: 10728 /* FALLTHROUGH */ 10729 10730 case NPOSIXA_tb: 10731 while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) { 10732 scan++; 10733 } 10734 break; 10735 10736 case NPOSIXA_t8: 10737 10738 /* The complement of something that matches only ASCII matches all 10739 * non-ASCII, plus everything in ASCII that isn't in the class. */ 10740 while ( hardcount < max && scan < this_eol 10741 && ( ! isASCII_utf8_safe(scan, loceol) 10742 || ! generic_isCC_A_((U8) *scan, FLAGS(p)))) 10743 { 10744 scan += UTF8SKIP(scan); 10745 hardcount++; 10746 } 10747 break; 10748 10749 case NPOSIXU_tb: 10750 to_complement = 1; 10751 /* FALLTHROUGH */ 10752 10753 case POSIXU_tb: 10754 while ( scan < this_eol 10755 && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p)))) 10756 { 10757 scan++; 10758 } 10759 break; 10760 10761 case NPOSIXU_t8: 10762 case NPOSIXD_t8: 10763 to_complement = 1; 10764 /* FALLTHROUGH */ 10765 10766 case POSIXD_t8: 10767 case POSIXU_t8: 10768 classnum = (char_class_number_) FLAGS(p); 10769 switch (classnum) { 10770 default: 10771 while ( hardcount < max && scan < this_eol 10772 && to_complement 10773 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum], 10774 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL)))) 10775 { 10776 scan += UTF8SKIP(scan); 10777 hardcount++; 10778 } 10779 break; 10780 10781 /* For the classes below, the knowledge of how to handle every code 10782 * point is compiled into Perl via a macro. This code is written 10783 * for making the loops as tight as possible. It could be 10784 * refactored to save space instead. */ 10785 10786 case CC_ENUM_SPACE_: 10787 while ( hardcount < max 10788 && scan < this_eol 10789 && (to_complement 10790 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol)))) 10791 { 10792 scan += UTF8SKIP(scan); 10793 hardcount++; 10794 } 10795 break; 10796 case CC_ENUM_BLANK_: 10797 while ( hardcount < max 10798 && scan < this_eol 10799 && (to_complement 10800 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol)))) 10801 { 10802 scan += UTF8SKIP(scan); 10803 hardcount++; 10804 } 10805 break; 10806 case CC_ENUM_XDIGIT_: 10807 while ( hardcount < max 10808 && scan < this_eol 10809 && (to_complement 10810 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol)))) 10811 { 10812 scan += UTF8SKIP(scan); 10813 hardcount++; 10814 } 10815 break; 10816 case CC_ENUM_VERTSPACE_: 10817 while ( hardcount < max 10818 && scan < this_eol 10819 && (to_complement 10820 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol)))) 10821 { 10822 scan += UTF8SKIP(scan); 10823 hardcount++; 10824 } 10825 break; 10826 case CC_ENUM_CNTRL_: 10827 while ( hardcount < max 10828 && scan < this_eol 10829 && (to_complement 10830 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol)))) 10831 { 10832 scan += UTF8SKIP(scan); 10833 hardcount++; 10834 } 10835 break; 10836 } 10837 break; 10838 10839 case LNBREAK_t8: 10840 while ( hardcount < max && scan < this_eol 10841 && (c=is_LNBREAK_utf8_safe(scan, this_eol))) 10842 { 10843 scan += c; 10844 hardcount++; 10845 } 10846 break; 10847 10848 case LNBREAK_tb: 10849 /* LNBREAK can match one or two latin chars, which is ok, but we have 10850 * to use hardcount in this situation, and throw away the adjustment to 10851 * <this_eol> done before the switch statement */ 10852 while ( 10853 hardcount < max && scan < loceol 10854 && (c = is_LNBREAK_latin1_safe(scan, loceol)) 10855 ) { 10856 scan += c; 10857 hardcount++; 10858 } 10859 break; 10860 10861 default: 10862 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized" 10863 " node type %d='%s'", OP(p), REGNODE_NAME(OP(p))); 10864 NOT_REACHED; /* NOTREACHED */ 10865 10866 } 10867 10868 if (hardcount) 10869 c = hardcount; 10870 else 10871 c = scan - *startposp; 10872 *startposp = scan; 10873 10874 DEBUG_r({ 10875 DECLARE_AND_GET_RE_DEBUG_FLAGS; 10876 DEBUG_EXECUTE_r({ 10877 SV * const prop = sv_newmortal(); 10878 regprop(prog, prop, p, reginfo, NULL); 10879 Perl_re_exec_indentf( aTHX_ 10880 "%s can match %" IVdf " times out of %" IVdf "...\n", 10881 depth, SvPVX_const(prop),(IV)c,(IV)max); 10882 }); 10883 }); 10884 10885 return(c); 10886 } 10887 10888 /* 10889 - reginclass - determine if a character falls into a character class 10890 10891 n is the ANYOF-type regnode 10892 p is the target string 10893 p_end points to one byte beyond the end of the target string 10894 utf8_target tells whether p is in UTF-8. 10895 10896 Returns true if matched; false otherwise. 10897 10898 Note that this can be a synthetic start class, a combination of various 10899 nodes, so things you think might be mutually exclusive, such as locale, 10900 aren't. It can match both locale and non-locale 10901 10902 */ 10903 10904 STATIC bool 10905 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) 10906 { 10907 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs)) 10908 ? 0 10909 : ANYOF_FLAGS(n); 10910 bool match = FALSE; 10911 UV c = *p; 10912 10913 PERL_ARGS_ASSERT_REGINCLASS; 10914 10915 /* If c is not already the code point, get it. Note that 10916 * UTF8_IS_INVARIANT() works even if not in UTF-8 */ 10917 if (! UTF8_IS_INVARIANT(c) && utf8_target) { 10918 STRLEN c_len = 0; 10919 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT; 10920 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY); 10921 if (c_len == (STRLEN)-1) { 10922 _force_out_malformed_utf8_message(p, p_end, 10923 utf8n_flags, 10924 1 /* 1 means die */ ); 10925 NOT_REACHED; /* NOTREACHED */ 10926 } 10927 if ( c > 255 10928 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL) 10929 && ! (flags & ANYOFL_UTF8_LOCALE_REQD)) 10930 { 10931 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); 10932 } 10933 } 10934 10935 /* If this character is potentially in the bitmap, check it */ 10936 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) { 10937 if (ANYOF_BITMAP_TEST(n, c)) 10938 match = TRUE; 10939 else if ( (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) 10940 && OP(n) == ANYOFD 10941 && ! utf8_target 10942 && ! isASCII(c)) 10943 { 10944 match = TRUE; 10945 } 10946 else if (flags & ANYOF_LOCALE_FLAGS) { 10947 if ( (flags & ANYOFL_FOLD) 10948 && c < 256 10949 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) 10950 { 10951 match = TRUE; 10952 } 10953 else if ( ANYOF_POSIXL_TEST_ANY_SET(n) 10954 && c <= U8_MAX /* param to isFOO_lc() */ 10955 ) { 10956 /* The data structure is arranged so bits 0, 2, 4, ... are set 10957 * if the class includes the Posix character class given by 10958 * bit/2; and 1, 3, 5, ... are set if the class includes the 10959 * complemented Posix class given by int(bit/2), so the 10960 * remainder modulo 2 tells us if to complement or not. 10961 * 10962 * Note that this code assumes that all the classes are closed 10963 * under folding. For example, if a character matches \w, then 10964 * its fold does too; and vice versa. This should be true for 10965 * any well-behaved locale for all the currently defined Posix 10966 * classes, except for :lower: and :upper:, which are handled 10967 * by the pseudo-class :cased: which matches if either of the 10968 * other two does. To get rid of this assumption, an outer 10969 * loop could be used below to iterate over both the source 10970 * character, and its fold (if different) */ 10971 10972 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n); 10973 10974 do { 10975 /* Find the next set bit indicating a class to try matching 10976 * against */ 10977 U8 bit_pos = lsbit_pos32(posixl_bits); 10978 10979 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) { 10980 match = TRUE; 10981 break; 10982 } 10983 10984 /* Remove this class from consideration; repeat */ 10985 POSIXL_CLEAR(posixl_bits, bit_pos); 10986 } while(posixl_bits != 0); 10987 } 10988 } 10989 } 10990 10991 /* If the bitmap didn't (or couldn't) match, and something outside the 10992 * bitmap could match, try that. */ 10993 if (!match) { 10994 if ( c >= NUM_ANYOF_CODE_POINTS 10995 && ANYOF_ONLY_HAS_BITMAP(n) 10996 && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)) 10997 { 10998 /* In this case, the ARG is set up so that the final bit indicates 10999 * whether it matches or not */ 11000 match = ARG1u(n) & 1; 11001 } 11002 else 11003 /* Here, the main way it could match is if the code point is 11004 * outside the bitmap and an inversion list exists to handle such 11005 * things. The other ways all occur when a flag is set to indicate 11006 * we need special handling. That handling doesn't come in to 11007 * effect for ANYOFD nodes unless the target string is UTF-8 and 11008 * that matters to code point being matched. */ 11009 if ( c >= NUM_ANYOF_CODE_POINTS 11010 || ( (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) 11011 && ( UNLIKELY(OP(n) != ANYOFD) 11012 || (utf8_target && ! isASCII_uvchr(c) 11013 # if NUM_ANYOF_CODE_POINTS > 256 11014 && c < 256 11015 # endif 11016 )))) 11017 { 11018 /* Here, we have an inversion list for outside-the-bitmap code 11019 * points and/or the flag is set indicating special handling is 11020 * needed. The flag is set when there is auxiliary data beyond the 11021 * normal inversion list, or if there is the possibility of a 11022 * special Turkic locale match, even without auxiliary data. 11023 * 11024 * First check if there is an inversion list and/or auxiliary data. 11025 * */ 11026 if (ANYOF_HAS_AUX(n)) { 11027 SV* only_utf8_locale = NULL; 11028 11029 /* This call will return in 'definition' the union of the base 11030 * non-bitmap inversion list, if any, plus the deferred 11031 * definitions of user-defined properties, if any. It croaks 11032 * if there is such a property but which still has no definition 11033 * available */ 11034 SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, 11035 &only_utf8_locale, NULL); 11036 if (definition) { 11037 /* Most likely is the outside-the-bitmap inversion list. */ 11038 if (_invlist_contains_cp(definition, c)) { 11039 match = TRUE; 11040 } 11041 else /* Failing that, hardcode the two tests for a Turkic 11042 locale */ 11043 if ( UNLIKELY(IN_UTF8_TURKIC_LOCALE) 11044 && isALPHA_FOLD_EQ(*p, 'i')) 11045 { 11046 /* Turkish locales have these hard-coded rules 11047 * overriding normal ones */ 11048 if (*p == 'i') { 11049 if (_invlist_contains_cp(definition, 11050 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 11051 { 11052 match = TRUE; 11053 } 11054 } 11055 else if (_invlist_contains_cp(definition, 11056 LATIN_SMALL_LETTER_DOTLESS_I)) 11057 { 11058 match = TRUE; 11059 } 11060 } 11061 } 11062 11063 if ( UNLIKELY(only_utf8_locale) 11064 && UNLIKELY(IN_UTF8_CTYPE_LOCALE) 11065 && ! match) 11066 { 11067 match = _invlist_contains_cp(only_utf8_locale, c); 11068 } 11069 } 11070 11071 /* In a Turkic locale under folding, hard-code the I i case pair 11072 * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES 11073 * flag set unless [Ii] were match possibilities */ 11074 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE) && ! match) { 11075 if (utf8_target) { 11076 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 11077 if (ANYOF_BITMAP_TEST(n, 'i')) { 11078 match = TRUE; 11079 } 11080 } 11081 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) { 11082 if (ANYOF_BITMAP_TEST(n, 'I')) { 11083 match = TRUE; 11084 } 11085 } 11086 } 11087 11088 #if NUM_ANYOF_CODE_POINTS > 256 11089 /* Larger bitmap means these special cases aren't handled 11090 * outside the bitmap above. */ 11091 if (*p == 'i') { 11092 if (ANYOF_BITMAP_TEST(n, 11093 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 11094 { 11095 match = TRUE; 11096 } 11097 } 11098 else if (*p == 'I') { 11099 if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) { 11100 match = TRUE; 11101 } 11102 } 11103 #endif 11104 } 11105 } 11106 11107 if ( UNICODE_IS_SUPER(c) 11108 && (flags & ANYOF_WARN_SUPER__shared) 11109 && OP(n) != ANYOFD 11110 && ckWARN_d(WARN_NON_UNICODE)) 11111 { 11112 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 11113 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c); 11114 } 11115 } 11116 11117 #if ANYOF_INVERT != 1 11118 /* Depending on compiler optimization cBOOL takes time, so if don't have to 11119 * use it, don't */ 11120 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, 11121 #endif 11122 11123 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ 11124 return (flags & ANYOF_INVERT) ^ match; 11125 } 11126 11127 STATIC U8 * 11128 S_reghop3(U8 *s, SSize_t off, const U8* lim) 11129 { 11130 /* return the position 'off' UTF-8 characters away from 's', forward if 11131 * 'off' >= 0, backwards if negative. But don't go outside of position 11132 * 'lim', which better be < s if off < 0 */ 11133 11134 PERL_ARGS_ASSERT_REGHOP3; 11135 11136 if (off >= 0) { 11137 while (off-- && s < lim) { 11138 /* XXX could check well-formedness here */ 11139 U8 *new_s = s + UTF8SKIP(s); 11140 if (new_s > lim) /* lim may be in the middle of a long character */ 11141 return s; 11142 s = new_s; 11143 } 11144 } 11145 else { 11146 while (off++ && s > lim) { 11147 s--; 11148 if (UTF8_IS_CONTINUED(*s)) { 11149 while (s > lim && UTF8_IS_CONTINUATION(*s)) 11150 s--; 11151 if (! UTF8_IS_START(*s)) { 11152 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 11153 } 11154 } 11155 /* XXX could check well-formedness here */ 11156 } 11157 } 11158 return s; 11159 } 11160 11161 STATIC U8 * 11162 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) 11163 { 11164 PERL_ARGS_ASSERT_REGHOP4; 11165 11166 return utf8_hop_safe(s, off, llim, rlim); 11167 } 11168 11169 /* like reghop3, but returns NULL on overrun, rather than returning last 11170 * char pos */ 11171 11172 STATIC U8 * 11173 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim) 11174 { 11175 PERL_ARGS_ASSERT_REGHOPMAYBE3; 11176 11177 if (off >= 0) { 11178 while (off-- && s < lim) { 11179 /* XXX could check well-formedness here */ 11180 s += UTF8SKIP(s); 11181 } 11182 if (off >= 0) 11183 return NULL; 11184 } 11185 else { 11186 while (off++ && s > lim) { 11187 s--; 11188 if (UTF8_IS_CONTINUED(*s)) { 11189 while (s > lim && UTF8_IS_CONTINUATION(*s)) 11190 s--; 11191 if (! UTF8_IS_START(*s)) { 11192 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 11193 } 11194 } 11195 /* XXX could check well-formedness here */ 11196 } 11197 if (off <= 0) 11198 return NULL; 11199 } 11200 return s; 11201 } 11202 11203 11204 /* when executing a regex that may have (?{}), extra stuff needs setting 11205 up that will be visible to the called code, even before the current 11206 match has finished. In particular: 11207 11208 * $_ is localised to the SV currently being matched; 11209 * pos($_) is created if necessary, ready to be updated on each call-out 11210 to code; 11211 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm 11212 isn't set until the current pattern is successfully finished), so that 11213 $1 etc of the match-so-far can be seen; 11214 * save the old values of subbeg etc of the current regex, and set then 11215 to the current string (again, this is normally only done at the end 11216 of execution) 11217 */ 11218 11219 static void 11220 S_setup_eval_state(pTHX_ regmatch_info *const reginfo) 11221 { 11222 MAGIC *mg; 11223 regexp *const rex = ReANY(reginfo->prog); 11224 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; 11225 11226 eval_state->rex = rex; 11227 eval_state->sv = reginfo->sv; 11228 11229 if (reginfo->sv) { 11230 /* Make $_ available to executed code. */ 11231 if (reginfo->sv != DEFSV) { 11232 SAVE_DEFSV; 11233 DEFSV_set(reginfo->sv); 11234 } 11235 /* will be dec'd by S_cleanup_regmatch_info_aux */ 11236 SvREFCNT_inc_NN(reginfo->sv); 11237 11238 if (!(mg = mg_find_mglob(reginfo->sv))) { 11239 /* prepare for quick setting of pos */ 11240 mg = sv_magicext_mglob(reginfo->sv); 11241 mg->mg_len = -1; 11242 } 11243 eval_state->pos_magic = mg; 11244 eval_state->pos = mg->mg_len; 11245 eval_state->pos_flags = mg->mg_flags; 11246 } 11247 else 11248 eval_state->pos_magic = NULL; 11249 11250 if (!PL_reg_curpm) { 11251 /* PL_reg_curpm is a fake PMOP that we can attach the current 11252 * regex to and point PL_curpm at, so that $1 et al are visible 11253 * within a /(?{})/. It's just allocated once per interpreter the 11254 * first time its needed */ 11255 Newxz(PL_reg_curpm, 1, PMOP); 11256 #ifdef USE_ITHREADS 11257 { 11258 SV* const repointer = &PL_sv_undef; 11259 /* this regexp is also owned by the new PL_reg_curpm, which 11260 will try to free it. */ 11261 av_push(PL_regex_padav, repointer); 11262 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav); 11263 PL_regex_pad = AvARRAY(PL_regex_padav); 11264 } 11265 #endif 11266 } 11267 SET_reg_curpm(reginfo->prog); 11268 eval_state->curpm = PL_curpm; 11269 PL_curpm_under = PL_curpm; 11270 PL_curpm = PL_reg_curpm; 11271 if (RXp_MATCH_COPIED(rex)) { 11272 /* Here is a serious problem: we cannot rewrite subbeg, 11273 since it may be needed if this match fails. Thus 11274 $` inside (?{}) could fail... */ 11275 eval_state->subbeg = RXp_SUBBEG(rex); 11276 eval_state->sublen = RXp_SUBLEN(rex); 11277 eval_state->suboffset = RXp_SUBOFFSET(rex); 11278 eval_state->subcoffset = RXp_SUBCOFFSET(rex); 11279 #ifdef PERL_ANY_COW 11280 eval_state->saved_copy = RXp_SAVED_COPY(rex); 11281 #endif 11282 RXp_MATCH_COPIED_off(rex); 11283 } 11284 else 11285 eval_state->subbeg = NULL; 11286 RXp_SUBBEG(rex) = (char *)reginfo->strbeg; 11287 RXp_SUBOFFSET(rex) = 0; 11288 RXp_SUBCOFFSET(rex) = 0; 11289 RXp_SUBLEN(rex) = reginfo->strend - reginfo->strbeg; 11290 } 11291 11292 11293 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ 11294 11295 static void 11296 S_cleanup_regmatch_info_aux(pTHX_ void *arg) 11297 { 11298 regmatch_info_aux *aux = (regmatch_info_aux *) arg; 11299 regmatch_info_aux_eval *eval_state = aux->info_aux_eval; 11300 regmatch_slab *s; 11301 11302 Safefree(aux->poscache); 11303 11304 if (eval_state) { 11305 11306 /* undo the effects of S_setup_eval_state() */ 11307 11308 if (eval_state->subbeg) { 11309 regexp * const rex = eval_state->rex; 11310 RXp_SUBBEG(rex) = eval_state->subbeg; 11311 RXp_SUBLEN(rex) = eval_state->sublen; 11312 RXp_SUBOFFSET(rex) = eval_state->suboffset; 11313 RXp_SUBCOFFSET(rex) = eval_state->subcoffset; 11314 #ifdef PERL_ANY_COW 11315 RXp_SAVED_COPY(rex) = eval_state->saved_copy; 11316 #endif 11317 RXp_MATCH_COPIED_on(rex); 11318 } 11319 if (eval_state->pos_magic) 11320 { 11321 eval_state->pos_magic->mg_len = eval_state->pos; 11322 eval_state->pos_magic->mg_flags = 11323 (eval_state->pos_magic->mg_flags & ~MGf_BYTES) 11324 | (eval_state->pos_flags & MGf_BYTES); 11325 } 11326 11327 PL_curpm = eval_state->curpm; 11328 SvREFCNT_dec(eval_state->sv); 11329 } 11330 11331 PL_regmatch_state = aux->old_regmatch_state; 11332 PL_regmatch_slab = aux->old_regmatch_slab; 11333 11334 /* free all slabs above current one - this must be the last action 11335 * of this function, as aux and eval_state are allocated within 11336 * slabs and may be freed here */ 11337 11338 s = PL_regmatch_slab->next; 11339 if (s) { 11340 PL_regmatch_slab->next = NULL; 11341 while (s) { 11342 regmatch_slab * const osl = s; 11343 s = s->next; 11344 Safefree(osl); 11345 } 11346 } 11347 } 11348 11349 11350 STATIC void 11351 S_to_utf8_substr(pTHX_ regexp *prog) 11352 { 11353 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile 11354 * on the converted value */ 11355 11356 int i = 1; 11357 11358 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; 11359 11360 do { 11361 if (prog->substrs->data[i].substr 11362 && !prog->substrs->data[i].utf8_substr) { 11363 SV* const sv = newSVsv(prog->substrs->data[i].substr); 11364 prog->substrs->data[i].utf8_substr = sv; 11365 sv_utf8_upgrade(sv); 11366 if (SvVALID(prog->substrs->data[i].substr)) { 11367 if (SvTAIL(prog->substrs->data[i].substr)) { 11368 /* Trim the trailing \n that fbm_compile added last 11369 time. */ 11370 SvCUR_set(sv, SvCUR(sv) - 1); 11371 /* Whilst this makes the SV technically "invalid" (as its 11372 buffer is no longer followed by "\0") when fbm_compile() 11373 adds the "\n" back, a "\0" is restored. */ 11374 fbm_compile(sv, FBMcf_TAIL); 11375 } else 11376 fbm_compile(sv, 0); 11377 } 11378 if (prog->substrs->data[i].substr == prog->check_substr) 11379 prog->check_utf8 = sv; 11380 } 11381 } while (i--); 11382 } 11383 11384 STATIC bool 11385 S_to_byte_substr(pTHX_ regexp *prog) 11386 { 11387 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile 11388 * on the converted value; returns FALSE if can't be converted. */ 11389 11390 int i = 1; 11391 11392 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; 11393 11394 do { 11395 if (prog->substrs->data[i].utf8_substr 11396 && !prog->substrs->data[i].substr) { 11397 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); 11398 if (! sv_utf8_downgrade(sv, TRUE)) { 11399 SvREFCNT_dec_NN(sv); 11400 return FALSE; 11401 } 11402 if (SvVALID(prog->substrs->data[i].utf8_substr)) { 11403 if (SvTAIL(prog->substrs->data[i].utf8_substr)) { 11404 /* Trim the trailing \n that fbm_compile added last 11405 time. */ 11406 SvCUR_set(sv, SvCUR(sv) - 1); 11407 fbm_compile(sv, FBMcf_TAIL); 11408 } else 11409 fbm_compile(sv, 0); 11410 } 11411 prog->substrs->data[i].substr = sv; 11412 if (prog->substrs->data[i].utf8_substr == prog->check_utf8) 11413 prog->check_substr = sv; 11414 } 11415 } while (i--); 11416 11417 return TRUE; 11418 } 11419 11420 #ifndef PERL_IN_XSUB_RE 11421 11422 bool 11423 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) 11424 { 11425 /* Temporary helper function for toke.c. Verify that the code point 'cp' 11426 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in 11427 * the larger string bounded by 'strbeg' and 'strend'. 11428 * 11429 * 'cp' needs to be assigned (if not, a future version of the Unicode 11430 * Standard could make it something that combines with adjacent characters, 11431 * so code using it would then break), and there has to be a GCB break 11432 * before and after the character. */ 11433 11434 11435 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; 11436 const U8 * prev_cp_start; 11437 11438 PERL_ARGS_ASSERT_IS_GRAPHEME; 11439 11440 if ( UNLIKELY(UNICODE_IS_SUPER(cp)) 11441 || UNLIKELY(UNICODE_IS_NONCHAR(cp))) 11442 { 11443 /* These are considered graphemes */ 11444 return TRUE; 11445 } 11446 11447 /* Otherwise, unassigned code points are forbidden */ 11448 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST( 11449 _invlist_search(PL_Assigned_invlist, cp)))) 11450 { 11451 return FALSE; 11452 } 11453 11454 cp_gcb_val = getGCB_VAL_CP(cp); 11455 11456 /* Find the GCB value of the previous code point in the input */ 11457 prev_cp_start = utf8_hop_back(s, -1, strbeg); 11458 if (UNLIKELY(prev_cp_start == s)) { 11459 prev_cp_gcb_val = GCB_EDGE; 11460 } 11461 else { 11462 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend); 11463 } 11464 11465 /* And check that is a grapheme boundary */ 11466 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s, 11467 TRUE /* is UTF-8 encoded */ )) 11468 { 11469 return FALSE; 11470 } 11471 11472 /* Similarly verify there is a break between the current character and the 11473 * following one */ 11474 s += UTF8SKIP(s); 11475 if (s >= strend) { 11476 next_cp_gcb_val = GCB_EDGE; 11477 } 11478 else { 11479 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend); 11480 } 11481 11482 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE); 11483 } 11484 11485 /* 11486 =for apidoc_section $unicode 11487 11488 =for apidoc isSCRIPT_RUN 11489 11490 Returns a bool as to whether or not the sequence of bytes from C<s> up to but 11491 not including C<send> form a "script run". C<utf8_target> is TRUE iff the 11492 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for 11493 two degenerate cases given below, this function returns TRUE iff all code 11494 points in it come from any combination of three "scripts" given by the Unicode 11495 "Script Extensions" property: Common, Inherited, and possibly one other. 11496 Additionally all decimal digits must come from the same consecutive sequence of 11497 10. 11498 11499 For example, if all the characters in the sequence are Greek, or Common, or 11500 Inherited, this function will return TRUE, provided any decimal digits in it 11501 are from the same block of digits in Common. (These are the ASCII digits 11502 "0".."9" and additionally a block for full width forms of these, and several 11503 others used in mathematical notation.) For scripts (unlike Greek) that have 11504 their own digits defined this will accept either digits from that set or from 11505 one of the Common digit sets, but not a combination of the two. Some scripts, 11506 such as Arabic, have more than one set of digits. All digits must come from 11507 the same set for this function to return TRUE. 11508 11509 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE 11510 contain the script found, using the C<SCX_enum> typedef. Its value will be 11511 C<SCX_INVALID> if the function returns FALSE. 11512 11513 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for) 11514 will be C<SCX_INVALID>. 11515 11516 If the sequence contains a single code point which is unassigned to a character 11517 in the version of Unicode being used, the function will return TRUE, and the 11518 script will be C<SCX_Unknown>. Any other combination of unassigned code points 11519 in the input sequence will result in the function treating the input as not 11520 being a script run. 11521 11522 The returned script will be C<SCX_Inherited> iff all the code points in it are 11523 from the Inherited script. 11524 11525 Otherwise, the returned script will be C<SCX_Common> iff all the code points in 11526 it are from the Inherited or Common scripts. 11527 11528 =cut 11529 11530 */ 11531 11532 bool 11533 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) 11534 { 11535 /* Basically, it looks at each character in the sequence to see if the 11536 * above conditions are met; if not it fails. It uses an inversion map to 11537 * find the enum corresponding to the script of each character. But this 11538 * is complicated by the fact that a few code points can be in any of 11539 * several scripts. The data has been constructed so that there are 11540 * additional enum values (all negative) for these situations. The 11541 * absolute value of those is an index into another table which contains 11542 * pointers to auxiliary tables for each such situation. Each aux array 11543 * lists all the scripts for the given situation. There is another, 11544 * parallel, table that gives the number of entries in each aux table. 11545 * These are all defined in charclass_invlists.h */ 11546 11547 /* XXX Here are the additional things UTS 39 says could be done: 11548 * 11549 * Forbid sequences of the same nonspacing mark 11550 * 11551 * Check to see that all the characters are in the sets of exemplar 11552 * characters for at least one language in the Unicode Common Locale Data 11553 * Repository [CLDR]. */ 11554 11555 11556 /* Things that match /\d/u */ 11557 SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_]; 11558 UV * decimals_array = invlist_array(decimals_invlist); 11559 11560 /* What code point is the digit '0' of the script run? (0 meaning FALSE if 11561 * not currently known) */ 11562 UV zero_of_run = 0; 11563 11564 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */ 11565 SCX_enum script_of_char = SCX_INVALID; 11566 11567 /* If the script remains not fully determined from iteration to iteration, 11568 * this is the current intersection of the possiblities. */ 11569 SCX_enum * intersection = NULL; 11570 PERL_UINT_FAST8_T intersection_len = 0; 11571 11572 bool retval = TRUE; 11573 SCX_enum * ret_script = NULL; 11574 11575 assert(send >= s); 11576 11577 PERL_ARGS_ASSERT_ISSCRIPT_RUN; 11578 11579 /* All code points in 0..255 are either Common or Latin, so must be a 11580 * script run. We can return immediately unless we need to know which 11581 * script it is. */ 11582 if (! utf8_target && LIKELY(send > s)) { 11583 if (ret_script == NULL) { 11584 return TRUE; 11585 } 11586 11587 /* If any character is Latin, the run is Latin */ 11588 while (s < send) { 11589 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) { 11590 *ret_script = SCX_Latin; 11591 return TRUE; 11592 } 11593 } 11594 11595 /* Here, all are Common */ 11596 *ret_script = SCX_Common; 11597 return TRUE; 11598 } 11599 11600 /* Look at each character in the sequence */ 11601 while (s < send) { 11602 /* If the current character being examined is a digit, this is the code 11603 * point of the zero for its sequence of 10 */ 11604 UV zero_of_char; 11605 11606 UV cp; 11607 11608 /* The code allows all scripts to use the ASCII digits. This is 11609 * because they are in the Common script. Hence any ASCII ones found 11610 * are ok, unless and until a digit from another set has already been 11611 * encountered. digit ranges in Common are not similarly blessed) */ 11612 if (UNLIKELY(isDIGIT(*s))) { 11613 if (UNLIKELY(script_of_run == SCX_Unknown)) { 11614 retval = FALSE; 11615 break; 11616 } 11617 if (zero_of_run) { 11618 if (zero_of_run != '0') { 11619 retval = FALSE; 11620 break; 11621 } 11622 } 11623 else { 11624 zero_of_run = '0'; 11625 } 11626 s++; 11627 continue; 11628 } 11629 11630 /* Here, isn't an ASCII digit. Find the code point of the character */ 11631 if (! UTF8_IS_INVARIANT(*s)) { 11632 Size_t len; 11633 cp = valid_utf8_to_uvchr((U8 *) s, &len); 11634 s += len; 11635 } 11636 else { 11637 cp = *(s++); 11638 } 11639 11640 /* If is within the range [+0 .. +9] of the script's zero, it also is a 11641 * digit in that script. We can skip the rest of this code for this 11642 * character. */ 11643 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) { 11644 continue; 11645 } 11646 11647 /* Find the character's script. The correct values are hard-coded here 11648 * for small-enough code points. */ 11649 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely 11650 unlikely to change */ 11651 if ( cp > 255 11652 || ( isALPHA_L1(cp) 11653 && LIKELY(cp != MICRO_SIGN_NATIVE))) 11654 { 11655 script_of_char = SCX_Latin; 11656 } 11657 else { 11658 script_of_char = SCX_Common; 11659 } 11660 } 11661 else { 11662 script_of_char = _Perl_SCX_invmap[ 11663 _invlist_search(PL_SCX_invlist, cp)]; 11664 } 11665 11666 /* We arbitrarily accept a single unassigned character, but not in 11667 * combination with anything else, and not a run of them. */ 11668 if ( UNLIKELY(script_of_run == SCX_Unknown) 11669 || UNLIKELY( script_of_run != SCX_INVALID 11670 && script_of_char == SCX_Unknown)) 11671 { 11672 retval = FALSE; 11673 break; 11674 } 11675 11676 /* For the first character, or the run is inherited, the run's script 11677 * is set to the char's */ 11678 if ( UNLIKELY(script_of_run == SCX_INVALID) 11679 || UNLIKELY(script_of_run == SCX_Inherited)) 11680 { 11681 script_of_run = script_of_char; 11682 } 11683 11684 /* For the character's script to be Unknown, it must be the first 11685 * character in the sequence (for otherwise a test above would have 11686 * prevented us from reaching here), and we have set the run's script 11687 * to it. Nothing further to be done for this character */ 11688 if (UNLIKELY(script_of_char == SCX_Unknown)) { 11689 continue; 11690 } 11691 11692 /* We accept 'inherited' script characters currently even at the 11693 * beginning. (We know that no characters in Inherited are digits, or 11694 * we'd have to check for that) */ 11695 if (UNLIKELY(script_of_char == SCX_Inherited)) { 11696 continue; 11697 } 11698 11699 /* If the run so far is Common, and the new character isn't, change the 11700 * run's script to that of this character */ 11701 if (script_of_run == SCX_Common && script_of_char != SCX_Common) { 11702 script_of_run = script_of_char; 11703 } 11704 11705 /* Now we can see if the script of the new character is the same as 11706 * that of the run */ 11707 if (LIKELY(script_of_char == script_of_run)) { 11708 /* By far the most common case */ 11709 goto scripts_match; 11710 } 11711 11712 /* Here, the script of the run isn't Common. But characters in Common 11713 * match any script */ 11714 if (script_of_char == SCX_Common) { 11715 goto scripts_match; 11716 } 11717 11718 #ifndef HAS_SCX_AUX_TABLES 11719 11720 /* Too early a Unicode version to have a code point belonging to more 11721 * than one script, so, if the scripts don't exactly match, fail */ 11722 PERL_UNUSED_VAR(intersection_len); 11723 retval = FALSE; 11724 break; 11725 11726 #else 11727 11728 /* Here there is no exact match between the character's script and the 11729 * run's. And we've handled the special cases of scripts Unknown, 11730 * Inherited, and Common. 11731 * 11732 * Negative script numbers signify that the value may be any of several 11733 * scripts, and we need to look at auxiliary information to make our 11734 * determination. But if both are non-negative, we can fail now */ 11735 if (LIKELY(script_of_char >= 0)) { 11736 const SCX_enum * search_in; 11737 PERL_UINT_FAST8_T search_in_len; 11738 PERL_UINT_FAST8_T i; 11739 11740 if (LIKELY(script_of_run >= 0)) { 11741 retval = FALSE; 11742 break; 11743 } 11744 11745 /* Use the previously constructed set of possible scripts, if any. 11746 * */ 11747 if (intersection) { 11748 search_in = intersection; 11749 search_in_len = intersection_len; 11750 } 11751 else { 11752 search_in = SCX_AUX_TABLE_ptrs[-script_of_run]; 11753 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run]; 11754 } 11755 11756 for (i = 0; i < search_in_len; i++) { 11757 if (search_in[i] == script_of_char) { 11758 script_of_run = script_of_char; 11759 goto scripts_match; 11760 } 11761 } 11762 11763 retval = FALSE; 11764 break; 11765 } 11766 else if (LIKELY(script_of_run >= 0)) { 11767 /* script of character could be one of several, but run is a single 11768 * script */ 11769 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char]; 11770 const PERL_UINT_FAST8_T search_in_len 11771 = SCX_AUX_TABLE_lengths[-script_of_char]; 11772 PERL_UINT_FAST8_T i; 11773 11774 for (i = 0; i < search_in_len; i++) { 11775 if (search_in[i] == script_of_run) { 11776 script_of_char = script_of_run; 11777 goto scripts_match; 11778 } 11779 } 11780 11781 retval = FALSE; 11782 break; 11783 } 11784 else { 11785 /* Both run and char could be in one of several scripts. If the 11786 * intersection is empty, then this character isn't in this script 11787 * run. Otherwise, we need to calculate the intersection to use 11788 * for future iterations of the loop, unless we are already at the 11789 * final character */ 11790 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char]; 11791 const PERL_UINT_FAST8_T char_len 11792 = SCX_AUX_TABLE_lengths[-script_of_char]; 11793 const SCX_enum * search_run; 11794 PERL_UINT_FAST8_T run_len; 11795 11796 SCX_enum * new_overlap = NULL; 11797 PERL_UINT_FAST8_T i, j; 11798 11799 if (intersection) { 11800 search_run = intersection; 11801 run_len = intersection_len; 11802 } 11803 else { 11804 search_run = SCX_AUX_TABLE_ptrs[-script_of_run]; 11805 run_len = SCX_AUX_TABLE_lengths[-script_of_run]; 11806 } 11807 11808 intersection_len = 0; 11809 11810 for (i = 0; i < run_len; i++) { 11811 for (j = 0; j < char_len; j++) { 11812 if (search_run[i] == search_char[j]) { 11813 11814 /* Here, the script at i,j matches. That means this 11815 * character is in the run. But continue on to find 11816 * the complete intersection, for the next loop 11817 * iteration, and for the digit check after it. 11818 * 11819 * On the first found common script, we malloc space 11820 * for the intersection list for the worst case of the 11821 * intersection, which is the minimum of the number of 11822 * scripts remaining in each set. */ 11823 if (intersection_len == 0) { 11824 Newx(new_overlap, 11825 MIN(run_len - i, char_len - j), 11826 SCX_enum); 11827 } 11828 new_overlap[intersection_len++] = search_run[i]; 11829 } 11830 } 11831 } 11832 11833 /* Here we've looked through everything. If they have no scripts 11834 * in common, not a run */ 11835 if (intersection_len == 0) { 11836 retval = FALSE; 11837 break; 11838 } 11839 11840 /* If there is only a single script in common, set to that. 11841 * Otherwise, use the intersection going forward */ 11842 Safefree(intersection); 11843 intersection = NULL; 11844 if (intersection_len == 1) { 11845 script_of_run = script_of_char = new_overlap[0]; 11846 Safefree(new_overlap); 11847 new_overlap = NULL; 11848 } 11849 else { 11850 intersection = new_overlap; 11851 } 11852 } 11853 11854 #endif 11855 11856 scripts_match: 11857 11858 /* Here, the script of the character is compatible with that of the 11859 * run. That means that in most cases, it continues the script run. 11860 * Either it and the run match exactly, or one or both can be in any of 11861 * several scripts, and the intersection is not empty. However, if the 11862 * character is a decimal digit, it could still mean failure if it is 11863 * from the wrong sequence of 10. So, we need to look at if it's a 11864 * digit. We've already handled the 10 digits [0-9], and the next 11865 * lowest one is this one: */ 11866 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) { 11867 continue; /* Not a digit; this character is part of the run */ 11868 } 11869 11870 /* If we have a definitive '0' for the script of this character, we 11871 * know that for this to be a digit, it must be in the range of +0..+9 11872 * of that zero. */ 11873 if ( script_of_char >= 0 11874 && (zero_of_char = script_zeros[script_of_char])) 11875 { 11876 if (! withinCOUNT(cp, zero_of_char, 9)) { 11877 continue; /* Not a digit; this character is part of the run 11878 */ 11879 } 11880 11881 } 11882 else { /* Need to look up if this character is a digit or not */ 11883 SSize_t index_of_zero_of_char; 11884 index_of_zero_of_char = _invlist_search(decimals_invlist, cp); 11885 if ( UNLIKELY(index_of_zero_of_char < 0) 11886 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char)) 11887 { 11888 continue; /* Not a digit; this character is part of the run. 11889 */ 11890 } 11891 11892 zero_of_char = decimals_array[index_of_zero_of_char]; 11893 } 11894 11895 /* Here, the character is a decimal digit, and the zero of its sequence 11896 * of 10 is in 'zero_of_char'. If we already have a zero for this run, 11897 * they better be the same. */ 11898 if (zero_of_run) { 11899 if (zero_of_run != zero_of_char) { 11900 retval = FALSE; 11901 break; 11902 } 11903 } 11904 else { /* Otherwise we now have a zero for this run */ 11905 zero_of_run = zero_of_char; 11906 } 11907 } /* end of looping through CLOSESR text */ 11908 11909 Safefree(intersection); 11910 11911 if (ret_script != NULL) { 11912 if (retval) { 11913 *ret_script = script_of_run; 11914 } 11915 else { 11916 *ret_script = SCX_INVALID; 11917 } 11918 } 11919 11920 return retval; 11921 } 11922 #endif /* ifndef PERL_IN_XSUB_RE */ 11923 11924 /* Buffer logic. */ 11925 SV* 11926 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 11927 const U32 flags) 11928 { 11929 PERL_ARGS_ASSERT_REG_NAMED_BUFF; 11930 11931 PERL_UNUSED_ARG(value); 11932 11933 if (flags & RXapif_FETCH) { 11934 return reg_named_buff_fetch(rx, key, flags); 11935 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 11936 Perl_croak_no_modify(); 11937 return NULL; 11938 } else if (flags & RXapif_EXISTS) { 11939 return reg_named_buff_exists(rx, key, flags) 11940 ? &PL_sv_yes 11941 : &PL_sv_no; 11942 } else if (flags & RXapif_REGNAMES) { 11943 return reg_named_buff_all(rx, flags); 11944 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 11945 return reg_named_buff_scalar(rx, flags); 11946 } else { 11947 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 11948 return NULL; 11949 } 11950 } 11951 11952 SV* 11953 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 11954 const U32 flags) 11955 { 11956 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; 11957 PERL_UNUSED_ARG(lastkey); 11958 11959 if (flags & RXapif_FIRSTKEY) 11960 return reg_named_buff_firstkey(rx, flags); 11961 else if (flags & RXapif_NEXTKEY) 11962 return reg_named_buff_nextkey(rx, flags); 11963 else { 11964 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", 11965 (int)flags); 11966 return NULL; 11967 } 11968 } 11969 11970 SV* 11971 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, 11972 const U32 flags) 11973 { 11974 SV *ret; 11975 struct regexp *const rx = ReANY(r); 11976 11977 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; 11978 11979 if (rx && RXp_PAREN_NAMES(rx)) { 11980 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); 11981 if (he_str) { 11982 IV i; 11983 SV* sv_dat=HeVAL(he_str); 11984 I32 *nums=(I32*)SvPVX(sv_dat); 11985 AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL; 11986 for ( i=0; i<SvIVX(sv_dat); i++ ) { 11987 if ((I32)(rx->nparens) >= nums[i] 11988 && RXp_OFFS_VALID(rx,nums[i])) 11989 { 11990 ret = newSVpvs(""); 11991 Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE); 11992 if (!retarray) 11993 return ret; 11994 } else { 11995 if (retarray) 11996 ret = newSV_type(SVt_NULL); 11997 } 11998 if (retarray) 11999 av_push_simple(retarray, ret); 12000 } 12001 if (retarray) 12002 return newRV_noinc(MUTABLE_SV(retarray)); 12003 } 12004 } 12005 return NULL; 12006 } 12007 12008 bool 12009 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, 12010 const U32 flags) 12011 { 12012 struct regexp *const rx = ReANY(r); 12013 12014 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; 12015 12016 if (rx && RXp_PAREN_NAMES(rx)) { 12017 if (flags & RXapif_ALL) { 12018 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); 12019 } else { 12020 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); 12021 if (sv) { 12022 SvREFCNT_dec_NN(sv); 12023 return TRUE; 12024 } else { 12025 return FALSE; 12026 } 12027 } 12028 } else { 12029 return FALSE; 12030 } 12031 } 12032 12033 SV* 12034 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) 12035 { 12036 struct regexp *const rx = ReANY(r); 12037 12038 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; 12039 12040 if ( rx && RXp_PAREN_NAMES(rx) ) { 12041 (void)hv_iterinit(RXp_PAREN_NAMES(rx)); 12042 12043 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); 12044 } else { 12045 return FALSE; 12046 } 12047 } 12048 12049 SV* 12050 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) 12051 { 12052 struct regexp *const rx = ReANY(r); 12053 DECLARE_AND_GET_RE_DEBUG_FLAGS; 12054 12055 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; 12056 12057 if (rx && RXp_PAREN_NAMES(rx)) { 12058 HV *hv = RXp_PAREN_NAMES(rx); 12059 HE *temphe; 12060 while ( (temphe = hv_iternext_flags(hv, 0)) ) { 12061 IV i; 12062 IV parno = 0; 12063 SV* sv_dat = HeVAL(temphe); 12064 I32 *nums = (I32*)SvPVX(sv_dat); 12065 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 12066 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] && 12067 RXp_OFFS_VALID(rx,nums[i])) 12068 { 12069 parno = nums[i]; 12070 break; 12071 } 12072 } 12073 if (parno || flags & RXapif_ALL) { 12074 return newSVhek(HeKEY_hek(temphe)); 12075 } 12076 } 12077 } 12078 return NULL; 12079 } 12080 12081 SV* 12082 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) 12083 { 12084 SV *ret; 12085 AV *av; 12086 SSize_t length; 12087 struct regexp *const rx = ReANY(r); 12088 12089 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; 12090 12091 if (rx && RXp_PAREN_NAMES(rx)) { 12092 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 12093 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); 12094 } else if (flags & RXapif_ONE) { 12095 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); 12096 av = MUTABLE_AV(SvRV(ret)); 12097 length = av_count(av); 12098 SvREFCNT_dec_NN(ret); 12099 return newSViv(length); 12100 } else { 12101 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", 12102 (int)flags); 12103 return NULL; 12104 } 12105 } 12106 return &PL_sv_undef; 12107 } 12108 12109 SV* 12110 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) 12111 { 12112 struct regexp *const rx = ReANY(r); 12113 AV *av = newAV(); 12114 12115 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; 12116 12117 if (rx && RXp_PAREN_NAMES(rx)) { 12118 HV *hv= RXp_PAREN_NAMES(rx); 12119 HE *temphe; 12120 (void)hv_iterinit(hv); 12121 while ( (temphe = hv_iternext_flags(hv, 0)) ) { 12122 IV i; 12123 IV parno = 0; 12124 SV* sv_dat = HeVAL(temphe); 12125 I32 *nums = (I32*)SvPVX(sv_dat); 12126 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 12127 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] && 12128 RXp_OFFS_VALID(rx,nums[i])) 12129 { 12130 parno = nums[i]; 12131 break; 12132 } 12133 } 12134 if (parno || flags & RXapif_ALL) { 12135 av_push_simple(av, newSVhek(HeKEY_hek(temphe))); 12136 } 12137 } 12138 } 12139 12140 return newRV_noinc(MUTABLE_SV(av)); 12141 } 12142 12143 void 12144 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren, 12145 SV * const sv) 12146 { 12147 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; 12148 Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0); 12149 } 12150 12151 #ifndef PERL_IN_XSUB_RE 12152 12153 void 12154 Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren, 12155 SV * const sv, U32 flags) 12156 { 12157 struct regexp *const rx = ReANY(re); 12158 char *s = NULL; 12159 SSize_t i,t = 0; 12160 SSize_t s1, t1; 12161 I32 n = paren; 12162 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens; 12163 12164 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS; 12165 12166 if ( n == RX_BUFF_IDX_CARET_PREMATCH 12167 || n == RX_BUFF_IDX_CARET_FULLMATCH 12168 || n == RX_BUFF_IDX_CARET_POSTMATCH 12169 ) 12170 { 12171 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 12172 if (!keepcopy) { 12173 /* on something like 12174 * $r = qr/.../; 12175 * /$qr/p; 12176 * the KEEPCOPY is set on the PMOP rather than the regex */ 12177 if (PL_curpm && re == PM_GETRE(PL_curpm)) 12178 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 12179 } 12180 if (!keepcopy) 12181 goto ret_undef; 12182 } 12183 12184 if (!RXp_SUBBEG(rx)) 12185 goto ret_undef; 12186 12187 if (n == RX_BUFF_IDX_CARET_FULLMATCH) 12188 /* no need to distinguish between them any more */ 12189 n = RX_BUFF_IDX_FULLMATCH; 12190 12191 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) 12192 && (i = RXp_OFFS_START(rx,0)) != -1) 12193 { 12194 /* $`, ${^PREMATCH} */ 12195 s = RXp_SUBBEG(rx); 12196 } 12197 else 12198 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) 12199 && (t = RXp_OFFS_END(rx,0)) != -1) 12200 { 12201 /* $', ${^POSTMATCH} */ 12202 s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + t; 12203 i = RXp_SUBLEN(rx) + RXp_SUBOFFSET(rx) - t; 12204 } 12205 else /* when flags is true we do an absolute lookup, and compare against rx->nparens */ 12206 if (inRANGE(n, 0, flags ? (I32)rx->nparens : logical_nparens)) { 12207 I32 *map = (!flags && n) ? rx->logical_to_parno : NULL; 12208 I32 true_parno = map ? map[n] : n; 12209 do { 12210 if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1) && 12211 ((t1 = RXp_OFFS_END(rx,true_parno)) != -1)) 12212 { 12213 /* $&, ${^MATCH}, $1 ... */ 12214 i = t1 - s1; 12215 s = RXp_SUBBEG(rx) + s1 - RXp_SUBOFFSET(rx); 12216 goto found_it; 12217 } 12218 else if (map) { 12219 true_parno = rx->parno_to_logical_next[true_parno]; 12220 } 12221 else { 12222 break; 12223 } 12224 } while (true_parno); 12225 goto ret_undef; 12226 } else { 12227 goto ret_undef; 12228 } 12229 12230 found_it: 12231 assert(s >= RXp_SUBBEG(rx)); 12232 assert((STRLEN)RXp_SUBLEN(rx) >= (STRLEN)((s - RXp_SUBBEG(rx)) + i) ); 12233 if (i >= 0) { 12234 #ifdef NO_TAINT_SUPPORT 12235 sv_setpvn(sv, s, i); 12236 #else 12237 const int oldtainted = TAINT_get; 12238 TAINT_NOT; 12239 sv_setpvn(sv, s, i); 12240 TAINT_set(oldtainted); 12241 #endif 12242 if (RXp_MATCH_UTF8(rx)) 12243 SvUTF8_on(sv); 12244 else 12245 SvUTF8_off(sv); 12246 if (TAINTING_get) { 12247 if (RXp_MATCH_TAINTED(rx)) { 12248 if (SvTYPE(sv) >= SVt_PVMG) { 12249 MAGIC* const mg = SvMAGIC(sv); 12250 MAGIC* mgt; 12251 TAINT; 12252 SvMAGIC_set(sv, mg->mg_moremagic); 12253 SvTAINT(sv); 12254 if ((mgt = SvMAGIC(sv))) { 12255 mg->mg_moremagic = mgt; 12256 SvMAGIC_set(sv, mg); 12257 } 12258 } else { 12259 TAINT; 12260 SvTAINT(sv); 12261 } 12262 } else 12263 SvTAINTED_off(sv); 12264 } 12265 } else { 12266 ret_undef: 12267 sv_set_undef(sv); 12268 return; 12269 } 12270 } 12271 12272 #endif 12273 12274 void 12275 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 12276 SV const * const value) 12277 { 12278 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; 12279 12280 PERL_UNUSED_ARG(rx); 12281 PERL_UNUSED_ARG(paren); 12282 PERL_UNUSED_ARG(value); 12283 12284 if (!PL_localizing) 12285 Perl_croak_no_modify(); 12286 } 12287 12288 I32 12289 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, 12290 const I32 paren) 12291 { 12292 struct regexp *const rx = ReANY(r); 12293 I32 i,j; 12294 I32 s1, t1; 12295 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens; 12296 12297 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; 12298 12299 if ( paren == RX_BUFF_IDX_CARET_PREMATCH 12300 || paren == RX_BUFF_IDX_CARET_FULLMATCH 12301 || paren == RX_BUFF_IDX_CARET_POSTMATCH 12302 ) 12303 { 12304 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 12305 if (!keepcopy) { 12306 /* on something like 12307 * $r = qr/.../; 12308 * /$qr/p; 12309 * the KEEPCOPY is set on the PMOP rather than the regex */ 12310 if (PL_curpm && r == PM_GETRE(PL_curpm)) 12311 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 12312 } 12313 if (!keepcopy) 12314 goto warn_undef; 12315 } 12316 12317 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 12318 switch (paren) { 12319 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ 12320 case RX_BUFF_IDX_PREMATCH: /* $` */ 12321 if ( (i = RXp_OFFS_START(rx,0)) != -1) { 12322 if (i > 0) { 12323 s1 = 0; 12324 t1 = i; 12325 goto getlen; 12326 } 12327 } 12328 return 0; 12329 12330 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ 12331 case RX_BUFF_IDX_POSTMATCH: /* $' */ 12332 if ( (j = RXp_OFFS_END(rx,0)) != -1 ) { 12333 i = RXp_SUBLEN(rx) - j; 12334 if (i > 0) { 12335 s1 = j; 12336 t1 = RXp_SUBLEN(rx); 12337 goto getlen; 12338 } 12339 } 12340 return 0; 12341 12342 default: /* $& / ${^MATCH}, $1, $2, ... */ 12343 if (paren <= logical_nparens) { 12344 I32 true_paren = rx->logical_to_parno 12345 ? rx->logical_to_parno[paren] 12346 : paren; 12347 do { 12348 if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) && 12349 ((t1 = RXp_OFFS_END(rx,true_paren)) != -1)) 12350 { 12351 i = t1 - s1; 12352 goto getlen; 12353 } else if (rx->parno_to_logical_next) { 12354 true_paren = rx->parno_to_logical_next[true_paren]; 12355 } else { 12356 break; 12357 } 12358 } while(true_paren); 12359 } 12360 warn_undef: 12361 if (ckWARN(WARN_UNINITIALIZED)) 12362 report_uninit((const SV *)sv); 12363 return 0; 12364 } 12365 getlen: 12366 if (i > 0 && RXp_MATCH_UTF8(rx)) { 12367 const char * const s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + s1; 12368 const U8 *ep; 12369 STRLEN el; 12370 12371 i = t1 - s1; 12372 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 12373 i = el; 12374 } 12375 return i; 12376 } 12377 12378 /* 12379 * ex: set ts=8 sts=4 sw=4 et: 12380 */ 12381