1 /* regexec.c 2 */ 3 4 /* 5 * One Ring to rule them all, One Ring to find them 6 * 7 * [p.v of _The Lord of the Rings_, opening poem] 8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] 9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] 10 */ 11 12 /* This file contains functions for executing a regular expression. See 13 * also regcomp.c which funnily enough, contains functions for compiling 14 * a regular expression. 15 * 16 * This file is also copied at build time to ext/re/re_exec.c, where 17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 18 * This causes the main functions to be compiled under new names and with 19 * debugging support added, which makes "use re 'debug'" work. 20 */ 21 22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 23 * confused with the original package (see point 3 below). Thanks, Henry! 24 */ 25 26 /* Additional note: this code is very heavily munged from Henry's version 27 * in places. In some spots I've traded clarity for efficiency, so don't 28 * blame Henry for some of the lack of readability. 29 */ 30 31 /* The names of the functions have been changed from regcomp and 32 * regexec to pregcomp and pregexec in order to avoid conflicts 33 * with the POSIX routines of the same names. 34 */ 35 36 #ifdef PERL_EXT_RE_BUILD 37 #include "re_top.h" 38 #endif 39 40 /* 41 * pregcomp and pregexec -- regsub and regerror are not used in perl 42 * 43 * Copyright (c) 1986 by University of Toronto. 44 * Written by Henry Spencer. Not derived from licensed software. 45 * 46 * Permission is granted to anyone to use this software for any 47 * purpose on any computer system, and to redistribute it freely, 48 * subject to the following restrictions: 49 * 50 * 1. The author is not responsible for the consequences of use of 51 * this software, no matter how awful, even if they arise 52 * from defects in it. 53 * 54 * 2. The origin of this software must not be misrepresented, either 55 * by explicit claim or by omission. 56 * 57 * 3. Altered versions must be plainly marked as such, and must not 58 * be misrepresented as being the original software. 59 * 60 **** Alterations to Henry's code are... 61 **** 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 64 **** by Larry Wall and others 65 **** 66 **** You may distribute under the terms of either the GNU General Public 67 **** License or the Artistic License, as specified in the README file. 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGEXEC_C 75 #include "perl.h" 76 77 #ifdef PERL_IN_XSUB_RE 78 # include "re_comp.h" 79 #else 80 # include "regcomp.h" 81 #endif 82 83 #include "invlist_inline.h" 84 #include "unicode_constants.h" 85 86 static const char b_utf8_locale_required[] = 87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong." 88 " Assuming a UTF-8 locale"; 89 90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \ 91 STMT_START { \ 92 if (! IN_UTF8_CTYPE_LOCALE) { \ 93 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ 94 b_utf8_locale_required); \ 95 } \ 96 } STMT_END 97 98 static const char sets_utf8_locale_required[] = 99 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; 100 101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \ 102 STMT_START { \ 103 if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \ 104 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ 105 sets_utf8_locale_required); \ 106 } \ 107 } STMT_END 108 109 #ifdef DEBUGGING 110 /* At least one required character in the target string is expressible only in 111 * UTF-8. */ 112 static const char non_utf8_target_but_utf8_required[] 113 = "Can't match, because target string needs to be in UTF-8\n"; 114 #endif 115 116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ 117 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\ 118 goto target; \ 119 } STMT_END 120 121 #ifndef STATIC 122 #define STATIC static 123 #endif 124 125 /* 126 * Forwards. 127 */ 128 129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) 130 131 #define HOPc(pos,off) \ 132 (char *)(reginfo->is_utf8_target \ 133 ? reghop3((U8*)pos, off, \ 134 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ 135 : (U8*)(pos + off)) 136 137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */ 138 #define HOPBACK3(pos, off, lim) \ 139 (reginfo->is_utf8_target \ 140 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \ 141 : (pos - off >= lim) \ 142 ? (U8*)pos - off \ 143 : NULL) 144 145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg)) 146 147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) 148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 149 150 /* lim must be +ve. Returns NULL on overshoot */ 151 #define HOPMAYBE3(pos,off,lim) \ 152 (reginfo->is_utf8_target \ 153 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ 154 : ((U8*)pos + off <= lim) \ 155 ? (U8*)pos + off \ 156 : NULL) 157 158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case. 159 * off must be >=0; args should be vars rather than expressions */ 160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ 161 ? reghop3((U8*)(pos), off, (U8*)(lim)) \ 162 : (U8*)((pos + off) > lim ? lim : (pos + off))) 163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim)) 164 165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ 166 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ 167 : (U8*)(pos + off)) 168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) 169 170 #define PLACEHOLDER /* Something for the preprocessor to grab onto */ 171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ 172 173 /* for use after a quantifier and before an EXACT-like node -- japhy */ 174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh 175 * 176 * NOTE that *nothing* that affects backtracking should be in here, specifically 177 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a 178 * node that is in between two EXACT like nodes when ascertaining what the required 179 * "follow" character is. This should probably be moved to regex compile time 180 * although it may be done at run time beause of the REF possibility - more 181 * investigation required. -- demerphq 182 */ 183 #define JUMPABLE(rn) ( \ 184 OP(rn) == OPEN || \ 185 (OP(rn) == CLOSE && \ 186 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \ 187 OP(rn) == EVAL || \ 188 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 189 OP(rn) == PLUS || OP(rn) == MINMOD || \ 190 OP(rn) == KEEPS || \ 191 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ 192 ) 193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) 194 195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) 196 197 /* 198 Search for mandatory following text node; for lookahead, the text must 199 follow but for lookbehind (rn->flags != 0) we skip to the next step. 200 */ 201 #define FIND_NEXT_IMPT(rn) STMT_START { \ 202 while (JUMPABLE(rn)) { \ 203 const OPCODE type = OP(rn); \ 204 if (type == SUSPEND || PL_regkind[type] == CURLY) \ 205 rn = NEXTOPER(NEXTOPER(rn)); \ 206 else if (type == PLUS) \ 207 rn = NEXTOPER(rn); \ 208 else if (type == IFMATCH) \ 209 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ 210 else rn += NEXT_OFF(rn); \ 211 } \ 212 } STMT_END 213 214 #define SLAB_FIRST(s) (&(s)->states[0]) 215 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) 216 217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); 218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); 219 static regmatch_state * S_push_slab(pTHX); 220 221 #define REGCP_PAREN_ELEMS 3 222 #define REGCP_OTHER_ELEMS 3 223 #define REGCP_FRAME_ELEMS 1 224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and 225 * are needed for the regexp context stack bookkeeping. */ 226 227 STATIC CHECKPOINT 228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) 229 { 230 const int retval = PL_savestack_ix; 231 const int paren_elems_to_push = 232 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; 233 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; 234 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; 235 I32 p; 236 DECLARE_AND_GET_RE_DEBUG_FLAGS; 237 238 PERL_ARGS_ASSERT_REGCPPUSH; 239 240 if (paren_elems_to_push < 0) 241 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", 242 (int)paren_elems_to_push, (int)maxopenparen, 243 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); 244 245 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) 246 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf 247 " out of range (%lu-%ld)", 248 total_elems, 249 (unsigned long)maxopenparen, 250 (long)parenfloor); 251 252 SSGROW(total_elems + REGCP_FRAME_ELEMS); 253 254 DEBUG_BUFFERS_r( 255 if ((int)maxopenparen > (int)parenfloor) 256 Perl_re_exec_indentf( aTHX_ 257 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n", 258 depth, 259 PTR2UV(rex), 260 PTR2UV(rex->offs) 261 ); 262 ); 263 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { 264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ 265 SSPUSHIV(rex->offs[p].end); 266 SSPUSHIV(rex->offs[p].start); 267 SSPUSHINT(rex->offs[p].start_tmp); 268 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 269 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n", 270 depth, 271 (UV)p, 272 (IV)rex->offs[p].start, 273 (IV)rex->offs[p].start_tmp, 274 (IV)rex->offs[p].end 275 )); 276 } 277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 278 SSPUSHINT(maxopenparen); 279 SSPUSHINT(rex->lastparen); 280 SSPUSHINT(rex->lastcloseparen); 281 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ 282 283 return retval; 284 } 285 286 /* These are needed since we do not localize EVAL nodes: */ 287 #define REGCP_SET(cp) \ 288 DEBUG_STATE_r( \ 289 Perl_re_exec_indentf( aTHX_ \ 290 "Setting an EVAL scope, savestack=%" IVdf ",\n", \ 291 depth, (IV)PL_savestack_ix \ 292 ) \ 293 ); \ 294 cp = PL_savestack_ix 295 296 #define REGCP_UNWIND(cp) \ 297 DEBUG_STATE_r( \ 298 if (cp != PL_savestack_ix) \ 299 Perl_re_exec_indentf( aTHX_ \ 300 "Clearing an EVAL scope, savestack=%" \ 301 IVdf "..%" IVdf "\n", \ 302 depth, (IV)(cp), (IV)PL_savestack_ix \ 303 ) \ 304 ); \ 305 regcpblow(cp) 306 307 /* set the start and end positions of capture ix */ 308 #define CLOSE_CAPTURE(ix, s, e) \ 309 rex->offs[ix].start = s; \ 310 rex->offs[ix].end = e; \ 311 if (ix > rex->lastparen) \ 312 rex->lastparen = ix; \ 313 rex->lastcloseparen = ix; \ 314 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ 315 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \ 316 depth, \ 317 PTR2UV(rex), \ 318 PTR2UV(rex->offs), \ 319 (UV)ix, \ 320 (IV)rex->offs[ix].start, \ 321 (IV)rex->offs[ix].end, \ 322 (UV)rex->lastparen \ 323 )) 324 325 #define UNWIND_PAREN(lp, lcp) \ 326 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ 327 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \ 328 depth, \ 329 PTR2UV(rex), \ 330 PTR2UV(rex->offs), \ 331 (UV)(lp), \ 332 (UV)(rex->lastparen), \ 333 (UV)(lcp) \ 334 )); \ 335 for (n = rex->lastparen; n > lp; n--) \ 336 rex->offs[n].end = -1; \ 337 rex->lastparen = n; \ 338 rex->lastcloseparen = lcp; 339 340 341 STATIC void 342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) 343 { 344 UV i; 345 U32 paren; 346 DECLARE_AND_GET_RE_DEBUG_FLAGS; 347 348 PERL_ARGS_ASSERT_REGCPPOP; 349 350 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 351 i = SSPOPUV; 352 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 353 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ 354 rex->lastcloseparen = SSPOPINT; 355 rex->lastparen = SSPOPINT; 356 *maxopenparen_p = SSPOPINT; 357 358 i -= REGCP_OTHER_ELEMS; 359 /* Now restore the parentheses context. */ 360 DEBUG_BUFFERS_r( 361 if (i || rex->lastparen + 1 <= rex->nparens) 362 Perl_re_exec_indentf( aTHX_ 363 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n", 364 depth, 365 PTR2UV(rex), 366 PTR2UV(rex->offs) 367 ); 368 ); 369 paren = *maxopenparen_p; 370 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { 371 SSize_t tmps; 372 rex->offs[paren].start_tmp = SSPOPINT; 373 rex->offs[paren].start = SSPOPIV; 374 tmps = SSPOPIV; 375 if (paren <= rex->lastparen) 376 rex->offs[paren].end = tmps; 377 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_ 378 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n", 379 depth, 380 (UV)paren, 381 (IV)rex->offs[paren].start, 382 (IV)rex->offs[paren].start_tmp, 383 (IV)rex->offs[paren].end, 384 (paren > rex->lastparen ? "(skipped)" : "")); 385 ); 386 paren--; 387 } 388 #if 1 389 /* It would seem that the similar code in regtry() 390 * already takes care of this, and in fact it is in 391 * a better location to since this code can #if 0-ed out 392 * but the code in regtry() is needed or otherwise tests 393 * requiring null fields (pat.t#187 and split.t#{13,14} 394 * (as of patchlevel 7877) will fail. Then again, 395 * this code seems to be necessary or otherwise 396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 397 * --jhi updated by dapm */ 398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) { 399 if (i > *maxopenparen_p) 400 rex->offs[i].start = -1; 401 rex->offs[i].end = -1; 402 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_ 403 " \\%" UVuf ": %s ..-1 undeffing\n", 404 depth, 405 (UV)i, 406 (i > *maxopenparen_p) ? "-1" : " " 407 )); 408 } 409 #endif 410 } 411 412 /* restore the parens and associated vars at savestack position ix, 413 * but without popping the stack */ 414 415 STATIC void 416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH) 417 { 418 I32 tmpix = PL_savestack_ix; 419 PERL_ARGS_ASSERT_REGCP_RESTORE; 420 421 PL_savestack_ix = ix; 422 regcppop(rex, maxopenparen_p); 423 PL_savestack_ix = tmpix; 424 } 425 426 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 427 428 STATIC bool 429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) 430 { 431 /* Returns a boolean as to whether or not 'character' is a member of the 432 * Posix character class given by 'classnum' that should be equivalent to a 433 * value in the typedef '_char_class_number'. 434 * 435 * Ideally this could be replaced by a just an array of function pointers 436 * to the C library functions that implement the macros this calls. 437 * However, to compile, the precise function signatures are required, and 438 * these may vary from platform to platform. To avoid having to figure 439 * out what those all are on each platform, I (khw) am using this method, 440 * which adds an extra layer of function call overhead (unless the C 441 * optimizer strips it away). But we don't particularly care about 442 * performance with locales anyway. */ 443 444 switch ((_char_class_number) classnum) { 445 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); 446 case _CC_ENUM_ALPHA: return isALPHA_LC(character); 447 case _CC_ENUM_ASCII: return isASCII_LC(character); 448 case _CC_ENUM_BLANK: return isBLANK_LC(character); 449 case _CC_ENUM_CASED: return isLOWER_LC(character) 450 || isUPPER_LC(character); 451 case _CC_ENUM_CNTRL: return isCNTRL_LC(character); 452 case _CC_ENUM_DIGIT: return isDIGIT_LC(character); 453 case _CC_ENUM_GRAPH: return isGRAPH_LC(character); 454 case _CC_ENUM_LOWER: return isLOWER_LC(character); 455 case _CC_ENUM_PRINT: return isPRINT_LC(character); 456 case _CC_ENUM_PUNCT: return isPUNCT_LC(character); 457 case _CC_ENUM_SPACE: return isSPACE_LC(character); 458 case _CC_ENUM_UPPER: return isUPPER_LC(character); 459 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); 460 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); 461 default: /* VERTSPACE should never occur in locales */ 462 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); 463 } 464 465 NOT_REACHED; /* NOTREACHED */ 466 return FALSE; 467 } 468 469 PERL_STATIC_INLINE I32 470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len) 471 { 472 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be 473 * folded. Works on all folds representable without UTF-8, except for 474 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it 475 * check that the strings each have at least 'len' characters. 476 * 477 * There is almost an identical API function where s2 need not be folded: 478 * Perl_foldEQ_latin1() */ 479 480 const U8 *a = (const U8 *)s1; 481 const U8 *b = (const U8 *)s2; 482 483 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED; 484 485 assert(len >= 0); 486 487 while (len--) { 488 assert(! isUPPER_L1(*b)); 489 if (toLOWER_L1(*a) != *b) { 490 return 0; 491 } 492 a++, b++; 493 } 494 return 1; 495 } 496 497 STATIC bool 498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e) 499 { 500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded 501 * 'character' is a member of the Posix character class given by 'classnum' 502 * that should be equivalent to a value in the typedef 503 * '_char_class_number'. 504 * 505 * This just calls isFOO_lc on the code point for the character if it is in 506 * the range 0-255. Outside that range, all characters use Unicode 507 * rules, ignoring any locale. So use the Unicode function if this class 508 * requires an inversion list, and use the Unicode macro otherwise. */ 509 510 511 PERL_ARGS_ASSERT_ISFOO_UTF8_LC; 512 513 if (UTF8_IS_INVARIANT(*character)) { 514 return isFOO_lc(classnum, *character); 515 } 516 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { 517 return isFOO_lc(classnum, 518 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1))); 519 } 520 521 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e); 522 523 switch ((_char_class_number) classnum) { 524 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character); 525 case _CC_ENUM_BLANK: return is_HORIZWS_high(character); 526 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); 527 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); 528 default: 529 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], 530 utf8_to_uvchr_buf(character, e, NULL)); 531 } 532 533 return FALSE; /* Things like CNTRL are always below 256 */ 534 } 535 536 STATIC U8 * 537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte) 538 { 539 /* Returns the position of the first byte in the sequence between 's' and 540 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found. 541 * */ 542 543 PERL_ARGS_ASSERT_FIND_SPAN_END; 544 545 assert(send >= s); 546 547 if ((STRLEN) (send - s) >= PERL_WORDSIZE 548 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 549 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 550 { 551 PERL_UINTMAX_T span_word; 552 553 /* Process per-byte until reach word boundary. XXX This loop could be 554 * eliminated if we knew that this platform had fast unaligned reads */ 555 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 556 if (*s != span_byte) { 557 return s; 558 } 559 s++; 560 } 561 562 /* Create a word filled with the bytes we are spanning */ 563 span_word = PERL_COUNT_MULTIPLIER * span_byte; 564 565 /* Process per-word as long as we have at least a full word left */ 566 do { 567 568 /* Keep going if the whole word is composed of 'span_byte's */ 569 if ((* (PERL_UINTMAX_T *) s) == span_word) { 570 s += PERL_WORDSIZE; 571 continue; 572 } 573 574 /* Here, at least one byte in the word isn't 'span_byte'. */ 575 576 #ifdef EBCDIC 577 578 break; 579 580 #else 581 582 /* This xor leaves 1 bits only in those non-matching bytes */ 583 span_word ^= * (PERL_UINTMAX_T *) s; 584 585 /* Make sure the upper bit of each non-matching byte is set. This 586 * makes each such byte look like an ASCII platform variant byte */ 587 span_word |= span_word << 1; 588 span_word |= span_word << 2; 589 span_word |= span_word << 4; 590 591 /* That reduces the problem to what this function solves */ 592 return s + variant_byte_number(span_word); 593 594 #endif 595 596 } while (s + PERL_WORDSIZE <= send); 597 } 598 599 /* Process the straggler bytes beyond the final word boundary */ 600 while (s < send) { 601 if (*s != span_byte) { 602 return s; 603 } 604 s++; 605 } 606 607 return s; 608 } 609 610 STATIC U8 * 611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask) 612 { 613 /* Returns the position of the first byte in the sequence between 's' 614 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte'; 615 * returns 'send' if none found. It uses word-level operations instead of 616 * byte to speed up the process */ 617 618 PERL_ARGS_ASSERT_FIND_NEXT_MASKED; 619 620 assert(send >= s); 621 assert((byte & mask) == byte); 622 623 #ifndef EBCDIC 624 625 if ((STRLEN) (send - s) >= PERL_WORDSIZE 626 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 627 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 628 { 629 PERL_UINTMAX_T word, mask_word; 630 631 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 632 if (((*s) & mask) == byte) { 633 return s; 634 } 635 s++; 636 } 637 638 word = PERL_COUNT_MULTIPLIER * byte; 639 mask_word = PERL_COUNT_MULTIPLIER * mask; 640 641 do { 642 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 643 644 /* If 'masked' contains bytes with the bit pattern of 'byte' within 645 * it, xoring with 'word' will leave each of the 8 bits in such 646 * bytes be 0, and no byte containing any other bit pattern will be 647 * 0. */ 648 masked ^= word; 649 650 /* This causes the most significant bit to be set to 1 for any 651 * bytes in the word that aren't completely 0 */ 652 masked |= masked << 1; 653 masked |= masked << 2; 654 masked |= masked << 4; 655 656 /* The msbits are the same as what marks a byte as variant, so we 657 * can use this mask. If all msbits are 1, the word doesn't 658 * contain 'byte' */ 659 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) { 660 s += PERL_WORDSIZE; 661 continue; 662 } 663 664 /* Here, the msbit of bytes in the word that aren't 'byte' are 1, 665 * and any that are, are 0. Complement and re-AND to swap that */ 666 masked = ~ masked; 667 masked &= PERL_VARIANTS_WORD_MASK; 668 669 /* This reduces the problem to that solved by this function */ 670 s += variant_byte_number(masked); 671 return s; 672 673 } while (s + PERL_WORDSIZE <= send); 674 } 675 676 #endif 677 678 while (s < send) { 679 if (((*s) & mask) == byte) { 680 return s; 681 } 682 s++; 683 } 684 685 return s; 686 } 687 688 STATIC U8 * 689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) 690 { 691 /* Returns the position of the first byte in the sequence between 's' and 692 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'. 693 * 'span_byte' should have been ANDed with 'mask' in the call of this 694 * function. Returns 'send' if none found. Works like find_span_end(), 695 * except for the AND */ 696 697 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK; 698 699 assert(send >= s); 700 assert((span_byte & mask) == span_byte); 701 702 if ((STRLEN) (send - s) >= PERL_WORDSIZE 703 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 704 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 705 { 706 PERL_UINTMAX_T span_word, mask_word; 707 708 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 709 if (((*s) & mask) != span_byte) { 710 return s; 711 } 712 s++; 713 } 714 715 span_word = PERL_COUNT_MULTIPLIER * span_byte; 716 mask_word = PERL_COUNT_MULTIPLIER * mask; 717 718 do { 719 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 720 721 if (masked == span_word) { 722 s += PERL_WORDSIZE; 723 continue; 724 } 725 726 #ifdef EBCDIC 727 728 break; 729 730 #else 731 732 masked ^= span_word; 733 masked |= masked << 1; 734 masked |= masked << 2; 735 masked |= masked << 4; 736 return s + variant_byte_number(masked); 737 738 #endif 739 740 } while (s + PERL_WORDSIZE <= send); 741 } 742 743 while (s < send) { 744 if (((*s) & mask) != span_byte) { 745 return s; 746 } 747 s++; 748 } 749 750 return s; 751 } 752 753 /* 754 * pregexec and friends 755 */ 756 757 #ifndef PERL_IN_XSUB_RE 758 /* 759 - pregexec - match a regexp against a string 760 */ 761 I32 762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, 763 char *strbeg, SSize_t minend, SV *screamer, U32 nosave) 764 /* stringarg: the point in the string at which to begin matching */ 765 /* strend: pointer to null at end of string */ 766 /* strbeg: real beginning of string */ 767 /* minend: end of match must be >= minend bytes after stringarg. */ 768 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string 769 * itself is accessed via the pointers above */ 770 /* nosave: For optimizations. */ 771 { 772 PERL_ARGS_ASSERT_PREGEXEC; 773 774 return 775 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 776 nosave ? 0 : REXEC_COPY_STR); 777 } 778 #endif 779 780 781 782 /* re_intuit_start(): 783 * 784 * Based on some optimiser hints, try to find the earliest position in the 785 * string where the regex could match. 786 * 787 * rx: the regex to match against 788 * sv: the SV being matched: only used for utf8 flag; the string 789 * itself is accessed via the pointers below. Note that on 790 * something like an overloaded SV, SvPOK(sv) may be false 791 * and the string pointers may point to something unrelated to 792 * the SV itself. 793 * strbeg: real beginning of string 794 * strpos: the point in the string at which to begin matching 795 * strend: pointer to the byte following the last char of the string 796 * flags currently unused; set to 0 797 * data: currently unused; set to NULL 798 * 799 * The basic idea of re_intuit_start() is to use some known information 800 * about the pattern, namely: 801 * 802 * a) the longest known anchored substring (i.e. one that's at a 803 * constant offset from the beginning of the pattern; but not 804 * necessarily at a fixed offset from the beginning of the 805 * string); 806 * b) the longest floating substring (i.e. one that's not at a constant 807 * offset from the beginning of the pattern); 808 * c) Whether the pattern is anchored to the string; either 809 * an absolute anchor: /^../, or anchored to \n: /^.../m, 810 * or anchored to pos(): /\G/; 811 * d) A start class: a real or synthetic character class which 812 * represents which characters are legal at the start of the pattern; 813 * 814 * to either quickly reject the match, or to find the earliest position 815 * within the string at which the pattern might match, thus avoiding 816 * running the full NFA engine at those earlier locations, only to 817 * eventually fail and retry further along. 818 * 819 * Returns NULL if the pattern can't match, or returns the address within 820 * the string which is the earliest place the match could occur. 821 * 822 * The longest of the anchored and floating substrings is called 'check' 823 * and is checked first. The other is called 'other' and is checked 824 * second. The 'other' substring may not be present. For example, 825 * 826 * /(abc|xyz)ABC\d{0,3}DEFG/ 827 * 828 * will have 829 * 830 * check substr (float) = "DEFG", offset 6..9 chars 831 * other substr (anchored) = "ABC", offset 3..3 chars 832 * stclass = [ax] 833 * 834 * Be aware that during the course of this function, sometimes 'anchored' 835 * refers to a substring being anchored relative to the start of the 836 * pattern, and sometimes to the pattern itself being anchored relative to 837 * the string. For example: 838 * 839 * /\dabc/: "abc" is anchored to the pattern; 840 * /^\dabc/: "abc" is anchored to the pattern and the string; 841 * /\d+abc/: "abc" is anchored to neither the pattern nor the string; 842 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, 843 * but the pattern is anchored to the string. 844 */ 845 846 char * 847 Perl_re_intuit_start(pTHX_ 848 REGEXP * const rx, 849 SV *sv, 850 const char * const strbeg, 851 char *strpos, 852 char *strend, 853 const U32 flags, 854 re_scream_pos_data *data) 855 { 856 struct regexp *const prog = ReANY(rx); 857 SSize_t start_shift = prog->check_offset_min; 858 /* Should be nonnegative! */ 859 SSize_t end_shift = 0; 860 /* current lowest pos in string where the regex can start matching */ 861 char *rx_origin = strpos; 862 SV *check; 863 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ 864 U8 other_ix = 1 - prog->substrs->check_ix; 865 bool ml_anch = 0; 866 char *other_last = strpos;/* latest pos 'other' substr already checked to */ 867 char *check_at = NULL; /* check substr found at this pos */ 868 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; 869 RXi_GET_DECL(prog,progi); 870 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ 871 regmatch_info *const reginfo = ®info_buf; 872 DECLARE_AND_GET_RE_DEBUG_FLAGS; 873 874 PERL_ARGS_ASSERT_RE_INTUIT_START; 875 PERL_UNUSED_ARG(flags); 876 PERL_UNUSED_ARG(data); 877 878 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 879 "Intuit: trying to determine minimum start position...\n")); 880 881 /* for now, assume that all substr offsets are positive. If at some point 882 * in the future someone wants to do clever things with lookbehind and 883 * -ve offsets, they'll need to fix up any code in this function 884 * which uses these offsets. See the thread beginning 885 * <20140113145929.GF27210@iabyn.com> 886 */ 887 assert(prog->substrs->data[0].min_offset >= 0); 888 assert(prog->substrs->data[0].max_offset >= 0); 889 assert(prog->substrs->data[1].min_offset >= 0); 890 assert(prog->substrs->data[1].max_offset >= 0); 891 assert(prog->substrs->data[2].min_offset >= 0); 892 assert(prog->substrs->data[2].max_offset >= 0); 893 894 /* for now, assume that if both present, that the floating substring 895 * doesn't start before the anchored substring. 896 * If you break this assumption (e.g. doing better optimisations 897 * with lookahead/behind), then you'll need to audit the code in this 898 * function carefully first 899 */ 900 assert( 901 ! ( (prog->anchored_utf8 || prog->anchored_substr) 902 && (prog->float_utf8 || prog->float_substr)) 903 || (prog->float_min_offset >= prog->anchored_offset)); 904 905 /* byte rather than char calculation for efficiency. It fails 906 * to quickly reject some cases that can't match, but will reject 907 * them later after doing full char arithmetic */ 908 if (prog->minlen > strend - strpos) { 909 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 910 " String too short...\n")); 911 goto fail; 912 } 913 914 RXp_MATCH_UTF8_set(prog, utf8_target); 915 reginfo->is_utf8_target = cBOOL(utf8_target); 916 reginfo->info_aux = NULL; 917 reginfo->strbeg = strbeg; 918 reginfo->strend = strend; 919 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 920 reginfo->intuit = 1; 921 /* not actually used within intuit, but zero for safety anyway */ 922 reginfo->poscache_maxiter = 0; 923 924 if (utf8_target) { 925 if ((!prog->anchored_utf8 && prog->anchored_substr) 926 || (!prog->float_utf8 && prog->float_substr)) 927 to_utf8_substr(prog); 928 check = prog->check_utf8; 929 } else { 930 if (!prog->check_substr && prog->check_utf8) { 931 if (! to_byte_substr(prog)) { 932 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 933 } 934 } 935 check = prog->check_substr; 936 } 937 938 /* dump the various substring data */ 939 DEBUG_OPTIMISE_MORE_r({ 940 int i; 941 for (i=0; i<=2; i++) { 942 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr 943 : prog->substrs->data[i].substr); 944 if (!sv) 945 continue; 946 947 Perl_re_printf( aTHX_ 948 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf 949 " useful=%" IVdf " utf8=%d [%s]\n", 950 i, 951 (IV)prog->substrs->data[i].min_offset, 952 (IV)prog->substrs->data[i].max_offset, 953 (IV)prog->substrs->data[i].end_shift, 954 BmUSEFUL(sv), 955 utf8_target ? 1 : 0, 956 SvPEEK(sv)); 957 } 958 }); 959 960 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ 961 962 /* ml_anch: check after \n? 963 * 964 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning 965 * with /.*.../, these flags will have been added by the 966 * compiler: 967 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL 968 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL 969 */ 970 ml_anch = (prog->intflags & PREGf_ANCH_MBOL) 971 && !(prog->intflags & PREGf_IMPLICIT); 972 973 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { 974 /* we are only allowed to match at BOS or \G */ 975 976 /* trivially reject if there's a BOS anchor and we're not at BOS. 977 * 978 * Note that we don't try to do a similar quick reject for 979 * \G, since generally the caller will have calculated strpos 980 * based on pos() and gofs, so the string is already correctly 981 * anchored by definition; and handling the exceptions would 982 * be too fiddly (e.g. REXEC_IGNOREPOS). 983 */ 984 if ( strpos != strbeg 985 && (prog->intflags & PREGf_ANCH_SBOL)) 986 { 987 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 988 " Not at start...\n")); 989 goto fail; 990 } 991 992 /* in the presence of an anchor, the anchored (relative to the 993 * start of the regex) substr must also be anchored relative 994 * to strpos. So quickly reject if substr isn't found there. 995 * This works for \G too, because the caller will already have 996 * subtracted gofs from pos, and gofs is the offset from the 997 * \G to the start of the regex. For example, in /.abc\Gdef/, 998 * where substr="abcdef", pos()=3, gofs=4, offset_min=1: 999 * caller will have set strpos=pos()-4; we look for the substr 1000 * at position pos()-4+1, which lines up with the "a" */ 1001 1002 if (prog->check_offset_min == prog->check_offset_max) { 1003 /* Substring at constant offset from beg-of-str... */ 1004 SSize_t slen = SvCUR(check); 1005 char *s = HOP3c(strpos, prog->check_offset_min, strend); 1006 1007 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1008 " Looking for check substr at fixed offset %" IVdf "...\n", 1009 (IV)prog->check_offset_min)); 1010 1011 if (SvTAIL(check)) { 1012 /* In this case, the regex is anchored at the end too. 1013 * Unless it's a multiline match, the lengths must match 1014 * exactly, give or take a \n. NB: slen >= 1 since 1015 * the last char of check is \n */ 1016 if (!multiline 1017 && ( strend - s > slen 1018 || strend - s < slen - 1 1019 || (strend - s == slen && strend[-1] != '\n'))) 1020 { 1021 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1022 " String too long...\n")); 1023 goto fail_finish; 1024 } 1025 /* Now should match s[0..slen-2] */ 1026 slen--; 1027 } 1028 if (slen && (strend - s < slen 1029 || *SvPVX_const(check) != *s 1030 || (slen > 1 && (memNE(SvPVX_const(check), s, slen))))) 1031 { 1032 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1033 " String not equal...\n")); 1034 goto fail_finish; 1035 } 1036 1037 check_at = s; 1038 goto success_at_start; 1039 } 1040 } 1041 } 1042 1043 end_shift = prog->check_end_shift; 1044 1045 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ 1046 if (end_shift < 0) 1047 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ", 1048 (IV)end_shift, RX_PRECOMP(rx)); 1049 #endif 1050 1051 restart: 1052 1053 /* This is the (re)entry point of the main loop in this function. 1054 * The goal of this loop is to: 1055 * 1) find the "check" substring in the region rx_origin..strend 1056 * (adjusted by start_shift / end_shift). If not found, reject 1057 * immediately. 1058 * 2) If it exists, look for the "other" substr too if defined; for 1059 * example, if the check substr maps to the anchored substr, then 1060 * check the floating substr, and vice-versa. If not found, go 1061 * back to (1) with rx_origin suitably incremented. 1062 * 3) If we find an rx_origin position that doesn't contradict 1063 * either of the substrings, then check the possible additional 1064 * constraints on rx_origin of /^.../m or a known start class. 1065 * If these fail, then depending on which constraints fail, jump 1066 * back to here, or to various other re-entry points further along 1067 * that skip some of the first steps. 1068 * 4) If we pass all those tests, update the BmUSEFUL() count on the 1069 * substring. If the start position was determined to be at the 1070 * beginning of the string - so, not rejected, but not optimised, 1071 * since we have to run regmatch from position 0 - decrement the 1072 * BmUSEFUL() count. Otherwise increment it. 1073 */ 1074 1075 1076 /* first, look for the 'check' substring */ 1077 1078 { 1079 U8* start_point; 1080 U8* end_point; 1081 1082 DEBUG_OPTIMISE_MORE_r({ 1083 Perl_re_printf( aTHX_ 1084 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf 1085 " Start shift: %" IVdf " End shift %" IVdf 1086 " Real end Shift: %" IVdf "\n", 1087 (IV)(rx_origin - strbeg), 1088 (IV)prog->check_offset_min, 1089 (IV)start_shift, 1090 (IV)end_shift, 1091 (IV)prog->check_end_shift); 1092 }); 1093 1094 end_point = HOPBACK3(strend, end_shift, rx_origin); 1095 if (!end_point) 1096 goto fail_finish; 1097 start_point = HOPMAYBE3(rx_origin, start_shift, end_point); 1098 if (!start_point) 1099 goto fail_finish; 1100 1101 1102 /* If the regex is absolutely anchored to either the start of the 1103 * string (SBOL) or to pos() (ANCH_GPOS), then 1104 * check_offset_max represents an upper bound on the string where 1105 * the substr could start. For the ANCH_GPOS case, we assume that 1106 * the caller of intuit will have already set strpos to 1107 * pos()-gofs, so in this case strpos + offset_max will still be 1108 * an upper bound on the substr. 1109 */ 1110 if (!ml_anch 1111 && prog->intflags & PREGf_ANCH 1112 && prog->check_offset_max != SSize_t_MAX) 1113 { 1114 SSize_t check_len = SvCUR(check) - !!SvTAIL(check); 1115 const char * const anchor = 1116 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); 1117 SSize_t targ_len = (char*)end_point - anchor; 1118 1119 if (check_len > targ_len) { 1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1121 "Target string too short to match required substring...\n")); 1122 goto fail_finish; 1123 } 1124 1125 /* do a bytes rather than chars comparison. It's conservative; 1126 * so it skips doing the HOP if the result can't possibly end 1127 * up earlier than the old value of end_point. 1128 */ 1129 assert(anchor + check_len <= (char *)end_point); 1130 if (prog->check_offset_max + check_len < targ_len) { 1131 end_point = HOP3lim((U8*)anchor, 1132 prog->check_offset_max, 1133 end_point - check_len 1134 ) 1135 + check_len; 1136 if (end_point < start_point) 1137 goto fail_finish; 1138 } 1139 } 1140 1141 check_at = fbm_instr( start_point, end_point, 1142 check, multiline ? FBMrf_MULTILINE : 0); 1143 1144 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1145 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1146 (IV)((char*)start_point - strbeg), 1147 (IV)((char*)end_point - strbeg), 1148 (IV)(check_at ? check_at - strbeg : -1) 1149 )); 1150 1151 /* Update the count-of-usability, remove useless subpatterns, 1152 unshift s. */ 1153 1154 DEBUG_EXECUTE_r({ 1155 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1156 SvPVX_const(check), RE_SV_DUMPLEN(check), 30); 1157 Perl_re_printf( aTHX_ " %s %s substr %s%s%s", 1158 (check_at ? "Found" : "Did not find"), 1159 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) 1160 ? "anchored" : "floating"), 1161 quoted, 1162 RE_SV_TAIL(check), 1163 (check_at ? " at offset " : "...\n") ); 1164 }); 1165 1166 if (!check_at) 1167 goto fail_finish; 1168 /* set rx_origin to the minimum position where the regex could start 1169 * matching, given the constraint of the just-matched check substring. 1170 * But don't set it lower than previously. 1171 */ 1172 1173 if (check_at - rx_origin > prog->check_offset_max) 1174 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); 1175 /* Finish the diagnostic message */ 1176 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1177 "%ld (rx_origin now %" IVdf ")...\n", 1178 (long)(check_at - strbeg), 1179 (IV)(rx_origin - strbeg) 1180 )); 1181 } 1182 1183 1184 /* now look for the 'other' substring if defined */ 1185 1186 if (prog->substrs->data[other_ix].utf8_substr 1187 || prog->substrs->data[other_ix].substr) 1188 { 1189 /* Take into account the "other" substring. */ 1190 char *last, *last1; 1191 char *s; 1192 SV* must; 1193 struct reg_substr_datum *other; 1194 1195 do_other_substr: 1196 other = &prog->substrs->data[other_ix]; 1197 if (!utf8_target && !other->substr) { 1198 if (!to_byte_substr(prog)) { 1199 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 1200 } 1201 } 1202 1203 /* if "other" is anchored: 1204 * we've previously found a floating substr starting at check_at. 1205 * This means that the regex origin must lie somewhere 1206 * between min (rx_origin): HOP3(check_at, -check_offset_max) 1207 * and max: HOP3(check_at, -check_offset_min) 1208 * (except that min will be >= strpos) 1209 * So the fixed substr must lie somewhere between 1210 * HOP3(min, anchored_offset) 1211 * HOP3(max, anchored_offset) + SvCUR(substr) 1212 */ 1213 1214 /* if "other" is floating 1215 * Calculate last1, the absolute latest point where the 1216 * floating substr could start in the string, ignoring any 1217 * constraints from the earlier fixed match. It is calculated 1218 * as follows: 1219 * 1220 * strend - prog->minlen (in chars) is the absolute latest 1221 * position within the string where the origin of the regex 1222 * could appear. The latest start point for the floating 1223 * substr is float_min_offset(*) on from the start of the 1224 * regex. last1 simply combines thee two offsets. 1225 * 1226 * (*) You might think the latest start point should be 1227 * float_max_offset from the regex origin, and technically 1228 * you'd be correct. However, consider 1229 * /a\d{2,4}bcd\w/ 1230 * Here, float min, max are 3,5 and minlen is 7. 1231 * This can match either 1232 * /a\d\dbcd\w/ 1233 * /a\d\d\dbcd\w/ 1234 * /a\d\d\d\dbcd\w/ 1235 * In the first case, the regex matches minlen chars; in the 1236 * second, minlen+1, in the third, minlen+2. 1237 * In the first case, the floating offset is 3 (which equals 1238 * float_min), in the second, 4, and in the third, 5 (which 1239 * equals float_max). In all cases, the floating string bcd 1240 * can never start more than 4 chars from the end of the 1241 * string, which equals minlen - float_min. As the substring 1242 * starts to match more than float_min from the start of the 1243 * regex, it makes the regex match more than minlen chars, 1244 * and the two cancel each other out. So we can always use 1245 * float_min - minlen, rather than float_max - minlen for the 1246 * latest position in the string. 1247 * 1248 * Note that -minlen + float_min_offset is equivalent (AFAIKT) 1249 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift 1250 */ 1251 1252 assert(prog->minlen >= other->min_offset); 1253 last1 = HOP3c(strend, 1254 other->min_offset - prog->minlen, strbeg); 1255 1256 if (other_ix) {/* i.e. if (other-is-float) */ 1257 /* last is the latest point where the floating substr could 1258 * start, *given* any constraints from the earlier fixed 1259 * match. This constraint is that the floating string starts 1260 * <= float_max_offset chars from the regex origin (rx_origin). 1261 * If this value is less than last1, use it instead. 1262 */ 1263 assert(rx_origin <= last1); 1264 last = 1265 /* this condition handles the offset==infinity case, and 1266 * is a short-cut otherwise. Although it's comparing a 1267 * byte offset to a char length, it does so in a safe way, 1268 * since 1 char always occupies 1 or more bytes, 1269 * so if a string range is (last1 - rx_origin) bytes, 1270 * it will be less than or equal to (last1 - rx_origin) 1271 * chars; meaning it errs towards doing the accurate HOP3 1272 * rather than just using last1 as a short-cut */ 1273 (last1 - rx_origin) < other->max_offset 1274 ? last1 1275 : (char*)HOP3lim(rx_origin, other->max_offset, last1); 1276 } 1277 else { 1278 assert(strpos + start_shift <= check_at); 1279 last = HOP4c(check_at, other->min_offset - start_shift, 1280 strbeg, strend); 1281 } 1282 1283 s = HOP3c(rx_origin, other->min_offset, strend); 1284 if (s < other_last) /* These positions already checked */ 1285 s = other_last; 1286 1287 must = utf8_target ? other->utf8_substr : other->substr; 1288 assert(SvPOK(must)); 1289 { 1290 char *from = s; 1291 char *to = last + SvCUR(must) - (SvTAIL(must)!=0); 1292 1293 if (to > strend) 1294 to = strend; 1295 if (from > to) { 1296 s = NULL; 1297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1298 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n", 1299 (IV)(from - strbeg), 1300 (IV)(to - strbeg) 1301 )); 1302 } 1303 else { 1304 s = fbm_instr( 1305 (unsigned char*)from, 1306 (unsigned char*)to, 1307 must, 1308 multiline ? FBMrf_MULTILINE : 0 1309 ); 1310 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1311 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1312 (IV)(from - strbeg), 1313 (IV)(to - strbeg), 1314 (IV)(s ? s - strbeg : -1) 1315 )); 1316 } 1317 } 1318 1319 DEBUG_EXECUTE_r({ 1320 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1321 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 1322 Perl_re_printf( aTHX_ " %s %s substr %s%s", 1323 s ? "Found" : "Contradicts", 1324 other_ix ? "floating" : "anchored", 1325 quoted, RE_SV_TAIL(must)); 1326 }); 1327 1328 1329 if (!s) { 1330 /* last1 is latest possible substr location. If we didn't 1331 * find it before there, we never will */ 1332 if (last >= last1) { 1333 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1334 "; giving up...\n")); 1335 goto fail_finish; 1336 } 1337 1338 /* try to find the check substr again at a later 1339 * position. Maybe next time we'll find the "other" substr 1340 * in range too */ 1341 other_last = HOP3c(last, 1, strend) /* highest failure */; 1342 rx_origin = 1343 other_ix /* i.e. if other-is-float */ 1344 ? HOP3c(rx_origin, 1, strend) 1345 : HOP4c(last, 1 - other->min_offset, strbeg, strend); 1346 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1347 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n", 1348 (other_ix ? "floating" : "anchored"), 1349 (long)(HOP3c(check_at, 1, strend) - strbeg), 1350 (IV)(rx_origin - strbeg) 1351 )); 1352 goto restart; 1353 } 1354 else { 1355 if (other_ix) { /* if (other-is-float) */ 1356 /* other_last is set to s, not s+1, since its possible for 1357 * a floating substr to fail first time, then succeed 1358 * second time at the same floating position; e.g.: 1359 * "-AB--AABZ" =~ /\wAB\d*Z/ 1360 * The first time round, anchored and float match at 1361 * "-(AB)--AAB(Z)" then fail on the initial \w character 1362 * class. Second time round, they match at "-AB--A(AB)(Z)". 1363 */ 1364 other_last = s; 1365 } 1366 else { 1367 rx_origin = HOP3c(s, -other->min_offset, strbeg); 1368 other_last = HOP3c(s, 1, strend); 1369 } 1370 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1371 " at offset %ld (rx_origin now %" IVdf ")...\n", 1372 (long)(s - strbeg), 1373 (IV)(rx_origin - strbeg) 1374 )); 1375 1376 } 1377 } 1378 else { 1379 DEBUG_OPTIMISE_MORE_r( 1380 Perl_re_printf( aTHX_ 1381 " Check-only match: offset min:%" IVdf " max:%" IVdf 1382 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf 1383 " strend:%" IVdf "\n", 1384 (IV)prog->check_offset_min, 1385 (IV)prog->check_offset_max, 1386 (IV)(check_at-strbeg), 1387 (IV)(rx_origin-strbeg), 1388 (IV)(rx_origin-check_at), 1389 (IV)(strend-strbeg) 1390 ) 1391 ); 1392 } 1393 1394 postprocess_substr_matches: 1395 1396 /* handle the extra constraint of /^.../m if present */ 1397 1398 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { 1399 char *s; 1400 1401 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1402 " looking for /^/m anchor")); 1403 1404 /* we have failed the constraint of a \n before rx_origin. 1405 * Find the next \n, if any, even if it's beyond the current 1406 * anchored and/or floating substrings. Whether we should be 1407 * scanning ahead for the next \n or the next substr is debatable. 1408 * On the one hand you'd expect rare substrings to appear less 1409 * often than \n's. On the other hand, searching for \n means 1410 * we're effectively flipping between check_substr and "\n" on each 1411 * iteration as the current "rarest" candidate string, which 1412 * means for example that we'll quickly reject the whole string if 1413 * hasn't got a \n, rather than trying every substr position 1414 * first 1415 */ 1416 1417 s = HOP3c(strend, - prog->minlen, strpos); 1418 if (s <= rx_origin || 1419 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) 1420 { 1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1422 " Did not find /%s^%s/m...\n", 1423 PL_colors[0], PL_colors[1])); 1424 goto fail_finish; 1425 } 1426 1427 /* earliest possible origin is 1 char after the \n. 1428 * (since *rx_origin == '\n', it's safe to ++ here rather than 1429 * HOP(rx_origin, 1)) */ 1430 rx_origin++; 1431 1432 if (prog->substrs->check_ix == 0 /* check is anchored */ 1433 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) 1434 { 1435 /* Position contradicts check-string; either because 1436 * check was anchored (and thus has no wiggle room), 1437 * or check was float and rx_origin is above the float range */ 1438 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1439 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n", 1440 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1441 goto restart; 1442 } 1443 1444 /* if we get here, the check substr must have been float, 1445 * is in range, and we may or may not have had an anchored 1446 * "other" substr which still contradicts */ 1447 assert(prog->substrs->check_ix); /* check is float */ 1448 1449 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { 1450 /* whoops, the anchored "other" substr exists, so we still 1451 * contradict. On the other hand, the float "check" substr 1452 * didn't contradict, so just retry the anchored "other" 1453 * substr */ 1454 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1455 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n", 1456 PL_colors[0], PL_colors[1], 1457 (IV)(rx_origin - strbeg + prog->anchored_offset), 1458 (IV)(rx_origin - strbeg) 1459 )); 1460 goto do_other_substr; 1461 } 1462 1463 /* success: we don't contradict the found floating substring 1464 * (and there's no anchored substr). */ 1465 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1466 " Found /%s^%s/m with rx_origin %ld...\n", 1467 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1468 } 1469 else { 1470 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1471 " (multiline anchor test skipped)\n")); 1472 } 1473 1474 success_at_start: 1475 1476 1477 /* if we have a starting character class, then test that extra constraint. 1478 * (trie stclasses are too expensive to use here, we are better off to 1479 * leave it to regmatch itself) */ 1480 1481 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { 1482 const U8* const str = (U8*)STRING(progi->regstclass); 1483 1484 /* XXX this value could be pre-computed */ 1485 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT 1486 ? (reginfo->is_utf8_pat 1487 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str) 1488 : (SSize_t)STR_LEN(progi->regstclass)) 1489 : 1); 1490 char * endpos; 1491 char *s; 1492 /* latest pos that a matching float substr constrains rx start to */ 1493 char *rx_max_float = NULL; 1494 1495 /* if the current rx_origin is anchored, either by satisfying an 1496 * anchored substring constraint, or a /^.../m constraint, then we 1497 * can reject the current origin if the start class isn't found 1498 * at the current position. If we have a float-only match, then 1499 * rx_origin is constrained to a range; so look for the start class 1500 * in that range. if neither, then look for the start class in the 1501 * whole rest of the string */ 1502 1503 /* XXX DAPM it's not clear what the minlen test is for, and why 1504 * it's not used in the floating case. Nothing in the test suite 1505 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. 1506 * Here are some old comments, which may or may not be correct: 1507 * 1508 * minlen == 0 is possible if regstclass is \b or \B, 1509 * and the fixed substr is ''$. 1510 * Since minlen is already taken into account, rx_origin+1 is 1511 * before strend; accidentally, minlen >= 1 guaranties no false 1512 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : 1513 * 0) below assumes that regstclass does not come from lookahead... 1514 * If regstclass takes bytelength more than 1: If charlength==1, OK. 1515 * This leaves EXACTF-ish only, which are dealt with in 1516 * find_byclass(). 1517 */ 1518 1519 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 1520 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend); 1521 else if (prog->float_substr || prog->float_utf8) { 1522 rx_max_float = HOP3c(check_at, -start_shift, strbeg); 1523 endpos = HOP3clim(rx_max_float, cl_l, strend); 1524 } 1525 else 1526 endpos= strend; 1527 1528 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1529 " looking for class: start_shift: %" IVdf " check_at: %" IVdf 1530 " rx_origin: %" IVdf " endpos: %" IVdf "\n", 1531 (IV)start_shift, (IV)(check_at - strbeg), 1532 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); 1533 1534 s = find_byclass(prog, progi->regstclass, rx_origin, endpos, 1535 reginfo); 1536 if (!s) { 1537 if (endpos == strend) { 1538 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1539 " Could not match STCLASS...\n") ); 1540 goto fail; 1541 } 1542 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1543 " This position contradicts STCLASS...\n") ); 1544 if ((prog->intflags & PREGf_ANCH) && !ml_anch 1545 && !(prog->intflags & PREGf_IMPLICIT)) 1546 goto fail; 1547 1548 /* Contradict one of substrings */ 1549 if (prog->anchored_substr || prog->anchored_utf8) { 1550 if (prog->substrs->check_ix == 1) { /* check is float */ 1551 /* Have both, check_string is floating */ 1552 assert(rx_origin + start_shift <= check_at); 1553 if (rx_origin + start_shift != check_at) { 1554 /* not at latest position float substr could match: 1555 * Recheck anchored substring, but not floating. 1556 * The condition above is in bytes rather than 1557 * chars for efficiency. It's conservative, in 1558 * that it errs on the side of doing 'goto 1559 * do_other_substr'. In this case, at worst, 1560 * an extra anchored search may get done, but in 1561 * practice the extra fbm_instr() is likely to 1562 * get skipped anyway. */ 1563 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1564 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n", 1565 (long)(other_last - strbeg), 1566 (IV)(rx_origin - strbeg) 1567 )); 1568 goto do_other_substr; 1569 } 1570 } 1571 } 1572 else { 1573 /* float-only */ 1574 1575 if (ml_anch) { 1576 /* In the presence of ml_anch, we might be able to 1577 * find another \n without breaking the current float 1578 * constraint. */ 1579 1580 /* strictly speaking this should be HOP3c(..., 1, ...), 1581 * but since we goto a block of code that's going to 1582 * search for the next \n if any, its safe here */ 1583 rx_origin++; 1584 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1585 " about to look for /%s^%s/m starting at rx_origin %ld...\n", 1586 PL_colors[0], PL_colors[1], 1587 (long)(rx_origin - strbeg)) ); 1588 goto postprocess_substr_matches; 1589 } 1590 1591 /* strictly speaking this can never be true; but might 1592 * be if we ever allow intuit without substrings */ 1593 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) 1594 goto fail; 1595 1596 rx_origin = rx_max_float; 1597 } 1598 1599 /* at this point, any matching substrings have been 1600 * contradicted. Start again... */ 1601 1602 rx_origin = HOP3c(rx_origin, 1, strend); 1603 1604 /* uses bytes rather than char calculations for efficiency. 1605 * It's conservative: it errs on the side of doing 'goto restart', 1606 * where there is code that does a proper char-based test */ 1607 if (rx_origin + start_shift + end_shift > strend) { 1608 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1609 " Could not match STCLASS...\n") ); 1610 goto fail; 1611 } 1612 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1613 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n", 1614 (prog->substrs->check_ix ? "floating" : "anchored"), 1615 (long)(rx_origin + start_shift - strbeg), 1616 (IV)(rx_origin - strbeg) 1617 )); 1618 goto restart; 1619 } 1620 1621 /* Success !!! */ 1622 1623 if (rx_origin != s) { 1624 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1625 " By STCLASS: moving %ld --> %ld\n", 1626 (long)(rx_origin - strbeg), (long)(s - strbeg)) 1627 ); 1628 } 1629 else { 1630 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1631 " Does not contradict STCLASS...\n"); 1632 ); 1633 } 1634 } 1635 1636 /* Decide whether using the substrings helped */ 1637 1638 if (rx_origin != strpos) { 1639 /* Fixed substring is found far enough so that the match 1640 cannot start at strpos. */ 1641 1642 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n")); 1643 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 1644 } 1645 else { 1646 /* The found rx_origin position does not prohibit matching at 1647 * strpos, so calling intuit didn't gain us anything. Decrement 1648 * the BmUSEFUL() count on the check substring, and if we reach 1649 * zero, free it. */ 1650 if (!(prog->intflags & PREGf_NAUGHTY) 1651 && (utf8_target ? ( 1652 prog->check_utf8 /* Could be deleted already */ 1653 && --BmUSEFUL(prog->check_utf8) < 0 1654 && (prog->check_utf8 == prog->float_utf8) 1655 ) : ( 1656 prog->check_substr /* Could be deleted already */ 1657 && --BmUSEFUL(prog->check_substr) < 0 1658 && (prog->check_substr == prog->float_substr) 1659 ))) 1660 { 1661 /* If flags & SOMETHING - do not do it many times on the same match */ 1662 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n")); 1663 /* XXX Does the destruction order has to change with utf8_target? */ 1664 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); 1665 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); 1666 prog->check_substr = prog->check_utf8 = NULL; /* disable */ 1667 prog->float_substr = prog->float_utf8 = NULL; /* clear */ 1668 check = NULL; /* abort */ 1669 /* XXXX This is a remnant of the old implementation. It 1670 looks wasteful, since now INTUIT can use many 1671 other heuristics. */ 1672 prog->extflags &= ~RXf_USE_INTUIT; 1673 } 1674 } 1675 1676 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1677 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", 1678 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) ); 1679 1680 return rx_origin; 1681 1682 fail_finish: /* Substring not found */ 1683 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 1684 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 1685 fail: 1686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n", 1687 PL_colors[4], PL_colors[5])); 1688 return NULL; 1689 } 1690 1691 1692 #define DECL_TRIE_TYPE(scan) \ 1693 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ 1694 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \ 1695 trie_utf8l, trie_flu8, trie_flu8_latin } \ 1696 trie_type = ((scan->flags == EXACT) \ 1697 ? (utf8_target ? trie_utf8 : trie_plain) \ 1698 : (scan->flags == EXACTL) \ 1699 ? (utf8_target ? trie_utf8l : trie_plain) \ 1700 : (scan->flags == EXACTFAA) \ 1701 ? (utf8_target \ 1702 ? trie_utf8_exactfa_fold \ 1703 : trie_latin_utf8_exactfa_fold) \ 1704 : (scan->flags == EXACTFLU8 \ 1705 ? (utf8_target \ 1706 ? trie_flu8 \ 1707 : trie_flu8_latin) \ 1708 : (utf8_target \ 1709 ? trie_utf8_fold \ 1710 : trie_latin_utf8_fold))) 1711 1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is 1713 * 'foldbuf+sizeof(foldbuf)' */ 1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ 1715 STMT_START { \ 1716 STRLEN skiplen; \ 1717 U8 flags = FOLD_FLAGS_FULL; \ 1718 switch (trie_type) { \ 1719 case trie_flu8: \ 1720 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ 1721 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1722 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1723 } \ 1724 goto do_trie_utf8_fold; \ 1725 case trie_utf8_exactfa_fold: \ 1726 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1727 /* FALLTHROUGH */ \ 1728 case trie_utf8_fold: \ 1729 do_trie_utf8_fold: \ 1730 if ( foldlen>0 ) { \ 1731 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1732 foldlen -= len; \ 1733 uscan += len; \ 1734 len=0; \ 1735 } else { \ 1736 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \ 1737 flags); \ 1738 len = UTF8_SAFE_SKIP(uc, uc_end); \ 1739 skiplen = UVCHR_SKIP( uvc ); \ 1740 foldlen -= skiplen; \ 1741 uscan = foldbuf + skiplen; \ 1742 } \ 1743 break; \ 1744 case trie_flu8_latin: \ 1745 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ 1746 goto do_trie_latin_utf8_fold; \ 1747 case trie_latin_utf8_exactfa_fold: \ 1748 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1749 /* FALLTHROUGH */ \ 1750 case trie_latin_utf8_fold: \ 1751 do_trie_latin_utf8_fold: \ 1752 if ( foldlen>0 ) { \ 1753 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1754 foldlen -= len; \ 1755 uscan += len; \ 1756 len=0; \ 1757 } else { \ 1758 len = 1; \ 1759 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ 1760 skiplen = UVCHR_SKIP( uvc ); \ 1761 foldlen -= skiplen; \ 1762 uscan = foldbuf + skiplen; \ 1763 } \ 1764 break; \ 1765 case trie_utf8l: \ 1766 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ 1767 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1768 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1769 } \ 1770 /* FALLTHROUGH */ \ 1771 case trie_utf8: \ 1772 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \ 1773 break; \ 1774 case trie_plain: \ 1775 uvc = (UV)*uc; \ 1776 len = 1; \ 1777 } \ 1778 if (uvc < 256) { \ 1779 charid = trie->charmap[ uvc ]; \ 1780 } \ 1781 else { \ 1782 charid = 0; \ 1783 if (widecharmap) { \ 1784 SV** const svpp = hv_fetch(widecharmap, \ 1785 (char*)&uvc, sizeof(UV), 0); \ 1786 if (svpp) \ 1787 charid = (U16)SvIV(*svpp); \ 1788 } \ 1789 } \ 1790 } STMT_END 1791 1792 #define DUMP_EXEC_POS(li,s,doutf8,depth) \ 1793 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ 1794 startpos, doutf8, depth) 1795 1796 #define REXEC_FBC_UTF8_SCAN(CODE) \ 1797 STMT_START { \ 1798 while (s < strend) { \ 1799 CODE \ 1800 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1801 } \ 1802 } STMT_END 1803 1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \ 1805 STMT_START { \ 1806 while (s < strend) { \ 1807 CODE \ 1808 s++; \ 1809 } \ 1810 } STMT_END 1811 1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ 1813 STMT_START { \ 1814 while (s < strend) { \ 1815 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ 1816 } \ 1817 } STMT_END 1818 1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \ 1820 STMT_START { \ 1821 while (s < strend) { \ 1822 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ 1823 } \ 1824 } STMT_END 1825 1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ 1827 if (COND) { \ 1828 FBC_CHECK_AND_TRY \ 1829 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1830 previous_occurrence_end = s; \ 1831 } \ 1832 else { \ 1833 s += UTF8SKIP(s); \ 1834 } 1835 1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ 1837 if (COND) { \ 1838 FBC_CHECK_AND_TRY \ 1839 s++; \ 1840 previous_occurrence_end = s; \ 1841 } \ 1842 else { \ 1843 s++; \ 1844 } 1845 1846 /* We keep track of where the next character should start after an occurrence 1847 * of the one we're looking for. Knowing that, we can see right away if the 1848 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we 1849 * don't accept the 2nd and succeeding adjacent occurrences */ 1850 #define FBC_CHECK_AND_TRY \ 1851 if ( ( doevery \ 1852 || s != previous_occurrence_end) \ 1853 && ( reginfo->intuit \ 1854 || (s <= reginfo->strend && regtry(reginfo, &s)))) \ 1855 { \ 1856 goto got_it; \ 1857 } 1858 1859 1860 /* These differ from the above macros in that they call a function which 1861 * returns the next occurrence of the thing being looked for in 's'; and 1862 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c) 1863 * */ 1864 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \ 1865 while (s < strend) { \ 1866 s = (char *) (f); \ 1867 if (s >= strend) { \ 1868 break; \ 1869 } \ 1870 \ 1871 FBC_CHECK_AND_TRY \ 1872 s += UTF8SKIP(s); \ 1873 previous_occurrence_end = s; \ 1874 } 1875 1876 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \ 1877 while (s < strend) { \ 1878 s = (char *) (f); \ 1879 if (s >= strend) { \ 1880 break; \ 1881 } \ 1882 \ 1883 FBC_CHECK_AND_TRY \ 1884 s++; \ 1885 previous_occurrence_end = s; \ 1886 } 1887 1888 /* This is like the above macro except the function returns NULL if there is no 1889 * occurrence, and there is a further condition that must be matched besides 1890 * the function */ 1891 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \ 1892 while (s < strend) { \ 1893 s = (char *) (f); \ 1894 if (s == NULL) { \ 1895 s = (char *) strend; \ 1896 break; \ 1897 } \ 1898 \ 1899 if (COND) { \ 1900 FBC_CHECK_AND_TRY \ 1901 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1902 previous_occurrence_end = s; \ 1903 } \ 1904 else { \ 1905 s += UTF8SKIP(s); \ 1906 } \ 1907 } 1908 1909 /* This differs from the above macros in that it is passed a single byte that 1910 * is known to begin the next occurrence of the thing being looked for in 's'. 1911 * It does a memchr to find the next occurrence of 'byte', before trying 'COND' 1912 * at that position. */ 1913 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ 1914 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \ 1915 COND) 1916 1917 /* This is like the function above, but takes an entire string to look for 1918 * instead of a single byte */ 1919 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \ 1920 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \ 1921 ninstr(s, strend, substr, substr_end), \ 1922 COND) 1923 1924 /* The four macros below are slightly different versions of the same logic. 1925 * 1926 * The first is for /a and /aa when the target string is UTF-8. This can only 1927 * match ascii, but it must advance based on UTF-8. The other three handle 1928 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are 1929 * looking for the boundary (or non-boundary) between a word and non-word 1930 * character. The utf8 and non-utf8 cases have the same logic, but the details 1931 * must be different. Find the "wordness" of the character just prior to this 1932 * one, and compare it with the wordness of this one. If they differ, we have 1933 * a boundary. At the beginning of the string, pretend that the previous 1934 * character was a new-line. 1935 * 1936 * All these macros uncleanly have side-effects with each other and outside 1937 * variables. So far it's been too much trouble to clean-up 1938 * 1939 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is 1940 * a word character or not. 1941 * IF_SUCCESS is code to do if it finds that we are at a boundary between 1942 * word/non-word 1943 * IF_FAIL is code to do if we aren't at a boundary between word/non-word 1944 * 1945 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we 1946 * are looking for a boundary or for a non-boundary. If we are looking for a 1947 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and 1948 * see if this tentative match actually works, and if so, to quit the loop 1949 * here. And vice-versa if we are looking for a non-boundary. 1950 * 1951 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and 1952 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of 1953 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be 1954 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal 1955 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that 1956 * complement. But in that branch we complement tmp, meaning that at the 1957 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), 1958 * which means at the top of the loop in the next iteration, it is 1959 * TEST_NON_UTF8(s-1) */ 1960 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 1961 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 1962 tmp = TEST_NON_UTF8(tmp); \ 1963 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ 1964 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ 1965 tmp = !tmp; \ 1966 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ 1967 } \ 1968 else { \ 1969 IF_FAIL; \ 1970 } \ 1971 ); \ 1972 1973 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and 1974 * TEST_UTF8 is a macro that for the same input code points returns identically 1975 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an 1976 * end pointer as well) */ 1977 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ 1978 if (s == reginfo->strbeg) { \ 1979 tmp = '\n'; \ 1980 } \ 1981 else { /* Back-up to the start of the previous character */ \ 1982 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ 1983 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ 1984 0, UTF8_ALLOW_DEFAULT); \ 1985 } \ 1986 tmp = TEST_UV(tmp); \ 1987 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \ 1988 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \ 1989 tmp = !tmp; \ 1990 IF_SUCCESS; \ 1991 } \ 1992 else { \ 1993 IF_FAIL; \ 1994 } \ 1995 ); 1996 1997 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the 1998 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases, 1999 * set-up by the FBC_BOUND, etc macros below */ 2000 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2001 UTF8_CODE; \ 2002 /* Here, things have been set up by the previous code so that tmp is the \ 2003 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \ 2004 * against the EOS, which we treat as a \n */ \ 2005 if (tmp == ! TEST_NON_UTF8('\n')) { \ 2006 IF_SUCCESS; \ 2007 } \ 2008 else { \ 2009 IF_FAIL; \ 2010 } 2011 2012 /* Same as the macro above, but the target isn't UTF-8 */ 2013 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2014 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 2015 tmp = TEST_NON_UTF8(tmp); \ 2016 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \ 2017 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \ 2018 IF_SUCCESS; \ 2019 tmp = !tmp; \ 2020 } \ 2021 else { \ 2022 IF_FAIL; \ 2023 } \ 2024 ); \ 2025 /* Here, things have been set up by the previous code so that tmp is \ 2026 * the return of TEST_NON_UTF8(s-1). We also have to check if this \ 2027 * matches against the EOS, which we treat as a \n */ \ 2028 if (tmp == ! TEST_NON_UTF8('\n')) { \ 2029 IF_SUCCESS; \ 2030 } \ 2031 else { \ 2032 IF_FAIL; \ 2033 } 2034 2035 /* This is the macro to use when we want to see if something that looks like it 2036 * could match, actually does, and if so exits the loop. It needs to be used 2037 * only for bounds checking macros, as it allows for matching beyond the end of 2038 * string (which should be zero length without having to look at the string 2039 * contents) */ 2040 #define REXEC_FBC_TRYIT \ 2041 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ 2042 goto got_it 2043 2044 /* The only difference between the BOUND and NBOUND cases is that 2045 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in 2046 * NBOUND. This is accomplished by passing it as either the if or else clause, 2047 * with the other one being empty (PLACEHOLDER is defined as empty). 2048 * 2049 * The TEST_FOO parameters are for operating on different forms of input, but 2050 * all should be ones that return identically for the same underlying code 2051 * points */ 2052 2053 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 2054 FBC_BOUND_COMMON_UTF8( \ 2055 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ 2056 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2057 2058 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \ 2059 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2060 2061 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \ 2062 FBC_BOUND_COMMON_UTF8( \ 2063 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\ 2064 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2065 2066 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \ 2067 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2068 2069 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 2070 FBC_BOUND_COMMON_UTF8( \ 2071 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2072 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2073 2074 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \ 2075 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2076 2077 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \ 2078 FBC_BOUND_COMMON_UTF8( \ 2079 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2080 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2081 2082 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \ 2083 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2084 2085 #ifdef DEBUGGING 2086 static IV 2087 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) { 2088 IV cp_out = _invlist_search(invlist, cp_in); 2089 assert(cp_out >= 0); 2090 return cp_out; 2091 } 2092 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2093 invmap[S_get_break_val_cp_checked(invlist, cp)] 2094 #else 2095 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2096 invmap[_invlist_search(invlist, cp)] 2097 #endif 2098 2099 /* Takes a pointer to an inversion list, a pointer to its corresponding 2100 * inversion map, and a code point, and returns the code point's value 2101 * according to the two arrays. It assumes that all code points have a value. 2102 * This is used as the base macro for macros for particular properties */ 2103 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \ 2104 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) 2105 2106 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead 2107 * of a code point, returning the value for the first code point in the string. 2108 * And it takes the particular macro name that finds the desired value given a 2109 * code point. Merely convert the UTF-8 to code point and call the cp macro */ 2110 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \ 2111 (__ASSERT_(pos < strend) \ 2112 /* Note assumes is valid UTF-8 */ \ 2113 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL)))) 2114 2115 /* Returns the GCB value for the input code point */ 2116 #define getGCB_VAL_CP(cp) \ 2117 _generic_GET_BREAK_VAL_CP( \ 2118 PL_GCB_invlist, \ 2119 _Perl_GCB_invmap, \ 2120 (cp)) 2121 2122 /* Returns the GCB value for the first code point in the UTF-8 encoded string 2123 * bounded by pos and strend */ 2124 #define getGCB_VAL_UTF8(pos, strend) \ 2125 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend) 2126 2127 /* Returns the LB value for the input code point */ 2128 #define getLB_VAL_CP(cp) \ 2129 _generic_GET_BREAK_VAL_CP( \ 2130 PL_LB_invlist, \ 2131 _Perl_LB_invmap, \ 2132 (cp)) 2133 2134 /* Returns the LB value for the first code point in the UTF-8 encoded string 2135 * bounded by pos and strend */ 2136 #define getLB_VAL_UTF8(pos, strend) \ 2137 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend) 2138 2139 2140 /* Returns the SB value for the input code point */ 2141 #define getSB_VAL_CP(cp) \ 2142 _generic_GET_BREAK_VAL_CP( \ 2143 PL_SB_invlist, \ 2144 _Perl_SB_invmap, \ 2145 (cp)) 2146 2147 /* Returns the SB value for the first code point in the UTF-8 encoded string 2148 * bounded by pos and strend */ 2149 #define getSB_VAL_UTF8(pos, strend) \ 2150 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend) 2151 2152 /* Returns the WB value for the input code point */ 2153 #define getWB_VAL_CP(cp) \ 2154 _generic_GET_BREAK_VAL_CP( \ 2155 PL_WB_invlist, \ 2156 _Perl_WB_invmap, \ 2157 (cp)) 2158 2159 /* Returns the WB value for the first code point in the UTF-8 encoded string 2160 * bounded by pos and strend */ 2161 #define getWB_VAL_UTF8(pos, strend) \ 2162 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend) 2163 2164 /* We know what class REx starts with. Try to find this position... */ 2165 /* if reginfo->intuit, its a dryrun */ 2166 /* annoyingly all the vars in this routine have different names from their counterparts 2167 in regmatch. /grrr */ 2168 STATIC char * 2169 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 2170 const char *strend, regmatch_info *reginfo) 2171 { 2172 2173 /* TRUE if x+ need not match at just the 1st pos of run of x's */ 2174 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; 2175 2176 char *pat_string; /* The pattern's exactish string */ 2177 char *pat_end; /* ptr to end char of pat_string */ 2178 re_fold_t folder; /* Function for computing non-utf8 folds */ 2179 const U8 *fold_array; /* array for folding ords < 256 */ 2180 STRLEN ln; 2181 STRLEN lnc; 2182 U8 c1; 2183 U8 c2; 2184 char *e = NULL; 2185 2186 /* In some cases we accept only the first occurence of 'x' in a sequence of 2187 * them. This variable points to just beyond the end of the previous 2188 * occurrence of 'x', hence we can tell if we are in a sequence. (Having 2189 * it point to beyond the 'x' allows us to work for UTF-8 without having to 2190 * hop back.) */ 2191 char * previous_occurrence_end = 0; 2192 2193 I32 tmp; /* Scratch variable */ 2194 const bool utf8_target = reginfo->is_utf8_target; 2195 UV utf8_fold_flags = 0; 2196 const bool is_utf8_pat = reginfo->is_utf8_pat; 2197 bool to_complement = FALSE; /* Invert the result? Taking the xor of this 2198 with a result inverts that result, as 0^1 = 2199 1 and 1^1 = 0 */ 2200 _char_class_number classnum; 2201 2202 RXi_GET_DECL(prog,progi); 2203 2204 PERL_ARGS_ASSERT_FIND_BYCLASS; 2205 2206 /* We know what class it must start with. The case statements below have 2207 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb' 2208 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern 2209 * ('p8' and 'pb'. */ 2210 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) { 2211 2212 case ANYOFPOSIXL_t8_pb: 2213 case ANYOFPOSIXL_t8_p8: 2214 case ANYOFL_t8_pb: 2215 case ANYOFL_t8_p8: 2216 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2217 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); 2218 2219 /* FALLTHROUGH */ 2220 2221 case ANYOFD_t8_pb: 2222 case ANYOFD_t8_p8: 2223 case ANYOF_t8_pb: 2224 case ANYOF_t8_p8: 2225 REXEC_FBC_UTF8_CLASS_SCAN( 2226 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)); 2227 break; 2228 2229 case ANYOFPOSIXL_tb_pb: 2230 case ANYOFPOSIXL_tb_p8: 2231 case ANYOFL_tb_pb: 2232 case ANYOFL_tb_p8: 2233 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2234 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); 2235 2236 /* FALLTHROUGH */ 2237 2238 case ANYOFD_tb_pb: 2239 case ANYOFD_tb_p8: 2240 case ANYOF_tb_pb: 2241 case ANYOF_tb_p8: 2242 if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 2243 /* We know that s is in the bitmap range since the target isn't 2244 * UTF-8, so what happens for out-of-range values is not relevant, 2245 * so exclude that from the flags */ 2246 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 2247 0)); 2248 } 2249 else { 2250 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s))); 2251 } 2252 break; 2253 2254 case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */ 2255 case ANYOFM_tb_p8: 2256 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( 2257 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c))); 2258 break; 2259 2260 case ANYOFM_t8_pb: 2261 case ANYOFM_t8_p8: 2262 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But 2263 * we do anyway for performance reasons, as otherwise we would have to 2264 * examine all the continuation characters */ 2265 REXEC_FBC_UTF8_FIND_NEXT_SCAN( 2266 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c))); 2267 break; 2268 2269 case NANYOFM_tb_pb: 2270 case NANYOFM_tb_p8: 2271 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( 2272 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c))); 2273 break; 2274 2275 case NANYOFM_t8_pb: 2276 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8 2277 variants. */ 2278 REXEC_FBC_UTF8_FIND_NEXT_SCAN( 2279 (char *) find_span_end_mask((U8 *) s, (U8 *) strend, 2280 (U8) ARG(c), FLAGS(c))); 2281 break; 2282 2283 /* These nodes all require at least one code point to be in UTF-8 to 2284 * match */ 2285 case ANYOFH_tb_pb: 2286 case ANYOFH_tb_p8: 2287 case ANYOFHb_tb_pb: 2288 case ANYOFHb_tb_p8: 2289 case ANYOFHr_tb_pb: 2290 case ANYOFHr_tb_p8: 2291 case ANYOFHs_tb_pb: 2292 case ANYOFHs_tb_p8: 2293 case EXACTFLU8_tb_pb: 2294 case EXACTFLU8_tb_p8: 2295 case EXACTFU_REQ8_tb_pb: 2296 case EXACTFU_REQ8_tb_p8: 2297 break; 2298 2299 case ANYOFH_t8_pb: 2300 case ANYOFH_t8_p8: 2301 REXEC_FBC_UTF8_CLASS_SCAN( 2302 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2303 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */))); 2304 break; 2305 2306 case ANYOFHb_t8_pb: 2307 case ANYOFHb_t8_p8: 2308 { 2309 /* We know what the first byte of any matched string should be. */ 2310 U8 first_byte = FLAGS(c); 2311 2312 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2313 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)); 2314 } 2315 break; 2316 2317 case ANYOFHr_t8_pb: 2318 case ANYOFHr_t8_p8: 2319 REXEC_FBC_UTF8_CLASS_SCAN( 2320 ( inRANGE(NATIVE_UTF8_TO_I8(*s), 2321 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), 2322 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) 2323 && reginclass(prog, c, (U8*)s, (U8*) strend, 2324 1 /* is utf8 */))); 2325 break; 2326 2327 case ANYOFHs_t8_pb: 2328 case ANYOFHs_t8_p8: 2329 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN( 2330 ((struct regnode_anyofhs *) c)->string, 2331 /* Note FLAGS is the string length in this regnode */ 2332 ((struct regnode_anyofhs *) c)->string + FLAGS(c), 2333 reginclass(prog, c, (U8*)s, (U8*) strend, 2334 1 /* is utf8 */)); 2335 break; 2336 2337 case ANYOFR_tb_pb: 2338 case ANYOFR_tb_p8: 2339 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, 2340 ANYOFRbase(c), ANYOFRdelta(c))); 2341 break; 2342 2343 case ANYOFR_t8_pb: 2344 case ANYOFR_t8_p8: 2345 REXEC_FBC_UTF8_CLASS_SCAN( 2346 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2347 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2348 (U8 *) strend, 2349 NULL), 2350 ANYOFRbase(c), ANYOFRdelta(c)))); 2351 break; 2352 2353 case ANYOFRb_tb_pb: 2354 case ANYOFRb_tb_p8: 2355 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, 2356 ANYOFRbase(c), ANYOFRdelta(c))); 2357 break; 2358 2359 case ANYOFRb_t8_pb: 2360 case ANYOFRb_t8_p8: 2361 { /* We know what the first byte of any matched string should be */ 2362 U8 first_byte = FLAGS(c); 2363 2364 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2365 withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2366 (U8 *) strend, 2367 NULL), 2368 ANYOFRbase(c), ANYOFRdelta(c))); 2369 } 2370 break; 2371 2372 case EXACTFAA_tb_pb: 2373 2374 /* Latin1 folds are not affected by /a, except it excludes the sharp s, 2375 * which these functions don't handle anyway */ 2376 fold_array = PL_fold_latin1; 2377 folder = foldEQ_latin1_s2_folded; 2378 goto do_exactf_non_utf8; 2379 2380 case EXACTF_tb_pb: 2381 fold_array = PL_fold; 2382 folder = foldEQ; 2383 goto do_exactf_non_utf8; 2384 2385 case EXACTFL_tb_pb: 2386 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2387 2388 if (IN_UTF8_CTYPE_LOCALE) { 2389 utf8_fold_flags = FOLDEQ_LOCALE; 2390 goto do_exactf_utf8; 2391 } 2392 2393 fold_array = PL_fold_locale; 2394 folder = foldEQ_locale; 2395 goto do_exactf_non_utf8; 2396 2397 case EXACTFU_tb_pb: 2398 /* Any 'ss' in the pattern should have been replaced by regcomp, so we 2399 * don't have to worry here about this single special case in the 2400 * Latin1 range */ 2401 fold_array = PL_fold_latin1; 2402 folder = foldEQ_latin1_s2_folded; 2403 2404 /* FALLTHROUGH */ 2405 2406 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there 2407 are no glitches with fold-length differences 2408 between the target string and pattern */ 2409 2410 /* The idea in the non-utf8 EXACTF* cases is to first find the first 2411 * character of the EXACTF* node and then, if necessary, 2412 * case-insensitively compare the full text of the node. c1 is the 2413 * first character. c2 is its fold. This logic will not work for 2414 * Unicode semantics and the german sharp ss, which hence should not be 2415 * compiled into a node that gets here. */ 2416 pat_string = STRINGs(c); 2417 ln = STR_LENs(c); /* length to match in octets/bytes */ 2418 2419 /* We know that we have to match at least 'ln' bytes (which is the same 2420 * as characters, since not utf8). If we have to match 3 characters, 2421 * and there are only 2 availabe, we know without trying that it will 2422 * fail; so don't start a match past the required minimum number from 2423 * the far end */ 2424 e = HOP3c(strend, -((SSize_t)ln), s); 2425 if (e < s) 2426 break; 2427 2428 c1 = *pat_string; 2429 c2 = fold_array[c1]; 2430 if (c1 == c2) { /* If char and fold are the same */ 2431 while (s <= e) { 2432 s = (char *) memchr(s, c1, e + 1 - s); 2433 if (s == NULL) { 2434 break; 2435 } 2436 2437 /* Check that the rest of the node matches */ 2438 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1)) 2439 && (reginfo->intuit || regtry(reginfo, &s)) ) 2440 { 2441 goto got_it; 2442 } 2443 s++; 2444 } 2445 } 2446 else { 2447 U8 bits_differing = c1 ^ c2; 2448 2449 /* If the folds differ in one bit position only, we can mask to 2450 * match either of them, and can use this faster find method. Both 2451 * ASCII and EBCDIC tend to have their case folds differ in only 2452 * one position, so this is very likely */ 2453 if (LIKELY(PL_bitcount[bits_differing] == 1)) { 2454 bits_differing = ~ bits_differing; 2455 while (s <= e) { 2456 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1, 2457 (c1 & bits_differing), bits_differing); 2458 if (s > e) { 2459 break; 2460 } 2461 2462 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1)) 2463 && (reginfo->intuit || regtry(reginfo, &s)) ) 2464 { 2465 goto got_it; 2466 } 2467 s++; 2468 } 2469 } 2470 else { /* Otherwise, stuck with looking byte-at-a-time. This 2471 should actually happen only in EXACTFL nodes */ 2472 while (s <= e) { 2473 if ( (*(U8*)s == c1 || *(U8*)s == c2) 2474 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1)) 2475 && (reginfo->intuit || regtry(reginfo, &s)) ) 2476 { 2477 goto got_it; 2478 } 2479 s++; 2480 } 2481 } 2482 } 2483 break; 2484 2485 case EXACTFAA_tb_p8: 2486 case EXACTFAA_t8_p8: 2487 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII 2488 |FOLDEQ_S2_ALREADY_FOLDED 2489 |FOLDEQ_S2_FOLDS_SANE; 2490 goto do_exactf_utf8; 2491 2492 case EXACTFAA_NO_TRIE_tb_pb: 2493 case EXACTFAA_NO_TRIE_t8_pb: 2494 case EXACTFAA_t8_pb: 2495 2496 /* Here, and elsewhere in this file, the reason we can't consider a 2497 * non-UTF-8 pattern already folded in the presence of a UTF-8 target 2498 * is because any MICRO SIGN in the pattern won't be folded. Since the 2499 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider 2500 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */ 2501 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 2502 goto do_exactf_utf8; 2503 2504 case EXACTFL_tb_p8: 2505 case EXACTFL_t8_pb: 2506 case EXACTFL_t8_p8: 2507 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2508 utf8_fold_flags = FOLDEQ_LOCALE; 2509 goto do_exactf_utf8; 2510 2511 case EXACTFLU8_t8_pb: 2512 case EXACTFLU8_t8_p8: 2513 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 2514 | FOLDEQ_S2_FOLDS_SANE; 2515 goto do_exactf_utf8; 2516 2517 case EXACTFU_REQ8_t8_p8: 2518 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2519 goto do_exactf_utf8; 2520 2521 case EXACTFU_tb_p8: 2522 case EXACTFU_t8_pb: 2523 case EXACTFU_t8_p8: 2524 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2525 goto do_exactf_utf8; 2526 2527 /* The following are problematic even though pattern isn't UTF-8. Use 2528 * full functionality normally not done except for UTF-8. */ 2529 case EXACTF_t8_pb: 2530 case EXACTFUP_tb_pb: 2531 case EXACTFUP_t8_pb: 2532 2533 do_exactf_utf8: 2534 { 2535 unsigned expansion; 2536 2537 /* If one of the operands is in utf8, we can't use the simpler 2538 * folding above, due to the fact that many different characters 2539 * can have the same fold, or portion of a fold, or different- 2540 * length fold */ 2541 pat_string = STRINGs(c); 2542 ln = STR_LENs(c); /* length to match in octets/bytes */ 2543 pat_end = pat_string + ln; 2544 lnc = is_utf8_pat /* length to match in characters */ 2545 ? utf8_length((U8 *) pat_string, (U8 *) pat_end) 2546 : ln; 2547 2548 /* We have 'lnc' characters to match in the pattern, but because of 2549 * multi-character folding, each character in the target can match 2550 * up to 3 characters (Unicode guarantees it will never exceed 2551 * this) if it is utf8-encoded; and up to 2 if not (based on the 2552 * fact that the Latin 1 folds are already determined, and the only 2553 * multi-char fold in that range is the sharp-s folding to 'ss'. 2554 * Thus, a pattern character can match as little as 1/3 of a string 2555 * character. Adjust lnc accordingly, rounding up, so that if we 2556 * need to match at least 4+1/3 chars, that really is 5. */ 2557 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; 2558 lnc = (lnc + expansion - 1) / expansion; 2559 2560 /* As in the non-UTF8 case, if we have to match 3 characters, and 2561 * only 2 are left, it's guaranteed to fail, so don't start a match 2562 * that would require us to go beyond the end of the string */ 2563 e = HOP3c(strend, -((SSize_t)lnc), s); 2564 2565 /* XXX Note that we could recalculate e to stop the loop earlier, 2566 * as the worst case expansion above will rarely be met, and as we 2567 * go along we would usually find that e moves further to the left. 2568 * This would happen only after we reached the point in the loop 2569 * where if there were no expansion we should fail. Unclear if 2570 * worth the expense */ 2571 2572 while (s <= e) { 2573 char *my_strend= (char *)strend; 2574 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, 2575 pat_string, NULL, ln, is_utf8_pat, 2576 utf8_fold_flags) 2577 && (reginfo->intuit || regtry(reginfo, &s)) ) 2578 { 2579 goto got_it; 2580 } 2581 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2582 } 2583 } 2584 break; 2585 2586 case BOUNDA_tb_pb: 2587 case BOUNDA_tb_p8: 2588 case BOUND_tb_pb: /* /d without utf8 target is /a */ 2589 case BOUND_tb_p8: 2590 /* regcomp.c makes sure that these only have the traditional \b 2591 * meaning. */ 2592 assert(FLAGS(c) == TRADITIONAL_BOUND); 2593 2594 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A); 2595 break; 2596 2597 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ 2598 case BOUNDA_t8_p8: 2599 /* regcomp.c makes sure that these only have the traditional \b 2600 * meaning. */ 2601 assert(FLAGS(c) == TRADITIONAL_BOUND); 2602 2603 FBC_BOUND_A_UTF8(isWORDCHAR_A); 2604 break; 2605 2606 case NBOUNDA_tb_pb: 2607 case NBOUNDA_tb_p8: 2608 case NBOUND_tb_pb: /* /d without utf8 target is /a */ 2609 case NBOUND_tb_p8: 2610 /* regcomp.c makes sure that these only have the traditional \b 2611 * meaning. */ 2612 assert(FLAGS(c) == TRADITIONAL_BOUND); 2613 2614 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A); 2615 break; 2616 2617 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ 2618 case NBOUNDA_t8_p8: 2619 /* regcomp.c makes sure that these only have the traditional \b 2620 * meaning. */ 2621 assert(FLAGS(c) == TRADITIONAL_BOUND); 2622 2623 FBC_NBOUND_A_UTF8(isWORDCHAR_A); 2624 break; 2625 2626 case NBOUNDU_tb_pb: 2627 case NBOUNDU_tb_p8: 2628 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2629 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1); 2630 break; 2631 } 2632 2633 to_complement = 1; 2634 goto do_boundu_non_utf8; 2635 2636 case NBOUNDL_tb_pb: 2637 case NBOUNDL_tb_p8: 2638 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2639 if (FLAGS(c) == TRADITIONAL_BOUND) { 2640 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC); 2641 break; 2642 } 2643 2644 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2645 2646 to_complement = 1; 2647 goto do_boundu_non_utf8; 2648 2649 case BOUNDL_tb_pb: 2650 case BOUNDL_tb_p8: 2651 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2652 if (FLAGS(c) == TRADITIONAL_BOUND) { 2653 FBC_BOUND_NON_UTF8(isWORDCHAR_LC); 2654 break; 2655 } 2656 2657 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2658 2659 goto do_boundu_non_utf8; 2660 2661 case BOUNDU_tb_pb: 2662 case BOUNDU_tb_p8: 2663 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2664 FBC_BOUND_NON_UTF8(isWORDCHAR_L1); 2665 break; 2666 } 2667 2668 do_boundu_non_utf8: 2669 if (s == reginfo->strbeg) { 2670 if (reginfo->intuit || regtry(reginfo, &s)) 2671 { 2672 goto got_it; 2673 } 2674 2675 /* Didn't match. Try at the next position (if there is one) */ 2676 s++; 2677 if (UNLIKELY(s >= reginfo->strend)) { 2678 break; 2679 } 2680 } 2681 2682 switch((bound_type) FLAGS(c)) { 2683 case TRADITIONAL_BOUND: /* Should have already been handled */ 2684 assert(0); 2685 break; 2686 2687 case GCB_BOUND: 2688 /* Not utf8. Everything is a GCB except between CR and LF */ 2689 while (s < strend) { 2690 if ((to_complement ^ ( UCHARAT(s - 1) != '\r' 2691 || UCHARAT(s) != '\n')) 2692 && (reginfo->intuit || regtry(reginfo, &s))) 2693 { 2694 goto got_it; 2695 } 2696 s++; 2697 } 2698 2699 break; 2700 2701 case LB_BOUND: 2702 { 2703 LB_enum before = getLB_VAL_CP((U8) *(s -1)); 2704 while (s < strend) { 2705 LB_enum after = getLB_VAL_CP((U8) *s); 2706 if (to_complement ^ isLB(before, 2707 after, 2708 (U8*) reginfo->strbeg, 2709 (U8*) s, 2710 (U8*) reginfo->strend, 2711 0 /* target not utf8 */ ) 2712 && (reginfo->intuit || regtry(reginfo, &s))) 2713 { 2714 goto got_it; 2715 } 2716 before = after; 2717 s++; 2718 } 2719 } 2720 2721 break; 2722 2723 case SB_BOUND: 2724 { 2725 SB_enum before = getSB_VAL_CP((U8) *(s -1)); 2726 while (s < strend) { 2727 SB_enum after = getSB_VAL_CP((U8) *s); 2728 if ((to_complement ^ isSB(before, 2729 after, 2730 (U8*) reginfo->strbeg, 2731 (U8*) s, 2732 (U8*) reginfo->strend, 2733 0 /* target not utf8 */ )) 2734 && (reginfo->intuit || regtry(reginfo, &s))) 2735 { 2736 goto got_it; 2737 } 2738 before = after; 2739 s++; 2740 } 2741 } 2742 2743 break; 2744 2745 case WB_BOUND: 2746 { 2747 WB_enum previous = WB_UNKNOWN; 2748 WB_enum before = getWB_VAL_CP((U8) *(s -1)); 2749 while (s < strend) { 2750 WB_enum after = getWB_VAL_CP((U8) *s); 2751 if ((to_complement ^ isWB(previous, 2752 before, 2753 after, 2754 (U8*) reginfo->strbeg, 2755 (U8*) s, 2756 (U8*) reginfo->strend, 2757 0 /* target not utf8 */ )) 2758 && (reginfo->intuit || regtry(reginfo, &s))) 2759 { 2760 goto got_it; 2761 } 2762 previous = before; 2763 before = after; 2764 s++; 2765 } 2766 } 2767 } 2768 2769 /* Here are at the final position in the target string, which is a 2770 * boundary by definition, so matches, depending on other constraints. 2771 * */ 2772 if ( reginfo->intuit 2773 || (s <= reginfo->strend && regtry(reginfo, &s))) 2774 { 2775 goto got_it; 2776 } 2777 2778 break; 2779 2780 case BOUNDL_t8_pb: 2781 case BOUNDL_t8_p8: 2782 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2783 if (FLAGS(c) == TRADITIONAL_BOUND) { 2784 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, 2785 isWORDCHAR_LC_utf8_safe); 2786 break; 2787 } 2788 2789 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2790 2791 to_complement = 1; 2792 goto do_boundu_utf8; 2793 2794 case NBOUNDL_t8_pb: 2795 case NBOUNDL_t8_p8: 2796 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2797 if (FLAGS(c) == TRADITIONAL_BOUND) { 2798 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, 2799 isWORDCHAR_LC_utf8_safe); 2800 break; 2801 } 2802 2803 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2804 2805 to_complement = 1; 2806 goto do_boundu_utf8; 2807 2808 case NBOUND_t8_pb: 2809 case NBOUND_t8_p8: 2810 /* regcomp.c makes sure that these only have the traditional \b 2811 * meaning. */ 2812 assert(FLAGS(c) == TRADITIONAL_BOUND); 2813 2814 /* FALLTHROUGH */ 2815 2816 case NBOUNDU_t8_pb: 2817 case NBOUNDU_t8_p8: 2818 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2819 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, 2820 isWORDCHAR_utf8_safe); 2821 break; 2822 } 2823 2824 to_complement = 1; 2825 goto do_boundu_utf8; 2826 2827 case BOUND_t8_pb: 2828 case BOUND_t8_p8: 2829 /* regcomp.c makes sure that these only have the traditional \b 2830 * meaning. */ 2831 assert(FLAGS(c) == TRADITIONAL_BOUND); 2832 2833 /* FALLTHROUGH */ 2834 2835 case BOUNDU_t8_pb: 2836 case BOUNDU_t8_p8: 2837 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2838 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); 2839 break; 2840 } 2841 2842 do_boundu_utf8: 2843 if (s == reginfo->strbeg) { 2844 if (reginfo->intuit || regtry(reginfo, &s)) 2845 { 2846 goto got_it; 2847 } 2848 2849 /* Didn't match. Try at the next position (if there is one) */ 2850 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2851 if (UNLIKELY(s >= reginfo->strend)) { 2852 break; 2853 } 2854 } 2855 2856 switch((bound_type) FLAGS(c)) { 2857 case TRADITIONAL_BOUND: /* Should have already been handled */ 2858 assert(0); 2859 break; 2860 2861 case GCB_BOUND: 2862 { 2863 GCB_enum before = getGCB_VAL_UTF8( 2864 reghop3((U8*)s, -1, 2865 (U8*)(reginfo->strbeg)), 2866 (U8*) reginfo->strend); 2867 while (s < strend) { 2868 GCB_enum after = getGCB_VAL_UTF8((U8*) s, 2869 (U8*) reginfo->strend); 2870 if ( (to_complement ^ isGCB(before, 2871 after, 2872 (U8*) reginfo->strbeg, 2873 (U8*) s, 2874 1 /* target is utf8 */ )) 2875 && (reginfo->intuit || regtry(reginfo, &s))) 2876 { 2877 goto got_it; 2878 } 2879 before = after; 2880 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2881 } 2882 } 2883 break; 2884 2885 case LB_BOUND: 2886 { 2887 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, 2888 -1, 2889 (U8*)(reginfo->strbeg)), 2890 (U8*) reginfo->strend); 2891 while (s < strend) { 2892 LB_enum after = getLB_VAL_UTF8((U8*) s, 2893 (U8*) reginfo->strend); 2894 if (to_complement ^ isLB(before, 2895 after, 2896 (U8*) reginfo->strbeg, 2897 (U8*) s, 2898 (U8*) reginfo->strend, 2899 1 /* target is utf8 */ ) 2900 && (reginfo->intuit || regtry(reginfo, &s))) 2901 { 2902 goto got_it; 2903 } 2904 before = after; 2905 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2906 } 2907 } 2908 2909 break; 2910 2911 case SB_BOUND: 2912 { 2913 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s, 2914 -1, 2915 (U8*)(reginfo->strbeg)), 2916 (U8*) reginfo->strend); 2917 while (s < strend) { 2918 SB_enum after = getSB_VAL_UTF8((U8*) s, 2919 (U8*) reginfo->strend); 2920 if ((to_complement ^ isSB(before, 2921 after, 2922 (U8*) reginfo->strbeg, 2923 (U8*) s, 2924 (U8*) reginfo->strend, 2925 1 /* target is utf8 */ )) 2926 && (reginfo->intuit || regtry(reginfo, &s))) 2927 { 2928 goto got_it; 2929 } 2930 before = after; 2931 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2932 } 2933 } 2934 2935 break; 2936 2937 case WB_BOUND: 2938 { 2939 /* We are at a boundary between char_sub_0 and char_sub_1. 2940 * We also keep track of the value for char_sub_-1 as we 2941 * loop through the line. Context may be needed to make a 2942 * determination, and if so, this can save having to 2943 * recalculate it */ 2944 WB_enum previous = WB_UNKNOWN; 2945 WB_enum before = getWB_VAL_UTF8( 2946 reghop3((U8*)s, 2947 -1, 2948 (U8*)(reginfo->strbeg)), 2949 (U8*) reginfo->strend); 2950 while (s < strend) { 2951 WB_enum after = getWB_VAL_UTF8((U8*) s, 2952 (U8*) reginfo->strend); 2953 if ((to_complement ^ isWB(previous, 2954 before, 2955 after, 2956 (U8*) reginfo->strbeg, 2957 (U8*) s, 2958 (U8*) reginfo->strend, 2959 1 /* target is utf8 */ )) 2960 && (reginfo->intuit || regtry(reginfo, &s))) 2961 { 2962 goto got_it; 2963 } 2964 previous = before; 2965 before = after; 2966 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2967 } 2968 } 2969 } 2970 2971 /* Here are at the final position in the target string, which is a 2972 * boundary by definition, so matches, depending on other constraints. 2973 * */ 2974 2975 if ( reginfo->intuit 2976 || (s <= reginfo->strend && regtry(reginfo, &s))) 2977 { 2978 goto got_it; 2979 } 2980 break; 2981 2982 case LNBREAK_t8_pb: 2983 case LNBREAK_t8_p8: 2984 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend)); 2985 break; 2986 2987 case LNBREAK_tb_pb: 2988 case LNBREAK_tb_p8: 2989 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend)); 2990 break; 2991 2992 /* The argument to all the POSIX node types is the class number to pass 2993 * to _generic_isCC() to build a mask for searching in PL_charclass[] */ 2994 2995 case NPOSIXL_t8_pb: 2996 case NPOSIXL_t8_p8: 2997 to_complement = 1; 2998 /* FALLTHROUGH */ 2999 3000 case POSIXL_t8_pb: 3001 case POSIXL_t8_p8: 3002 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 3003 REXEC_FBC_UTF8_CLASS_SCAN( 3004 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, 3005 (U8 *) strend))); 3006 break; 3007 3008 case NPOSIXL_tb_pb: 3009 case NPOSIXL_tb_p8: 3010 to_complement = 1; 3011 /* FALLTHROUGH */ 3012 3013 case POSIXL_tb_pb: 3014 case POSIXL_tb_p8: 3015 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 3016 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3017 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); 3018 break; 3019 3020 case NPOSIXA_t8_pb: 3021 case NPOSIXA_t8_p8: 3022 /* The complement of something that matches only ASCII matches all 3023 * non-ASCII, plus everything in ASCII that isn't in the class. */ 3024 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend) 3025 || ! _generic_isCC_A(*s, FLAGS(c))); 3026 break; 3027 3028 case POSIXA_t8_pb: 3029 case POSIXA_t8_p8: 3030 /* Don't need to worry about utf8, as it can match only a single 3031 * byte invariant character. But we do anyway for performance reasons, 3032 * as otherwise we would have to examine all the continuation 3033 * characters */ 3034 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c))); 3035 break; 3036 3037 case NPOSIXD_tb_pb: 3038 case NPOSIXD_tb_p8: 3039 case NPOSIXA_tb_pb: 3040 case NPOSIXA_tb_p8: 3041 to_complement = 1; 3042 /* FALLTHROUGH */ 3043 3044 case POSIXD_tb_pb: 3045 case POSIXD_tb_p8: 3046 case POSIXA_tb_pb: 3047 case POSIXA_tb_p8: 3048 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3049 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); 3050 break; 3051 3052 case NPOSIXU_tb_pb: 3053 case NPOSIXU_tb_p8: 3054 to_complement = 1; 3055 /* FALLTHROUGH */ 3056 3057 case POSIXU_tb_pb: 3058 case POSIXU_tb_p8: 3059 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3060 to_complement ^ cBOOL(_generic_isCC(*s, 3061 FLAGS(c)))); 3062 break; 3063 3064 case NPOSIXD_t8_pb: 3065 case NPOSIXD_t8_p8: 3066 case NPOSIXU_t8_pb: 3067 case NPOSIXU_t8_p8: 3068 to_complement = 1; 3069 /* FALLTHROUGH */ 3070 3071 case POSIXD_t8_pb: 3072 case POSIXD_t8_p8: 3073 case POSIXU_t8_pb: 3074 case POSIXU_t8_p8: 3075 classnum = (_char_class_number) FLAGS(c); 3076 switch (classnum) { 3077 default: 3078 REXEC_FBC_UTF8_CLASS_SCAN( 3079 to_complement ^ cBOOL(_invlist_contains_cp( 3080 PL_XPosix_ptrs[classnum], 3081 utf8_to_uvchr_buf((U8 *) s, 3082 (U8 *) strend, 3083 NULL)))); 3084 break; 3085 3086 case _CC_ENUM_SPACE: 3087 REXEC_FBC_UTF8_CLASS_SCAN( 3088 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend))); 3089 break; 3090 3091 case _CC_ENUM_BLANK: 3092 REXEC_FBC_UTF8_CLASS_SCAN( 3093 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend))); 3094 break; 3095 3096 case _CC_ENUM_XDIGIT: 3097 REXEC_FBC_UTF8_CLASS_SCAN( 3098 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend))); 3099 break; 3100 3101 case _CC_ENUM_VERTSPACE: 3102 REXEC_FBC_UTF8_CLASS_SCAN( 3103 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend))); 3104 break; 3105 3106 case _CC_ENUM_CNTRL: 3107 REXEC_FBC_UTF8_CLASS_SCAN( 3108 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend))); 3109 break; 3110 } 3111 break; 3112 3113 case AHOCORASICKC_tb_pb: 3114 case AHOCORASICKC_tb_p8: 3115 case AHOCORASICKC_t8_pb: 3116 case AHOCORASICKC_t8_p8: 3117 case AHOCORASICK_tb_pb: 3118 case AHOCORASICK_tb_p8: 3119 case AHOCORASICK_t8_pb: 3120 case AHOCORASICK_t8_p8: 3121 { 3122 DECL_TRIE_TYPE(c); 3123 /* what trie are we using right now */ 3124 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; 3125 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie]; 3126 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); 3127 3128 const char *last_start = strend - trie->minlen; 3129 #ifdef DEBUGGING 3130 const char *real_start = s; 3131 #endif 3132 STRLEN maxlen = trie->maxlen; 3133 SV *sv_points; 3134 U8 **points; /* map of where we were in the input string 3135 when reading a given char. For ASCII this 3136 is unnecessary overhead as the relationship 3137 is always 1:1, but for Unicode, especially 3138 case folded Unicode this is not true. */ 3139 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 3140 U8 *bitmap=NULL; 3141 3142 3143 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3144 3145 /* We can't just allocate points here. We need to wrap it in 3146 * an SV so it gets freed properly if there is a croak while 3147 * running the match */ 3148 ENTER; 3149 SAVETMPS; 3150 sv_points=newSV(maxlen * sizeof(U8 *)); 3151 SvCUR_set(sv_points, 3152 maxlen * sizeof(U8 *)); 3153 SvPOK_on(sv_points); 3154 sv_2mortal(sv_points); 3155 points=(U8**)SvPV_nolen(sv_points ); 3156 if ( trie_type != trie_utf8_fold 3157 && (trie->bitmap || OP(c)==AHOCORASICKC) ) 3158 { 3159 if (trie->bitmap) 3160 bitmap=(U8*)trie->bitmap; 3161 else 3162 bitmap=(U8*)ANYOF_BITMAP(c); 3163 } 3164 /* this is the Aho-Corasick algorithm modified a touch 3165 to include special handling for long "unknown char" sequences. 3166 The basic idea being that we use AC as long as we are dealing 3167 with a possible matching char, when we encounter an unknown char 3168 (and we have not encountered an accepting state) we scan forward 3169 until we find a legal starting char. 3170 AC matching is basically that of trie matching, except that when 3171 we encounter a failing transition, we fall back to the current 3172 states "fail state", and try the current char again, a process 3173 we repeat until we reach the root state, state 1, or a legal 3174 transition. If we fail on the root state then we can either 3175 terminate if we have reached an accepting state previously, or 3176 restart the entire process from the beginning if we have not. 3177 3178 */ 3179 while (s <= last_start) { 3180 const U32 uniflags = UTF8_ALLOW_DEFAULT; 3181 U8 *uc = (U8*)s; 3182 U16 charid = 0; 3183 U32 base = 1; 3184 U32 state = 1; 3185 UV uvc = 0; 3186 STRLEN len = 0; 3187 STRLEN foldlen = 0; 3188 U8 *uscan = (U8*)NULL; 3189 U8 *leftmost = NULL; 3190 #ifdef DEBUGGING 3191 U32 accepted_word= 0; 3192 #endif 3193 U32 pointpos = 0; 3194 3195 while ( state && uc <= (U8*)strend ) { 3196 int failed=0; 3197 U32 word = aho->states[ state ].wordnum; 3198 3199 if( state==1 ) { 3200 if ( bitmap ) { 3201 DEBUG_TRIE_EXECUTE_r( 3202 if ( uc <= (U8*)last_start 3203 && !BITMAP_TEST(bitmap,*uc) ) 3204 { 3205 dump_exec_pos( (char *)uc, c, strend, 3206 real_start, 3207 (char *)uc, utf8_target, 0 ); 3208 Perl_re_printf( aTHX_ 3209 " Scanning for legal start char...\n"); 3210 } 3211 ); 3212 if (utf8_target) { 3213 while ( uc <= (U8*)last_start 3214 && !BITMAP_TEST(bitmap,*uc) ) 3215 { 3216 uc += UTF8SKIP(uc); 3217 } 3218 } else { 3219 while ( uc <= (U8*)last_start 3220 && ! BITMAP_TEST(bitmap,*uc) ) 3221 { 3222 uc++; 3223 } 3224 } 3225 s= (char *)uc; 3226 } 3227 if (uc >(U8*)last_start) break; 3228 } 3229 3230 if ( word ) { 3231 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) 3232 % maxlen ]; 3233 if (!leftmost || lpos < leftmost) { 3234 DEBUG_r(accepted_word=word); 3235 leftmost= lpos; 3236 } 3237 if (base==0) break; 3238 3239 } 3240 points[pointpos++ % maxlen]= uc; 3241 if (foldlen || uc < (U8*)strend) { 3242 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 3243 (U8 *) strend, uscan, len, uvc, 3244 charid, foldlen, foldbuf, 3245 uniflags); 3246 DEBUG_TRIE_EXECUTE_r({ 3247 dump_exec_pos( (char *)uc, c, strend, 3248 real_start, s, utf8_target, 0); 3249 Perl_re_printf( aTHX_ 3250 " Charid:%3u CP:%4" UVxf " ", 3251 charid, uvc); 3252 }); 3253 } 3254 else { 3255 len = 0; 3256 charid = 0; 3257 } 3258 3259 3260 do { 3261 #ifdef DEBUGGING 3262 word = aho->states[ state ].wordnum; 3263 #endif 3264 base = aho->states[ state ].trans.base; 3265 3266 DEBUG_TRIE_EXECUTE_r({ 3267 if (failed) 3268 dump_exec_pos((char *)uc, c, strend, real_start, 3269 s, utf8_target, 0 ); 3270 Perl_re_printf( aTHX_ 3271 "%sState: %4" UVxf ", word=%" UVxf, 3272 failed ? " Fail transition to " : "", 3273 (UV)state, (UV)word); 3274 }); 3275 if ( base ) { 3276 U32 tmp; 3277 I32 offset; 3278 if (charid && 3279 ( ((offset = base + charid 3280 - 1 - trie->uniquecharcount)) >= 0) 3281 && ((U32)offset < trie->lasttrans) 3282 && trie->trans[offset].check == state 3283 && (tmp=trie->trans[offset].next)) 3284 { 3285 DEBUG_TRIE_EXECUTE_r( 3286 Perl_re_printf( aTHX_ " - legal\n")); 3287 state = tmp; 3288 break; 3289 } 3290 else { 3291 DEBUG_TRIE_EXECUTE_r( 3292 Perl_re_printf( aTHX_ " - fail\n")); 3293 failed = 1; 3294 state = aho->fail[state]; 3295 } 3296 } 3297 else { 3298 /* we must be accepting here */ 3299 DEBUG_TRIE_EXECUTE_r( 3300 Perl_re_printf( aTHX_ " - accepting\n")); 3301 failed = 1; 3302 break; 3303 } 3304 } while(state); 3305 uc += len; 3306 if (failed) { 3307 if (leftmost) 3308 break; 3309 if (!state) state = 1; 3310 } 3311 } 3312 if ( aho->states[ state ].wordnum ) { 3313 U8 *lpos = points[ (pointpos 3314 - trie->wordinfo[aho->states[ state ] 3315 .wordnum].len) % maxlen ]; 3316 if (!leftmost || lpos < leftmost) { 3317 DEBUG_r(accepted_word=aho->states[ state ].wordnum); 3318 leftmost = lpos; 3319 } 3320 } 3321 if (leftmost) { 3322 s = (char*)leftmost; 3323 DEBUG_TRIE_EXECUTE_r({ 3324 Perl_re_printf( aTHX_ "Matches word #%" UVxf 3325 " at position %" IVdf ". Trying full" 3326 " pattern...\n", 3327 (UV)accepted_word, (IV)(s - real_start) 3328 ); 3329 }); 3330 if (reginfo->intuit || regtry(reginfo, &s)) { 3331 FREETMPS; 3332 LEAVE; 3333 goto got_it; 3334 } 3335 if (s < reginfo->strend) { 3336 s = HOPc(s,1); 3337 } 3338 DEBUG_TRIE_EXECUTE_r({ 3339 Perl_re_printf( aTHX_ 3340 "Pattern failed. Looking for new start" 3341 " point...\n"); 3342 }); 3343 } else { 3344 DEBUG_TRIE_EXECUTE_r( 3345 Perl_re_printf( aTHX_ "No match.\n")); 3346 break; 3347 } 3348 } 3349 FREETMPS; 3350 LEAVE; 3351 } 3352 break; 3353 3354 case EXACTFU_REQ8_t8_pb: 3355 case EXACTFUP_tb_p8: 3356 case EXACTFUP_t8_p8: 3357 case EXACTF_tb_p8: 3358 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */ 3359 case EXACTFAA_NO_TRIE_tb_p8: 3360 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8 3361 patterns */ 3362 assert(0); 3363 3364 default: 3365 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 3366 } /* End of switch on node type */ 3367 3368 return 0; 3369 3370 got_it: 3371 return s; 3372 } 3373 3374 /* set RX_SAVED_COPY, RX_SUBBEG etc. 3375 * flags have same meanings as with regexec_flags() */ 3376 3377 static void 3378 S_reg_set_capture_string(pTHX_ REGEXP * const rx, 3379 char *strbeg, 3380 char *strend, 3381 SV *sv, 3382 U32 flags, 3383 bool utf8_target) 3384 { 3385 struct regexp *const prog = ReANY(rx); 3386 3387 if (flags & REXEC_COPY_STR) { 3388 #ifdef PERL_ANY_COW 3389 if (SvCANCOW(sv)) { 3390 DEBUG_C(Perl_re_printf( aTHX_ 3391 "Copy on write: regexp capture, type %d\n", 3392 (int) SvTYPE(sv))); 3393 /* Create a new COW SV to share the match string and store 3394 * in saved_copy, unless the current COW SV in saved_copy 3395 * is valid and suitable for our purpose */ 3396 if (( prog->saved_copy 3397 && SvIsCOW(prog->saved_copy) 3398 && SvPOKp(prog->saved_copy) 3399 && SvIsCOW(sv) 3400 && SvPOKp(sv) 3401 && SvPVX(sv) == SvPVX(prog->saved_copy))) 3402 { 3403 /* just reuse saved_copy SV */ 3404 if (RXp_MATCH_COPIED(prog)) { 3405 Safefree(prog->subbeg); 3406 RXp_MATCH_COPIED_off(prog); 3407 } 3408 } 3409 else { 3410 /* create new COW SV to share string */ 3411 RXp_MATCH_COPY_FREE(prog); 3412 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); 3413 } 3414 prog->subbeg = (char *)SvPVX_const(prog->saved_copy); 3415 assert (SvPOKp(prog->saved_copy)); 3416 prog->sublen = strend - strbeg; 3417 prog->suboffset = 0; 3418 prog->subcoffset = 0; 3419 } else 3420 #endif 3421 { 3422 SSize_t min = 0; 3423 SSize_t max = strend - strbeg; 3424 SSize_t sublen; 3425 3426 if ( (flags & REXEC_COPY_SKIP_POST) 3427 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3428 && !(PL_sawampersand & SAWAMPERSAND_RIGHT) 3429 ) { /* don't copy $' part of string */ 3430 U32 n = 0; 3431 max = -1; 3432 /* calculate the right-most part of the string covered 3433 * by a capture. Due to lookahead, this may be to 3434 * the right of $&, so we have to scan all captures */ 3435 while (n <= prog->lastparen) { 3436 if (prog->offs[n].end > max) 3437 max = prog->offs[n].end; 3438 n++; 3439 } 3440 if (max == -1) 3441 max = (PL_sawampersand & SAWAMPERSAND_LEFT) 3442 ? prog->offs[0].start 3443 : 0; 3444 assert(max >= 0 && max <= strend - strbeg); 3445 } 3446 3447 if ( (flags & REXEC_COPY_SKIP_PRE) 3448 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3449 && !(PL_sawampersand & SAWAMPERSAND_LEFT) 3450 ) { /* don't copy $` part of string */ 3451 U32 n = 0; 3452 min = max; 3453 /* calculate the left-most part of the string covered 3454 * by a capture. Due to lookbehind, this may be to 3455 * the left of $&, so we have to scan all captures */ 3456 while (min && n <= prog->lastparen) { 3457 if ( prog->offs[n].start != -1 3458 && prog->offs[n].start < min) 3459 { 3460 min = prog->offs[n].start; 3461 } 3462 n++; 3463 } 3464 if ((PL_sawampersand & SAWAMPERSAND_RIGHT) 3465 && min > prog->offs[0].end 3466 ) 3467 min = prog->offs[0].end; 3468 3469 } 3470 3471 assert(min >= 0 && min <= max && min <= strend - strbeg); 3472 sublen = max - min; 3473 3474 if (RXp_MATCH_COPIED(prog)) { 3475 if (sublen > prog->sublen) 3476 prog->subbeg = 3477 (char*)saferealloc(prog->subbeg, sublen+1); 3478 } 3479 else 3480 prog->subbeg = (char*)safemalloc(sublen+1); 3481 Copy(strbeg + min, prog->subbeg, sublen, char); 3482 prog->subbeg[sublen] = '\0'; 3483 prog->suboffset = min; 3484 prog->sublen = sublen; 3485 RXp_MATCH_COPIED_on(prog); 3486 } 3487 prog->subcoffset = prog->suboffset; 3488 if (prog->suboffset && utf8_target) { 3489 /* Convert byte offset to chars. 3490 * XXX ideally should only compute this if @-/@+ 3491 * has been seen, a la PL_sawampersand ??? */ 3492 3493 /* If there's a direct correspondence between the 3494 * string which we're matching and the original SV, 3495 * then we can use the utf8 len cache associated with 3496 * the SV. In particular, it means that under //g, 3497 * sv_pos_b2u() will use the previously cached 3498 * position to speed up working out the new length of 3499 * subcoffset, rather than counting from the start of 3500 * the string each time. This stops 3501 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; 3502 * from going quadratic */ 3503 if (SvPOKp(sv) && SvPVX(sv) == strbeg) 3504 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, 3505 SV_GMAGIC|SV_CONST_RETURN); 3506 else 3507 prog->subcoffset = utf8_length((U8*)strbeg, 3508 (U8*)(strbeg+prog->suboffset)); 3509 } 3510 } 3511 else { 3512 RXp_MATCH_COPY_FREE(prog); 3513 prog->subbeg = strbeg; 3514 prog->suboffset = 0; 3515 prog->subcoffset = 0; 3516 prog->sublen = strend - strbeg; 3517 } 3518 } 3519 3520 3521 3522 3523 /* 3524 - regexec_flags - match a regexp against a string 3525 */ 3526 I32 3527 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, 3528 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) 3529 /* stringarg: the point in the string at which to begin matching */ 3530 /* strend: pointer to null at end of string */ 3531 /* strbeg: real beginning of string */ 3532 /* minend: end of match must be >= minend bytes after stringarg. */ 3533 /* sv: SV being matched: only used for utf8 flag, pos() etc; string 3534 * itself is accessed via the pointers above */ 3535 /* data: May be used for some additional optimizations. 3536 Currently unused. */ 3537 /* flags: For optimizations. See REXEC_* in regexp.h */ 3538 3539 { 3540 struct regexp *const prog = ReANY(rx); 3541 char *s; 3542 regnode *c; 3543 char *startpos; 3544 SSize_t minlen; /* must match at least this many chars */ 3545 SSize_t dontbother = 0; /* how many characters not to try at end */ 3546 const bool utf8_target = cBOOL(DO_UTF8(sv)); 3547 I32 multiline; 3548 RXi_GET_DECL(prog,progi); 3549 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ 3550 regmatch_info *const reginfo = ®info_buf; 3551 regexp_paren_pair *swap = NULL; 3552 I32 oldsave; 3553 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3554 3555 PERL_ARGS_ASSERT_REGEXEC_FLAGS; 3556 PERL_UNUSED_ARG(data); 3557 3558 /* Be paranoid... */ 3559 if (prog == NULL) { 3560 Perl_croak(aTHX_ "NULL regexp parameter"); 3561 } 3562 3563 DEBUG_EXECUTE_r( 3564 debug_start_match(rx, utf8_target, stringarg, strend, 3565 "Matching"); 3566 ); 3567 3568 startpos = stringarg; 3569 3570 /* set these early as they may be used by the HOP macros below */ 3571 reginfo->strbeg = strbeg; 3572 reginfo->strend = strend; 3573 reginfo->is_utf8_target = cBOOL(utf8_target); 3574 3575 if (prog->intflags & PREGf_GPOS_SEEN) { 3576 MAGIC *mg; 3577 3578 /* set reginfo->ganch, the position where \G can match */ 3579 3580 reginfo->ganch = 3581 (flags & REXEC_IGNOREPOS) 3582 ? stringarg /* use start pos rather than pos() */ 3583 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0) 3584 /* Defined pos(): */ 3585 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) 3586 : strbeg; /* pos() not defined; use start of string */ 3587 3588 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3589 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg))); 3590 3591 /* in the presence of \G, we may need to start looking earlier in 3592 * the string than the suggested start point of stringarg: 3593 * if prog->gofs is set, then that's a known, fixed minimum 3594 * offset, such as 3595 * /..\G/: gofs = 2 3596 * /ab|c\G/: gofs = 1 3597 * or if the minimum offset isn't known, then we have to go back 3598 * to the start of the string, e.g. /w+\G/ 3599 */ 3600 3601 if (prog->intflags & PREGf_ANCH_GPOS) { 3602 if (prog->gofs) { 3603 startpos = HOPBACKc(reginfo->ganch, prog->gofs); 3604 if (!startpos || 3605 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) 3606 { 3607 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3608 "fail: ganch-gofs before earliest possible start\n")); 3609 return 0; 3610 } 3611 } 3612 else 3613 startpos = reginfo->ganch; 3614 } 3615 else if (prog->gofs) { 3616 startpos = HOPBACKc(startpos, prog->gofs); 3617 if (!startpos) 3618 startpos = strbeg; 3619 } 3620 else if (prog->intflags & PREGf_GPOS_FLOAT) 3621 startpos = strbeg; 3622 } 3623 3624 minlen = prog->minlen; 3625 if ((startpos + minlen) > strend || startpos < strbeg) { 3626 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3627 "Regex match can't succeed, so not even tried\n")); 3628 return 0; 3629 } 3630 3631 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), 3632 * which will call destuctors to reset PL_regmatch_state, free higher 3633 * PL_regmatch_slabs, and clean up regmatch_info_aux and 3634 * regmatch_info_aux_eval */ 3635 3636 oldsave = PL_savestack_ix; 3637 3638 s = startpos; 3639 3640 if ((prog->extflags & RXf_USE_INTUIT) 3641 && !(flags & REXEC_CHECKED)) 3642 { 3643 s = re_intuit_start(rx, sv, strbeg, startpos, strend, 3644 flags, NULL); 3645 if (!s) 3646 return 0; 3647 3648 if (prog->extflags & RXf_CHECK_ALL) { 3649 /* we can match based purely on the result of INTUIT. 3650 * Set up captures etc just for $& and $-[0] 3651 * (an intuit-only match wont have $1,$2,..) */ 3652 assert(!prog->nparens); 3653 3654 /* s/// doesn't like it if $& is earlier than where we asked it to 3655 * start searching (which can happen on something like /.\G/) */ 3656 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 3657 && (s < stringarg)) 3658 { 3659 /* this should only be possible under \G */ 3660 assert(prog->intflags & PREGf_GPOS_SEEN); 3661 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3662 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 3663 goto phooey; 3664 } 3665 3666 /* match via INTUIT shouldn't have any captures. 3667 * Let @-, @+, $^N know */ 3668 prog->lastparen = prog->lastcloseparen = 0; 3669 RXp_MATCH_UTF8_set(prog, utf8_target); 3670 prog->offs[0].start = s - strbeg; 3671 prog->offs[0].end = utf8_target 3672 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg 3673 : s - strbeg + prog->minlenret; 3674 if ( !(flags & REXEC_NOT_FIRST) ) 3675 S_reg_set_capture_string(aTHX_ rx, 3676 strbeg, strend, 3677 sv, flags, utf8_target); 3678 3679 return 1; 3680 } 3681 } 3682 3683 multiline = prog->extflags & RXf_PMf_MULTILINE; 3684 3685 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { 3686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3687 "String too short [regexec_flags]...\n")); 3688 goto phooey; 3689 } 3690 3691 /* Check validity of program. */ 3692 if (UCHARAT(progi->program) != REG_MAGIC) { 3693 Perl_croak(aTHX_ "corrupted regexp program"); 3694 } 3695 3696 RXp_MATCH_TAINTED_off(prog); 3697 RXp_MATCH_UTF8_set(prog, utf8_target); 3698 3699 reginfo->prog = rx; /* Yes, sorry that this is confusing. */ 3700 reginfo->intuit = 0; 3701 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 3702 reginfo->warned = FALSE; 3703 reginfo->sv = sv; 3704 reginfo->poscache_maxiter = 0; /* not yet started a countdown */ 3705 /* see how far we have to get to not match where we matched before */ 3706 reginfo->till = stringarg + minend; 3707 3708 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { 3709 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after 3710 S_cleanup_regmatch_info_aux has executed (registered by 3711 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies 3712 magic belonging to this SV. 3713 Not newSVsv, either, as it does not COW. 3714 */ 3715 reginfo->sv = newSV_type(SVt_NULL); 3716 SvSetSV_nosteal(reginfo->sv, sv); 3717 SAVEFREESV(reginfo->sv); 3718 } 3719 3720 /* reserve next 2 or 3 slots in PL_regmatch_state: 3721 * slot N+0: may currently be in use: skip it 3722 * slot N+1: use for regmatch_info_aux struct 3723 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s 3724 * slot N+3: ready for use by regmatch() 3725 */ 3726 3727 { 3728 regmatch_state *old_regmatch_state; 3729 regmatch_slab *old_regmatch_slab; 3730 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; 3731 3732 /* on first ever match, allocate first slab */ 3733 if (!PL_regmatch_slab) { 3734 Newx(PL_regmatch_slab, 1, regmatch_slab); 3735 PL_regmatch_slab->prev = NULL; 3736 PL_regmatch_slab->next = NULL; 3737 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); 3738 } 3739 3740 old_regmatch_state = PL_regmatch_state; 3741 old_regmatch_slab = PL_regmatch_slab; 3742 3743 for (i=0; i <= max; i++) { 3744 if (i == 1) 3745 reginfo->info_aux = &(PL_regmatch_state->u.info_aux); 3746 else if (i ==2) 3747 reginfo->info_aux_eval = 3748 reginfo->info_aux->info_aux_eval = 3749 &(PL_regmatch_state->u.info_aux_eval); 3750 3751 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) 3752 PL_regmatch_state = S_push_slab(aTHX); 3753 } 3754 3755 /* note initial PL_regmatch_state position; at end of match we'll 3756 * pop back to there and free any higher slabs */ 3757 3758 reginfo->info_aux->old_regmatch_state = old_regmatch_state; 3759 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; 3760 reginfo->info_aux->poscache = NULL; 3761 3762 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); 3763 3764 if ((prog->extflags & RXf_EVAL_SEEN)) 3765 S_setup_eval_state(aTHX_ reginfo); 3766 else 3767 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; 3768 } 3769 3770 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { 3771 /* We have to be careful. If the previous successful match 3772 was from this regex we don't want a subsequent partially 3773 successful match to clobber the old results. 3774 So when we detect this possibility we add a swap buffer 3775 to the re, and switch the buffer each match. If we fail, 3776 we switch it back; otherwise we leave it swapped. 3777 */ 3778 swap = prog->offs; 3779 /* avoid leak if we die, or clean up anyway if match completes */ 3780 SAVEFREEPV(swap); 3781 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); 3782 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 3783 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n", 3784 0, 3785 PTR2UV(prog), 3786 PTR2UV(swap), 3787 PTR2UV(prog->offs) 3788 )); 3789 } 3790 3791 if (prog->recurse_locinput) 3792 Zero(prog->recurse_locinput,prog->nparens + 1, char *); 3793 3794 /* Simplest case: anchored match (but not \G) need be tried only once, 3795 * or with MBOL, only at the beginning of each line. 3796 * 3797 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets 3798 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't 3799 * match at the start of the string then it won't match anywhere else 3800 * either; while with /.*.../, if it doesn't match at the beginning, 3801 * the earliest it could match is at the start of the next line */ 3802 3803 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { 3804 char *end; 3805 3806 if (regtry(reginfo, &s)) 3807 goto got_it; 3808 3809 if (!(prog->intflags & PREGf_ANCH_MBOL)) 3810 goto phooey; 3811 3812 /* didn't match at start, try at other newline positions */ 3813 3814 if (minlen) 3815 dontbother = minlen - 1; 3816 end = HOP3c(strend, -dontbother, strbeg) - 1; 3817 3818 /* skip to next newline */ 3819 3820 while (s <= end) { /* note it could be possible to match at the end of the string */ 3821 /* NB: newlines are the same in unicode as they are in latin */ 3822 if (*s++ != '\n') 3823 continue; 3824 if (prog->check_substr || prog->check_utf8) { 3825 /* note that with PREGf_IMPLICIT, intuit can only fail 3826 * or return the start position, so it's of limited utility. 3827 * Nevertheless, I made the decision that the potential for 3828 * quick fail was still worth it - DAPM */ 3829 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL); 3830 if (!s) 3831 goto phooey; 3832 } 3833 if (regtry(reginfo, &s)) 3834 goto got_it; 3835 } 3836 goto phooey; 3837 } /* end anchored search */ 3838 3839 /* anchored \G match */ 3840 if (prog->intflags & PREGf_ANCH_GPOS) 3841 { 3842 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ 3843 assert(prog->intflags & PREGf_GPOS_SEEN); 3844 /* For anchored \G, the only position it can match from is 3845 * (ganch-gofs); we already set startpos to this above; if intuit 3846 * moved us on from there, we can't possibly succeed */ 3847 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs)); 3848 if (s == startpos && regtry(reginfo, &s)) 3849 goto got_it; 3850 goto phooey; 3851 } 3852 3853 /* Messy cases: unanchored match. */ 3854 3855 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { 3856 /* we have /x+whatever/ */ 3857 /* it must be a one character string (XXXX Except is_utf8_pat?) */ 3858 char ch; 3859 #ifdef DEBUGGING 3860 int did_match = 0; 3861 #endif 3862 if (utf8_target) { 3863 if (! prog->anchored_utf8) { 3864 to_utf8_substr(prog); 3865 } 3866 ch = SvPVX_const(prog->anchored_utf8)[0]; 3867 REXEC_FBC_UTF8_SCAN( 3868 if (*s == ch) { 3869 DEBUG_EXECUTE_r( did_match = 1 ); 3870 if (regtry(reginfo, &s)) goto got_it; 3871 s += UTF8_SAFE_SKIP(s, strend); 3872 while (s < strend && *s == ch) 3873 s += UTF8SKIP(s); 3874 } 3875 ); 3876 3877 } 3878 else { 3879 if (! prog->anchored_substr) { 3880 if (! to_byte_substr(prog)) { 3881 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 3882 } 3883 } 3884 ch = SvPVX_const(prog->anchored_substr)[0]; 3885 REXEC_FBC_NON_UTF8_SCAN( 3886 if (*s == ch) { 3887 DEBUG_EXECUTE_r( did_match = 1 ); 3888 if (regtry(reginfo, &s)) goto got_it; 3889 s++; 3890 while (s < strend && *s == ch) 3891 s++; 3892 } 3893 ); 3894 } 3895 DEBUG_EXECUTE_r(if (!did_match) 3896 Perl_re_printf( aTHX_ 3897 "Did not find anchored character...\n") 3898 ); 3899 } 3900 else if (prog->anchored_substr != NULL 3901 || prog->anchored_utf8 != NULL 3902 || ((prog->float_substr != NULL || prog->float_utf8 != NULL) 3903 && prog->float_max_offset < strend - s)) { 3904 SV *must; 3905 SSize_t back_max; 3906 SSize_t back_min; 3907 char *last; 3908 char *last1; /* Last position checked before */ 3909 #ifdef DEBUGGING 3910 int did_match = 0; 3911 #endif 3912 if (prog->anchored_substr || prog->anchored_utf8) { 3913 if (utf8_target) { 3914 if (! prog->anchored_utf8) { 3915 to_utf8_substr(prog); 3916 } 3917 must = prog->anchored_utf8; 3918 } 3919 else { 3920 if (! prog->anchored_substr) { 3921 if (! to_byte_substr(prog)) { 3922 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 3923 } 3924 } 3925 must = prog->anchored_substr; 3926 } 3927 back_max = back_min = prog->anchored_offset; 3928 } else { 3929 if (utf8_target) { 3930 if (! prog->float_utf8) { 3931 to_utf8_substr(prog); 3932 } 3933 must = prog->float_utf8; 3934 } 3935 else { 3936 if (! prog->float_substr) { 3937 if (! to_byte_substr(prog)) { 3938 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 3939 } 3940 } 3941 must = prog->float_substr; 3942 } 3943 back_max = prog->float_max_offset; 3944 back_min = prog->float_min_offset; 3945 } 3946 3947 if (back_min<0) { 3948 last = strend; 3949 } else { 3950 last = HOP3c(strend, /* Cannot start after this */ 3951 -(SSize_t)(CHR_SVLEN(must) 3952 - (SvTAIL(must) != 0) + back_min), strbeg); 3953 } 3954 if (s > reginfo->strbeg) 3955 last1 = HOPc(s, -1); 3956 else 3957 last1 = s - 1; /* bogus */ 3958 3959 /* XXXX check_substr already used to find "s", can optimize if 3960 check_substr==must. */ 3961 dontbother = 0; 3962 strend = HOPc(strend, -dontbother); 3963 while ( (s <= last) && 3964 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), 3965 (unsigned char*)strend, must, 3966 multiline ? FBMrf_MULTILINE : 0)) ) { 3967 DEBUG_EXECUTE_r( did_match = 1 ); 3968 if (HOPc(s, -back_max) > last1) { 3969 last1 = HOPc(s, -back_min); 3970 s = HOPc(s, -back_max); 3971 } 3972 else { 3973 char * const t = (last1 >= reginfo->strbeg) 3974 ? HOPc(last1, 1) : last1 + 1; 3975 3976 last1 = HOPc(s, -back_min); 3977 s = t; 3978 } 3979 if (utf8_target) { 3980 while (s <= last1) { 3981 if (regtry(reginfo, &s)) 3982 goto got_it; 3983 if (s >= last1) { 3984 s++; /* to break out of outer loop */ 3985 break; 3986 } 3987 s += UTF8SKIP(s); 3988 } 3989 } 3990 else { 3991 while (s <= last1) { 3992 if (regtry(reginfo, &s)) 3993 goto got_it; 3994 s++; 3995 } 3996 } 3997 } 3998 DEBUG_EXECUTE_r(if (!did_match) { 3999 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 4000 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 4001 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n", 4002 ((must == prog->anchored_substr || must == prog->anchored_utf8) 4003 ? "anchored" : "floating"), 4004 quoted, RE_SV_TAIL(must)); 4005 }); 4006 goto phooey; 4007 } 4008 else if ( (c = progi->regstclass) ) { 4009 if (minlen) { 4010 const OPCODE op = OP(progi->regstclass); 4011 /* don't bother with what can't match */ 4012 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE) 4013 strend = HOPc(strend, -(minlen - 1)); 4014 } 4015 DEBUG_EXECUTE_r({ 4016 SV * const prop = sv_newmortal(); 4017 regprop(prog, prop, c, reginfo, NULL); 4018 { 4019 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), 4020 s,strend-s,PL_dump_re_max_len); 4021 Perl_re_printf( aTHX_ 4022 "Matching stclass %.*s against %s (%d bytes)\n", 4023 (int)SvCUR(prop), SvPVX_const(prop), 4024 quoted, (int)(strend - s)); 4025 } 4026 }); 4027 if (find_byclass(prog, c, s, strend, reginfo)) 4028 goto got_it; 4029 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n")); 4030 } 4031 else { 4032 dontbother = 0; 4033 if (prog->float_substr != NULL || prog->float_utf8 != NULL) { 4034 /* Trim the end. */ 4035 char *last= NULL; 4036 SV* float_real; 4037 STRLEN len; 4038 const char *little; 4039 4040 if (utf8_target) { 4041 if (! prog->float_utf8) { 4042 to_utf8_substr(prog); 4043 } 4044 float_real = prog->float_utf8; 4045 } 4046 else { 4047 if (! prog->float_substr) { 4048 if (! to_byte_substr(prog)) { 4049 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4050 } 4051 } 4052 float_real = prog->float_substr; 4053 } 4054 4055 little = SvPV_const(float_real, len); 4056 if (SvTAIL(float_real)) { 4057 /* This means that float_real contains an artificial \n on 4058 * the end due to the presence of something like this: 4059 * /foo$/ where we can match both "foo" and "foo\n" at the 4060 * end of the string. So we have to compare the end of the 4061 * string first against the float_real without the \n and 4062 * then against the full float_real with the string. We 4063 * have to watch out for cases where the string might be 4064 * smaller than the float_real or the float_real without 4065 * the \n. */ 4066 char *checkpos= strend - len; 4067 DEBUG_OPTIMISE_r( 4068 Perl_re_printf( aTHX_ 4069 "%sChecking for float_real.%s\n", 4070 PL_colors[4], PL_colors[5])); 4071 if (checkpos + 1 < strbeg) { 4072 /* can't match, even if we remove the trailing \n 4073 * string is too short to match */ 4074 DEBUG_EXECUTE_r( 4075 Perl_re_printf( aTHX_ 4076 "%sString shorter than required trailing substring, cannot match.%s\n", 4077 PL_colors[4], PL_colors[5])); 4078 goto phooey; 4079 } else if (memEQ(checkpos + 1, little, len - 1)) { 4080 /* can match, the end of the string matches without the 4081 * "\n" */ 4082 last = checkpos + 1; 4083 } else if (checkpos < strbeg) { 4084 /* cant match, string is too short when the "\n" is 4085 * included */ 4086 DEBUG_EXECUTE_r( 4087 Perl_re_printf( aTHX_ 4088 "%sString does not contain required trailing substring, cannot match.%s\n", 4089 PL_colors[4], PL_colors[5])); 4090 goto phooey; 4091 } else if (!multiline) { 4092 /* non multiline match, so compare with the "\n" at the 4093 * end of the string */ 4094 if (memEQ(checkpos, little, len)) { 4095 last= checkpos; 4096 } else { 4097 DEBUG_EXECUTE_r( 4098 Perl_re_printf( aTHX_ 4099 "%sString does not contain required trailing substring, cannot match.%s\n", 4100 PL_colors[4], PL_colors[5])); 4101 goto phooey; 4102 } 4103 } else { 4104 /* multiline match, so we have to search for a place 4105 * where the full string is located */ 4106 goto find_last; 4107 } 4108 } else { 4109 find_last: 4110 if (len) 4111 last = rninstr(s, strend, little, little + len); 4112 else 4113 last = strend; /* matching "$" */ 4114 } 4115 if (!last) { 4116 /* at one point this block contained a comment which was 4117 * probably incorrect, which said that this was a "should not 4118 * happen" case. Even if it was true when it was written I am 4119 * pretty sure it is not anymore, so I have removed the comment 4120 * and replaced it with this one. Yves */ 4121 DEBUG_EXECUTE_r( 4122 Perl_re_printf( aTHX_ 4123 "%sString does not contain required substring, cannot match.%s\n", 4124 PL_colors[4], PL_colors[5] 4125 )); 4126 goto phooey; 4127 } 4128 dontbother = strend - last + prog->float_min_offset; 4129 } 4130 if (minlen && (dontbother < minlen)) 4131 dontbother = minlen - 1; 4132 strend -= dontbother; /* this one's always in bytes! */ 4133 /* We don't know much -- general case. */ 4134 if (utf8_target) { 4135 for (;;) { 4136 if (regtry(reginfo, &s)) 4137 goto got_it; 4138 if (s >= strend) 4139 break; 4140 s += UTF8SKIP(s); 4141 }; 4142 } 4143 else { 4144 do { 4145 if (regtry(reginfo, &s)) 4146 goto got_it; 4147 } while (s++ < strend); 4148 } 4149 } 4150 4151 /* Failure. */ 4152 goto phooey; 4153 4154 got_it: 4155 /* s/// doesn't like it if $& is earlier than where we asked it to 4156 * start searching (which can happen on something like /.\G/) */ 4157 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 4158 && (prog->offs[0].start < stringarg - strbeg)) 4159 { 4160 /* this should only be possible under \G */ 4161 assert(prog->intflags & PREGf_GPOS_SEEN); 4162 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 4163 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 4164 goto phooey; 4165 } 4166 4167 /* clean up; this will trigger destructors that will free all slabs 4168 * above the current one, and cleanup the regmatch_info_aux 4169 * and regmatch_info_aux_eval sructs */ 4170 4171 LEAVE_SCOPE(oldsave); 4172 4173 if (RXp_PAREN_NAMES(prog)) 4174 (void)hv_iterinit(RXp_PAREN_NAMES(prog)); 4175 4176 /* make sure $`, $&, $', and $digit will work later */ 4177 if ( !(flags & REXEC_NOT_FIRST) ) 4178 S_reg_set_capture_string(aTHX_ rx, 4179 strbeg, reginfo->strend, 4180 sv, flags, utf8_target); 4181 4182 return 1; 4183 4184 phooey: 4185 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n", 4186 PL_colors[4], PL_colors[5])); 4187 4188 if (swap) { 4189 /* we failed :-( roll it back. 4190 * Since the swap buffer will be freed on scope exit which follows 4191 * shortly, restore the old captures by copying 'swap's original 4192 * data to the new offs buffer 4193 */ 4194 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 4195 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n", 4196 0, 4197 PTR2UV(prog), 4198 PTR2UV(prog->offs), 4199 PTR2UV(swap) 4200 )); 4201 4202 Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair); 4203 } 4204 4205 /* clean up; this will trigger destructors that will free all slabs 4206 * above the current one, and cleanup the regmatch_info_aux 4207 * and regmatch_info_aux_eval sructs */ 4208 4209 LEAVE_SCOPE(oldsave); 4210 4211 return 0; 4212 } 4213 4214 4215 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. 4216 * Do inc before dec, in case old and new rex are the same */ 4217 #define SET_reg_curpm(Re2) \ 4218 if (reginfo->info_aux_eval) { \ 4219 (void)ReREFCNT_inc(Re2); \ 4220 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ 4221 PM_SETRE((PL_reg_curpm), (Re2)); \ 4222 } 4223 4224 4225 /* 4226 - regtry - try match at specific point 4227 */ 4228 STATIC bool /* 0 failure, 1 success */ 4229 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) 4230 { 4231 CHECKPOINT lastcp; 4232 REGEXP *const rx = reginfo->prog; 4233 regexp *const prog = ReANY(rx); 4234 SSize_t result; 4235 #ifdef DEBUGGING 4236 U32 depth = 0; /* used by REGCP_SET */ 4237 #endif 4238 RXi_GET_DECL(prog,progi); 4239 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4240 4241 PERL_ARGS_ASSERT_REGTRY; 4242 4243 reginfo->cutpoint=NULL; 4244 4245 prog->offs[0].start = *startposp - reginfo->strbeg; 4246 prog->lastparen = 0; 4247 prog->lastcloseparen = 0; 4248 4249 /* XXXX What this code is doing here?!!! There should be no need 4250 to do this again and again, prog->lastparen should take care of 4251 this! --ilya*/ 4252 4253 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 4254 * Actually, the code in regcppop() (which Ilya may be meaning by 4255 * prog->lastparen), is not needed at all by the test suite 4256 * (op/regexp, op/pat, op/split), but that code is needed otherwise 4257 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 4258 * Meanwhile, this code *is* needed for the 4259 * above-mentioned test suite tests to succeed. The common theme 4260 * on those tests seems to be returning null fields from matches. 4261 * --jhi updated by dapm */ 4262 4263 /* After encountering a variant of the issue mentioned above I think 4264 * the point Ilya was making is that if we properly unwind whenever 4265 * we set lastparen to a smaller value then we should not need to do 4266 * this every time, only when needed. So if we have tests that fail if 4267 * we remove this, then it suggests somewhere else we are improperly 4268 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and 4269 * places it is called, and related regcp() routines. - Yves */ 4270 #if 1 4271 if (prog->nparens) { 4272 regexp_paren_pair *pp = prog->offs; 4273 I32 i; 4274 for (i = prog->nparens; i > (I32)prog->lastparen; i--) { 4275 ++pp; 4276 pp->start = -1; 4277 pp->end = -1; 4278 } 4279 } 4280 #endif 4281 REGCP_SET(lastcp); 4282 result = regmatch(reginfo, *startposp, progi->program + 1); 4283 if (result != -1) { 4284 prog->offs[0].end = result; 4285 return 1; 4286 } 4287 if (reginfo->cutpoint) 4288 *startposp= reginfo->cutpoint; 4289 REGCP_UNWIND(lastcp); 4290 return 0; 4291 } 4292 4293 /* this is used to determine how far from the left messages like 4294 'failed...' are printed in regexec.c. It should be set such that 4295 messages are inline with the regop output that created them. 4296 */ 4297 #define REPORT_CODE_OFF 29 4298 #define INDENT_CHARS(depth) ((int)(depth) % 20) 4299 #ifdef DEBUGGING 4300 int 4301 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...) 4302 { 4303 va_list ap; 4304 int result; 4305 PerlIO *f= Perl_debug_log; 4306 PERL_ARGS_ASSERT_RE_EXEC_INDENTF; 4307 va_start(ap, depth); 4308 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" ); 4309 result = PerlIO_vprintf(f, fmt, ap); 4310 va_end(ap); 4311 return result; 4312 } 4313 #endif /* DEBUGGING */ 4314 4315 /* grab a new slab and return the first slot in it */ 4316 4317 STATIC regmatch_state * 4318 S_push_slab(pTHX) 4319 { 4320 regmatch_slab *s = PL_regmatch_slab->next; 4321 if (!s) { 4322 Newx(s, 1, regmatch_slab); 4323 s->prev = PL_regmatch_slab; 4324 s->next = NULL; 4325 PL_regmatch_slab->next = s; 4326 } 4327 PL_regmatch_slab = s; 4328 return SLAB_FIRST(s); 4329 } 4330 4331 #ifdef DEBUGGING 4332 4333 STATIC void 4334 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, 4335 const char *start, const char *end, const char *blurb) 4336 { 4337 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; 4338 4339 PERL_ARGS_ASSERT_DEBUG_START_MATCH; 4340 4341 if (!PL_colorset) 4342 reginitcolors(); 4343 { 4344 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 4345 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len); 4346 4347 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), 4348 start, end - start, PL_dump_re_max_len); 4349 4350 Perl_re_printf( aTHX_ 4351 "%s%s REx%s %s against %s\n", 4352 PL_colors[4], blurb, PL_colors[5], s0, s1); 4353 4354 if (utf8_target||utf8_pat) 4355 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n", 4356 utf8_pat ? "pattern" : "", 4357 utf8_pat && utf8_target ? " and " : "", 4358 utf8_target ? "string" : "" 4359 ); 4360 } 4361 } 4362 4363 STATIC void 4364 S_dump_exec_pos(pTHX_ const char *locinput, 4365 const regnode *scan, 4366 const char *loc_regeol, 4367 const char *loc_bostr, 4368 const char *loc_reg_starttry, 4369 const bool utf8_target, 4370 const U32 depth 4371 ) 4372 { 4373 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; 4374 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 4375 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); 4376 /* The part of the string before starttry has one color 4377 (pref0_len chars), between starttry and current 4378 position another one (pref_len - pref0_len chars), 4379 after the current position the third one. 4380 We assume that pref0_len <= pref_len, otherwise we 4381 decrease pref0_len. */ 4382 int pref_len = (locinput - loc_bostr) > (5 + taill) - l 4383 ? (5 + taill) - l : locinput - loc_bostr; 4384 int pref0_len; 4385 4386 PERL_ARGS_ASSERT_DUMP_EXEC_POS; 4387 4388 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) 4389 pref_len++; 4390 pref0_len = pref_len - (locinput - loc_reg_starttry); 4391 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) 4392 l = ( loc_regeol - locinput > (5 + taill) - pref_len 4393 ? (5 + taill) - pref_len : loc_regeol - locinput); 4394 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) 4395 l--; 4396 if (pref0_len < 0) 4397 pref0_len = 0; 4398 if (pref0_len > pref_len) 4399 pref0_len = pref_len; 4400 { 4401 const int is_uni = utf8_target ? 1 : 0; 4402 4403 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), 4404 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5); 4405 4406 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), 4407 (locinput - pref_len + pref0_len), 4408 pref_len - pref0_len, PL_dump_re_max_len, 2, 3); 4409 4410 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), 4411 locinput, loc_regeol - locinput, 10, 0, 1); 4412 4413 const STRLEN tlen=len0+len1+len2; 4414 Perl_re_printf( aTHX_ 4415 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ", 4416 (IV)(locinput - loc_bostr), 4417 len0, s0, 4418 len1, s1, 4419 (docolor ? "" : "> <"), 4420 len2, s2, 4421 (int)(tlen > 19 ? 0 : 19 - tlen), 4422 "", 4423 (UV)depth); 4424 } 4425 } 4426 4427 #endif 4428 4429 /* reg_check_named_buff_matched() 4430 * Checks to see if a named buffer has matched. The data array of 4431 * buffer numbers corresponding to the buffer is expected to reside 4432 * in the regexp->data->data array in the slot stored in the ARG() of 4433 * node involved. Note that this routine doesn't actually care about the 4434 * name, that information is not preserved from compilation to execution. 4435 * Returns the index of the leftmost defined buffer with the given name 4436 * or 0 if non of the buffers matched. 4437 */ 4438 STATIC I32 4439 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) 4440 { 4441 I32 n; 4442 RXi_GET_DECL(rex,rexi); 4443 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 4444 I32 *nums=(I32*)SvPVX(sv_dat); 4445 4446 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; 4447 4448 for ( n=0; n<SvIVX(sv_dat); n++ ) { 4449 if ((I32)rex->lastparen >= nums[n] && 4450 rex->offs[nums[n]].end != -1) 4451 { 4452 return nums[n]; 4453 } 4454 } 4455 return 0; 4456 } 4457 4458 static bool 4459 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, 4460 struct next_matchable_info * m, 4461 regmatch_info *reginfo) 4462 { 4463 /* This function determines various characteristics about every possible 4464 * initial match of the passed-in EXACTish <text_node>, and stores them in 4465 * <*m>. 4466 * 4467 * That includes a match string and a parallel mask, such that if you AND 4468 * the target string with the mask and compare with the match string, 4469 * you'll have a pretty good idea, perhaps even perfect, if that portion of 4470 * the target matches or not. 4471 * 4472 * The motivation behind this function is to allow the caller to set up 4473 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where 4474 * B is an arbitrary EXACTish node. To find the end of .*, we look for the 4475 * beginning oF B, which is the passed in <text_node> That's where this 4476 * function comes in. The values it returns can quickly be used to rule 4477 * out many, or all, cases of possible matches not actually being the 4478 * beginning of B, <text_node>. It is also used in regrepeat() where we 4479 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently 4480 * determine where the span of 'A's stop. 4481 * 4482 * If <text_node> is of type EXACT, there is only one possible character 4483 * that can match its first character, and so the situation is quite 4484 * simple. But things can get much more complicated if folding is 4485 * involved. It may be that the first character of an EXACTFish node 4486 * doesn't participate in any possible fold, e.g., punctuation, so it can 4487 * be matched only by itself. The vast majority of characters that are in 4488 * folds match just two things, their lower and upper-case equivalents. 4489 * But not all are like that; some have multiple possible matches, or match 4490 * sequences of more than one character. This function sorts all that out. 4491 * 4492 * It returns information about all possibilities of what the first 4493 * character(s) of <text_node> could look like. Again, if <text_node> is a 4494 * plain EXACT node, that's just the actual first bytes of the first 4495 * character; but otherwise it is the bytes, that when masked, match all 4496 * possible combinations of all the initial bytes of all the characters 4497 * that could match, folded. (Actually, this is a slight over promise. It 4498 * handles only up to the initial 5 bytes, which is enough for all Unicode 4499 * characters, but not for all non-Unicode ones.) 4500 * 4501 * Here's an example to clarify. Suppose the first character of 4502 * <text_node> is the letter 'C', and we are under /i matching. That means 4503 * 'c' also matches. The representations of these two characters differ in 4504 * just one bit, so the mask would be a zero in that position and ones in 4505 * the other 7. And the returned string would be the AND of these two 4506 * characters, and would be one byte long, since these characters are each 4507 * a single byte. ANDing the target <text_node> with this mask will yield 4508 * the returned string if and only if <text_node> begins with one of these 4509 * two characters. So, the function would also return that the definitive 4510 * length matched is 1 byte. 4511 * 4512 * Now, suppose instead of the letter 'C', <text_node> begins with the 4513 * letter 'F'. The situation is much more complicated because there are 4514 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also 4515 * begins with 'f', and hence could match. We add these into the returned 4516 * string and mask, but the result isn't definitive; the caller has to 4517 * check further if its AND and compare pass. But the failure of that 4518 * compare will quickly rule out most possible inputs. 4519 * 4520 * Much of this could be done in regcomp.c at compile time, except for 4521 * locale-dependent, and UTF-8 target dependent data. Extra data fields 4522 * could be used for one or the other eventualities. 4523 * 4524 * If this function determines that no possible character in the target 4525 * string can match, it returns FALSE; otherwise TRUE. (The FALSE 4526 * situation occurs if the first character in <text_node> requires UTF-8 to 4527 * represent, and the target string isn't in UTF-8.) 4528 * 4529 * Some analysis is in GH #18414, located at the time of this writing at: 4530 * https://github.com/Perl/perl5/issues/18414 4531 */ 4532 4533 const bool utf8_target = reginfo->is_utf8_target; 4534 bool utf8_pat = reginfo->is_utf8_pat; 4535 4536 PERL_UINT_FAST8_T i; 4537 4538 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e 4539 */ 4540 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } }; 4541 U8 lengths[MAX_MATCHES] = { 0 }; 4542 4543 U8 index_of_longest = 0; 4544 4545 U8 *pat = (U8*)STRING(text_node); 4546 Size_t pat_len = STR_LEN(text_node); 4547 U8 op = OP(text_node); 4548 4549 U8 byte_mask[5] = {0}; 4550 U8 byte_anded[5] = {0}; 4551 4552 /* There are some folds in Unicode to multiple characters. This will hold 4553 * such characters that could fold to the beginning of 'text_node' */ 4554 UV multi_fold_from = 0; 4555 4556 /* We may have to create a modified copy of the pattern */ 4557 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' }; 4558 4559 m->max_length = 0; 4560 m->min_length = 255; 4561 m->count = 0; 4562 4563 /* Even if the first character in the node can match something in Latin1, 4564 * if there is anything in the node that can't, the match must fail */ 4565 if (! utf8_target && isEXACT_REQ8(op)) { 4566 return FALSE; 4567 } 4568 4569 /* Define a temporary op for use in this function, using an existing one that 4570 * should never be a real op during execution */ 4571 #define TURKISH PSEUDO 4572 4573 /* What to do about these two nodes had to be deferred to runtime (which is 4574 * now). If the extra information we now have so indicates, turn them into 4575 * EXACTFU nodes */ 4576 if ( (op == EXACTF && utf8_target) 4577 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 4578 { 4579 if (op == EXACTFL && PL_in_utf8_turkic_locale) { 4580 op = TURKISH; 4581 } 4582 else { 4583 op = EXACTFU; 4584 } 4585 4586 /* And certain situations are better handled if we create a modified 4587 * version of the pattern */ 4588 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the 4589 specific problematic characters */ 4590 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) { 4591 4592 /* The node could start with characters that are the first ones 4593 * of a multi-character fold. */ 4594 multi_fold_from 4595 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); 4596 if (multi_fold_from) { 4597 4598 /* Here, they do form a sequence that matches the fold of a 4599 * single character. That single character then is a 4600 * possible match. Below we will look again at this, but 4601 * the code below is expecting every character in the 4602 * pattern to be folded, which the input isn't required to 4603 * be in this case. So, just fold the single character, 4604 * and the result will be in the expected form. */ 4605 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, 4606 FOLD_FLAGS_FULL); 4607 pat = mod_pat; 4608 } 4609 /* Turkish has a couple extra possibilities. */ 4610 else if ( UNLIKELY(op == TURKISH) 4611 && pat_len >= 3 4612 && isALPHA_FOLD_EQ(pat[0], 'f') 4613 && ( memBEGINs(pat + 1, pat_len - 1, 4614 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) 4615 || ( pat_len >= 4 4616 && isALPHA_FOLD_EQ(pat[1], 'f') 4617 && memBEGINs(pat + 2, pat_len - 2, 4618 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) 4619 ))) { 4620 /* The macros for finding a multi-char fold don't include 4621 * the Turkish possibilities, in which U+130 folds to 'i'. 4622 * Hard-code these. It's very unlikely that Unicode will 4623 * ever add any others. */ 4624 if (pat[1] == 'f') { 4625 pat_len = 3; 4626 Copy("ffi", mod_pat, pat_len, U8); 4627 } 4628 else { 4629 pat_len = 2; 4630 Copy("fi", mod_pat, pat_len, U8); 4631 } 4632 pat = mod_pat; 4633 } 4634 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat) 4635 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8)) 4636 && LIKELY(memNEs(pat, pat_len, 4637 LATIN_SMALL_LETTER_SHARP_S_UTF8)) 4638 && (LIKELY(op != TURKISH || *pat != 'I'))) 4639 { 4640 /* For all cases of things between 0-255, except the ones 4641 * in the conditional above, the fold is just the lower 4642 * case, which is faster than the more general case. */ 4643 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0], 4644 pat[1])); 4645 pat_len = 1; 4646 pat = mod_pat; 4647 utf8_pat = FALSE; 4648 } 4649 else { /* Code point above 255, or needs special handling */ 4650 _to_utf8_fold_flags(pat, pat + pat_len, 4651 mod_pat, &pat_len, 4652 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE); 4653 pat = mod_pat; 4654 } 4655 } 4656 } 4657 else if /* Below is not a UTF-8 pattern; there's a somewhat different 4658 set of problematic characters */ 4659 ((multi_fold_from 4660 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len))) 4661 { 4662 /* We may have to canonicalize a multi-char fold, as in the UTF-8 4663 * case */ 4664 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, 4665 FOLD_FLAGS_FULL); 4666 pat = mod_pat; 4667 } 4668 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) { 4669 mod_pat[0] = mod_pat[1] = 's'; 4670 pat_len = 2; 4671 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant 4672 chars, and speeds copying */ 4673 pat = mod_pat; 4674 } 4675 else if (LIKELY(op != TURKISH || *pat != 'I')) { 4676 mod_pat[0] = toLOWER_L1(*pat); 4677 pat_len = 1; 4678 pat = mod_pat; 4679 } 4680 } 4681 else if /* Below isn't a node that we convert to UTF-8 */ 4682 ( utf8_target 4683 && ! utf8_pat 4684 && op == EXACTFAA_NO_TRIE 4685 && *pat == LATIN_SMALL_LETTER_SHARP_S) 4686 { 4687 /* A very special case. Folding U+DF goes to U+17F under /iaa. We 4688 * did this at compile time when the pattern was UTF-8 , but otherwise 4689 * we couldn't do it earlier, because it requires a UTF-8 target for 4690 * this match to be legal. */ 4691 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1); 4692 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 4693 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8); 4694 pat = mod_pat; 4695 utf8_pat = TRUE; 4696 } 4697 4698 /* Here, we have taken care of the initial work for a few very problematic 4699 * situations, possibly creating a modified pattern. 4700 * 4701 * Now ready for the general case. We build up all the possible things 4702 * that could match the first character of the pattern into the elements of 4703 * 'matches[]' 4704 * 4705 * Everything generally matches at least itself. But if there is a 4706 * UTF8ness mismatch, we have to convert to that of the target string. */ 4707 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */ 4708 matches[0][0] = pat[0]; 4709 lengths[0] = 1; 4710 m->count++; 4711 } 4712 else if (utf8_target) { 4713 if (utf8_pat) { 4714 lengths[0] = UTF8SKIP(pat); 4715 Copy(pat, matches[0], lengths[0], U8); 4716 m->count++; 4717 } 4718 else { /* target is UTF-8, pattern isn't */ 4719 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]); 4720 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]); 4721 lengths[0] = 2; 4722 m->count++; 4723 } 4724 } 4725 else if (! utf8_pat) { /* Neither is UTF-8 */ 4726 matches[0][0] = pat[0]; 4727 lengths[0] = 1; 4728 m->count++; 4729 } 4730 else /* target isn't UTF-8; pattern is. No match possible unless the 4731 pattern's first character can fit in a byte */ 4732 if (UTF8_IS_DOWNGRADEABLE_START(*pat)) 4733 { 4734 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]); 4735 lengths[0] = 1; 4736 m->count++; 4737 } 4738 4739 /* Here we have taken care of any necessary node-type changes */ 4740 4741 if (m->count) { 4742 m->max_length = lengths[0]; 4743 m->min_length = lengths[0]; 4744 } 4745 4746 /* For non-folding nodes, there are no other possible candidate matches, 4747 * but for foldable ones, we have to look further. */ 4748 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */ 4749 UV folded; /* The first character in the pattern, folded */ 4750 U32 first_fold_from; /* A character that folds to it */ 4751 const U32 * remaining_fold_froms; /* The remaining characters that 4752 fold to it, if any */ 4753 Size_t folds_to_count; /* The total number of characters that fold to 4754 'folded' */ 4755 4756 /* If the node begins with a sequence of more than one character that 4757 * together form the fold of a single character, it is called a 4758 * 'multi-character fold', and the normal functions don't handle this 4759 * case. We set 'multi_fold_from' to the single folded-from character, 4760 * which is handled in an extra iteration below */ 4761 if (utf8_pat) { 4762 folded = valid_utf8_to_uvchr(pat, NULL); 4763 multi_fold_from 4764 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); 4765 } 4766 else { 4767 folded = *pat; 4768 4769 /* This may generate illegal combinations for things like EXACTF, 4770 * but rather than repeat the logic and exclude them here, all such 4771 * illegalities are checked for and skipped below in the loop */ 4772 multi_fold_from 4773 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len); 4774 } 4775 4776 /* Everything matches at least itself; initialize to that because the 4777 * only the branches below that set it are the ones where the number 4778 * isn't 1. */ 4779 folds_to_count = 1; 4780 4781 /* There are a few special cases for locale-dependent nodes, where the 4782 * run-time context was needed before we could know what matched */ 4783 if (UNLIKELY(op == EXACTFL) && folded < 256) { 4784 first_fold_from = PL_fold_locale[folded]; 4785 } 4786 else if ( op == EXACTFL && utf8_target && utf8_pat 4787 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8 4788 LATIN_SMALL_LETTER_LONG_S_UTF8)) 4789 { 4790 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S; 4791 } 4792 else if (UNLIKELY( op == TURKISH 4793 && ( isALPHA_FOLD_EQ(folded, 'i') 4794 || inRANGE(folded, 4795 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE, 4796 LATIN_SMALL_LETTER_DOTLESS_I)))) 4797 { /* Turkish folding requires special handling */ 4798 if (folded == 'i') 4799 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; 4800 else if (folded == 'I') 4801 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I; 4802 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) 4803 first_fold_from = 'i'; 4804 else first_fold_from = 'I'; 4805 } 4806 else { 4807 /* Here, isn't a special case: use the generic function to 4808 * calculate what folds to this */ 4809 redo_multi: 4810 /* Look up what code points (besides itself) fold to 'folded'; 4811 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */ 4812 folds_to_count = _inverse_folds(folded, &first_fold_from, 4813 &remaining_fold_froms); 4814 } 4815 4816 /* Add each character that folds to 'folded' to the list of them, 4817 * subject to limitations based on the node type and target UTF8ness. 4818 * If there was a character that folded to multiple characters, do an 4819 * extra iteration for it. (Note the extra iteration if there is a 4820 * multi-character fold) */ 4821 for (i = 0; i < folds_to_count 4822 + UNLIKELY(multi_fold_from != 0); i++) 4823 { 4824 UV fold_from = 0; 4825 4826 if (i >= folds_to_count) { /* Final iteration: handle the 4827 multi-char */ 4828 fold_from = multi_fold_from; 4829 } 4830 else if (i == 0) { 4831 fold_from = first_fold_from; 4832 } 4833 else if (i < folds_to_count) { 4834 fold_from = remaining_fold_froms[i-1]; 4835 } 4836 4837 if (folded == fold_from) { /* We already added the character 4838 itself */ 4839 continue; 4840 } 4841 4842 /* EXACTF doesn't have any non-ascii folds */ 4843 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) { 4844 continue; 4845 } 4846 4847 /* In /iaa nodes, neither or both must be ASCII to be a legal fold 4848 * */ 4849 if ( isASCII(folded) != isASCII(fold_from) 4850 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE)) 4851 4852 { 4853 continue; 4854 } 4855 4856 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8 4857 * locale, but those have been converted to EXACTFU above) */ 4858 if ( op == EXACTFL 4859 && (folded < 256) != (fold_from < 256)) 4860 { 4861 continue; 4862 } 4863 4864 /* If this triggers, it likely is because of the unlikely case 4865 * where a new Unicode standard has changed what MAX_MATCHES should 4866 * be set to */ 4867 assert(m->count < MAX_MATCHES); 4868 4869 /* Add this character to the list of possible matches */ 4870 if (utf8_target) { 4871 uvchr_to_utf8(matches[m->count], fold_from); 4872 lengths[m->count] = UVCHR_SKIP(fold_from); 4873 m->count++; 4874 } 4875 else { /* Non-UTF8 target: no code point above 255 can appear in it 4876 */ 4877 if (fold_from > 255) { 4878 continue; 4879 } 4880 4881 matches[m->count][0] = fold_from; 4882 lengths[m->count] = 1; 4883 m->count++; 4884 } 4885 4886 /* Update min and mlengths */ 4887 if (m->min_length > lengths[m->count-1]) { 4888 m->min_length = lengths[m->count-1]; 4889 } 4890 4891 if (m->max_length < lengths[m->count-1]) { 4892 index_of_longest = m->count - 1; 4893 m->max_length = lengths[index_of_longest]; 4894 } 4895 } /* looped through each potential fold */ 4896 4897 /* If there is something that folded to an initial multi-character 4898 * fold, repeat, using it. This catches some edge cases. An example 4899 * of one is /ss/i when UTF-8 encoded. The function 4900 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF 4901 * (LATIN SMALL SHARP S). If it returned a list of characters, this 4902 * code wouldn't be needed. But since it doesn't, we have to look what 4903 * folds to the U+DF. In this case, U+1E9E does, and has to be added. 4904 * */ 4905 if (multi_fold_from) { 4906 folded = multi_fold_from; 4907 multi_fold_from = 0; 4908 goto redo_multi; 4909 } 4910 } /* End of finding things that participate in this fold */ 4911 4912 if (m->count == 0) { /* If nothing found, can't match */ 4913 m->min_length = 0; 4914 return FALSE; 4915 } 4916 4917 /* Have calculated all possible matches. Now calculate the mask and AND 4918 * values */ 4919 m->initial_exact = 0; 4920 m->initial_definitive = 0; 4921 4922 { 4923 unsigned int mask_ones = 0; 4924 unsigned int possible_ones = 0; 4925 U8 j; 4926 4927 /* For each byte that is in all possible matches ... */ 4928 for (j = 0; j < MIN(m->min_length, 5); j++) { 4929 4930 /* Initialize the accumulator for this byte */ 4931 byte_mask[j] = 0xFF; 4932 byte_anded[j] = matches[0][j]; 4933 4934 /* Then the rest of the rows (folds). The mask is based on, like, 4935 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0 4936 * where they differ. */ 4937 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) { 4938 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]); 4939 byte_anded[j] &= matches[i][j]; 4940 } 4941 4942 /* Keep track of the number of initial mask bytes that are all one 4943 * bits. The code calling this can use this number to know that 4944 * a string that matches this number of bytes in the pattern is an 4945 * exact match of that pattern for this number of bytes. But also 4946 * counted are the number of initial bytes that in total have a 4947 * single zero bit. If a string matches those, masked, it must be 4948 * one of two possibilites, both of which this function has 4949 * determined are legal. (But if that single 0 is one of the 4950 * initial bits for masking a UTF-8 start byte, that could 4951 * incorrectly lead to different length strings appearing to be 4952 * equivalent, so only do this optimization when the matchables are 4953 * all the same length. This was uncovered by testing 4954 * /\x{029E}/i.) */ 4955 if (m->min_length == m->max_length) { 4956 mask_ones += PL_bitcount[byte_mask[j]]; 4957 possible_ones += 8; 4958 if (mask_ones + 1 >= possible_ones) { 4959 m->initial_definitive++; 4960 if (mask_ones >= possible_ones) { 4961 m->initial_exact++; 4962 } 4963 } 4964 } 4965 } 4966 } 4967 4968 /* The first byte is separate for speed */ 4969 m->first_byte_mask = byte_mask[0]; 4970 m->first_byte_anded = byte_anded[0]; 4971 4972 /* Then pack up to the next 4 bytes into a word */ 4973 m->mask32 = m->anded32 = 0; 4974 for (i = 1; i < MIN(m->min_length, 5); i++) { 4975 U8 which = i; 4976 U8 shift = (which - 1) * 8; 4977 m->mask32 |= (U32) byte_mask[i] << shift; 4978 m->anded32 |= (U32) byte_anded[i] << shift; 4979 } 4980 4981 /* Finally, take the match strings and place them sequentially into a 4982 * one-dimensional array. (This is done to save significant space in the 4983 * structure.) Sort so the longest (presumably the least likely) is last. 4984 * XXX When this gets moved to regcomp, may want to fully sort shortest 4985 * first, but above we generally used the folded code point first, and 4986 * those tend to be no longer than their upper case values, so this is 4987 * already pretty well sorted by size. 4988 * 4989 * If the asserts fail, it's most likely because a new version of the 4990 * Unicode standard requires more space; simply increase the declaration 4991 * size. */ 4992 { 4993 U8 cur_pos = 0; 4994 U8 output_index = 0; 4995 4996 if (m->count > 1) { /* No need to sort a single entry */ 4997 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) { 4998 4999 /* Keep the same order for all but the longest. (If the 5000 * asserts fail, it could be because m->matches is declared too 5001 * short, either because of a new Unicode release, or an 5002 * overlooked test case, or it could be a bug.) */ 5003 if (i != index_of_longest) { 5004 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches)); 5005 Copy(matches[i], m->matches + cur_pos, lengths[i], U8); 5006 cur_pos += lengths[i]; 5007 m->lengths[output_index++] = lengths[i]; 5008 } 5009 } 5010 } 5011 5012 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches)); 5013 Copy(matches[index_of_longest], m->matches + cur_pos, 5014 lengths[index_of_longest], U8); 5015 5016 /* Place the longest match last */ 5017 m->lengths[output_index] = lengths[index_of_longest]; 5018 } 5019 5020 5021 return TRUE; 5022 } 5023 5024 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */ 5025 bool 5026 S_test_EXACTISH_ST(const char * loc, 5027 struct next_matchable_info info) 5028 { 5029 /* This function uses the data set up in setup_EXACTISH_ST() to see if the 5030 * bytes starting at 'loc' can match based on 'next_matchable_info' */ 5031 5032 U32 input32 = 0; 5033 5034 /* Check the first byte */ 5035 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded) 5036 return FALSE; 5037 5038 /* Pack the next up-to-4 bytes into a 32 bit word */ 5039 switch (info.min_length) { 5040 default: 5041 input32 |= (U32) ((U8) loc[4]) << 3 * 8; 5042 /* FALLTHROUGH */ 5043 case 4: 5044 input32 |= (U8) loc[3] << 2 * 8; 5045 /* FALLTHROUGH */ 5046 case 3: 5047 input32 |= (U8) loc[2] << 1 * 8; 5048 /* FALLTHROUGH */ 5049 case 2: 5050 input32 |= (U8) loc[1]; 5051 break; 5052 case 1: 5053 return TRUE; /* We already tested and passed the 0th byte */ 5054 case 0: 5055 ASSUME(0); 5056 } 5057 5058 /* And AND that with the mask and compare that with the assembled ANDED 5059 * values */ 5060 return (input32 & info.mask32) == info.anded32; 5061 } 5062 5063 STATIC bool 5064 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target) 5065 { 5066 /* returns a boolean indicating if there is a Grapheme Cluster Boundary 5067 * between the inputs. See https://www.unicode.org/reports/tr29/. */ 5068 5069 PERL_ARGS_ASSERT_ISGCB; 5070 5071 switch (GCB_table[before][after]) { 5072 case GCB_BREAKABLE: 5073 return TRUE; 5074 5075 case GCB_NOBREAK: 5076 return FALSE; 5077 5078 case GCB_RI_then_RI: 5079 { 5080 int RI_count = 1; 5081 U8 * temp_pos = (U8 *) curpos; 5082 5083 /* Do not break within emoji flag sequences. That is, do not 5084 * break between regional indicator (RI) symbols if there is an 5085 * odd number of RI characters before the break point. 5086 * GB12 sot (RI RI)* RI × RI 5087 * GB13 [^RI] (RI RI)* RI × RI */ 5088 5089 while (backup_one_GCB(strbeg, 5090 &temp_pos, 5091 utf8_target) == GCB_Regional_Indicator) 5092 { 5093 RI_count++; 5094 } 5095 5096 return RI_count % 2 != 1; 5097 } 5098 5099 case GCB_EX_then_EM: 5100 5101 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */ 5102 { 5103 U8 * temp_pos = (U8 *) curpos; 5104 GCB_enum prev; 5105 5106 do { 5107 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 5108 } 5109 while (prev == GCB_Extend); 5110 5111 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ; 5112 } 5113 5114 case GCB_Maybe_Emoji_NonBreak: 5115 5116 { 5117 5118 /* Do not break within emoji modifier sequences or emoji zwj sequences. 5119 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic} 5120 */ 5121 U8 * temp_pos = (U8 *) curpos; 5122 GCB_enum prev; 5123 5124 do { 5125 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 5126 } 5127 while (prev == GCB_Extend); 5128 5129 return prev != GCB_ExtPict_XX; 5130 } 5131 5132 default: 5133 break; 5134 } 5135 5136 #ifdef DEBUGGING 5137 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n", 5138 before, after, GCB_table[before][after]); 5139 assert(0); 5140 #endif 5141 return TRUE; 5142 } 5143 5144 STATIC GCB_enum 5145 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5146 { 5147 GCB_enum gcb; 5148 5149 PERL_ARGS_ASSERT_BACKUP_ONE_GCB; 5150 5151 if (*curpos < strbeg) { 5152 return GCB_EDGE; 5153 } 5154 5155 if (utf8_target) { 5156 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5157 U8 * prev_prev_char_pos; 5158 5159 if (! prev_char_pos) { 5160 return GCB_EDGE; 5161 } 5162 5163 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 5164 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5165 *curpos = prev_char_pos; 5166 prev_char_pos = prev_prev_char_pos; 5167 } 5168 else { 5169 *curpos = (U8 *) strbeg; 5170 return GCB_EDGE; 5171 } 5172 } 5173 else { 5174 if (*curpos - 2 < strbeg) { 5175 *curpos = (U8 *) strbeg; 5176 return GCB_EDGE; 5177 } 5178 (*curpos)--; 5179 gcb = getGCB_VAL_CP(*(*curpos - 1)); 5180 } 5181 5182 return gcb; 5183 } 5184 5185 /* Combining marks attach to most classes that precede them, but this defines 5186 * the exceptions (from TR14) */ 5187 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \ 5188 || prev == LB_Mandatory_Break \ 5189 || prev == LB_Carriage_Return \ 5190 || prev == LB_Line_Feed \ 5191 || prev == LB_Next_Line \ 5192 || prev == LB_Space \ 5193 || prev == LB_ZWSpace)) 5194 5195 STATIC bool 5196 S_isLB(pTHX_ LB_enum before, 5197 LB_enum after, 5198 const U8 * const strbeg, 5199 const U8 * const curpos, 5200 const U8 * const strend, 5201 const bool utf8_target) 5202 { 5203 U8 * temp_pos = (U8 *) curpos; 5204 LB_enum prev = before; 5205 5206 /* Is the boundary between 'before' and 'after' line-breakable? 5207 * Most of this is just a table lookup of a generated table from Unicode 5208 * rules. But some rules require context to decide, and so have to be 5209 * implemented in code */ 5210 5211 PERL_ARGS_ASSERT_ISLB; 5212 5213 /* Rule numbers in the comments below are as of Unicode 9.0 */ 5214 5215 redo: 5216 before = prev; 5217 switch (LB_table[before][after]) { 5218 case LB_BREAKABLE: 5219 return TRUE; 5220 5221 case LB_NOBREAK: 5222 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 5223 return FALSE; 5224 5225 case LB_SP_foo + LB_BREAKABLE: 5226 case LB_SP_foo + LB_NOBREAK: 5227 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 5228 5229 /* When we have something following a SP, we have to look at the 5230 * context in order to know what to do. 5231 * 5232 * SP SP should not reach here because LB7: Do not break before 5233 * spaces. (For two spaces in a row there is nothing that 5234 * overrides that) */ 5235 assert(after != LB_Space); 5236 5237 /* Here we have a space followed by a non-space. Mostly this is a 5238 * case of LB18: "Break after spaces". But there are complications 5239 * as the handling of spaces is somewhat tricky. They are in a 5240 * number of rules, which have to be applied in priority order, but 5241 * something earlier in the string can cause a rule to be skipped 5242 * and a lower priority rule invoked. A prime example is LB7 which 5243 * says don't break before a space. But rule LB8 (lower priority) 5244 * says that the first break opportunity after a ZW is after any 5245 * span of spaces immediately after it. If a ZW comes before a SP 5246 * in the input, rule LB8 applies, and not LB7. Other such rules 5247 * involve combining marks which are rules 9 and 10, but they may 5248 * override higher priority rules if they come earlier in the 5249 * string. Since we're doing random access into the middle of the 5250 * string, we have to look for rules that should get applied based 5251 * on both string position and priority. Combining marks do not 5252 * attach to either ZW nor SP, so we don't have to consider them 5253 * until later. 5254 * 5255 * To check for LB8, we have to find the first non-space character 5256 * before this span of spaces */ 5257 do { 5258 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5259 } 5260 while (prev == LB_Space); 5261 5262 /* LB8 Break before any character following a zero-width space, 5263 * even if one or more spaces intervene. 5264 * ZW SP* ÷ 5265 * So if we have a ZW just before this span, and to get here this 5266 * is the final space in the span. */ 5267 if (prev == LB_ZWSpace) { 5268 return TRUE; 5269 } 5270 5271 /* Here, not ZW SP+. There are several rules that have higher 5272 * priority than LB18 and can be resolved now, as they don't depend 5273 * on anything earlier in the string (except ZW, which we have 5274 * already handled). One of these rules is LB11 Do not break 5275 * before Word joiner, but we have specially encoded that in the 5276 * lookup table so it is caught by the single test below which 5277 * catches the other ones. */ 5278 if (LB_table[LB_Space][after] - LB_SP_foo 5279 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN) 5280 { 5281 return FALSE; 5282 } 5283 5284 /* If we get here, we have to XXX consider combining marks. */ 5285 if (prev == LB_Combining_Mark) { 5286 5287 /* What happens with these depends on the character they 5288 * follow. */ 5289 do { 5290 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5291 } 5292 while (prev == LB_Combining_Mark); 5293 5294 /* Most times these attach to and inherit the characteristics 5295 * of that character, but not always, and when not, they are to 5296 * be treated as AL by rule LB10. */ 5297 if (! LB_CM_ATTACHES_TO(prev)) { 5298 prev = LB_Alphabetic; 5299 } 5300 } 5301 5302 /* Here, we have the character preceding the span of spaces all set 5303 * up. We follow LB18: "Break after spaces" unless the table shows 5304 * that is overriden */ 5305 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN; 5306 5307 case LB_CM_ZWJ_foo: 5308 5309 /* We don't know how to treat the CM except by looking at the first 5310 * non-CM character preceding it. ZWJ is treated as CM */ 5311 do { 5312 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5313 } 5314 while (prev == LB_Combining_Mark || prev == LB_ZWJ); 5315 5316 /* Here, 'prev' is that first earlier non-CM character. If the CM 5317 * attatches to it, then it inherits the behavior of 'prev'. If it 5318 * doesn't attach, it is to be treated as an AL */ 5319 if (! LB_CM_ATTACHES_TO(prev)) { 5320 prev = LB_Alphabetic; 5321 } 5322 5323 goto redo; 5324 5325 case LB_HY_or_BA_then_foo + LB_BREAKABLE: 5326 case LB_HY_or_BA_then_foo + LB_NOBREAK: 5327 5328 /* LB21a Don't break after Hebrew + Hyphen. 5329 * HL (HY | BA) × */ 5330 5331 if (backup_one_LB(strbeg, &temp_pos, utf8_target) 5332 == LB_Hebrew_Letter) 5333 { 5334 return FALSE; 5335 } 5336 5337 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE; 5338 5339 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE: 5340 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK: 5341 5342 /* LB25a (PR | PO) × ( OP | HY )? NU */ 5343 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) { 5344 return FALSE; 5345 } 5346 5347 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY 5348 == LB_BREAKABLE; 5349 5350 case LB_SY_or_IS_then_various + LB_BREAKABLE: 5351 case LB_SY_or_IS_then_various + LB_NOBREAK: 5352 { 5353 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */ 5354 5355 LB_enum temp = prev; 5356 do { 5357 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5358 } 5359 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric); 5360 if (temp == LB_Numeric) { 5361 return FALSE; 5362 } 5363 5364 return LB_table[prev][after] - LB_SY_or_IS_then_various 5365 == LB_BREAKABLE; 5366 } 5367 5368 case LB_various_then_PO_or_PR + LB_BREAKABLE: 5369 case LB_various_then_PO_or_PR + LB_NOBREAK: 5370 { 5371 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */ 5372 5373 LB_enum temp = prev; 5374 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis) 5375 { 5376 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5377 } 5378 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) { 5379 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5380 } 5381 if (temp == LB_Numeric) { 5382 return FALSE; 5383 } 5384 return LB_various_then_PO_or_PR; 5385 } 5386 5387 case LB_RI_then_RI + LB_NOBREAK: 5388 case LB_RI_then_RI + LB_BREAKABLE: 5389 { 5390 int RI_count = 1; 5391 5392 /* LB30a Break between two regional indicator symbols if and 5393 * only if there are an even number of regional indicators 5394 * preceding the position of the break. 5395 * 5396 * sot (RI RI)* RI × RI 5397 * [^RI] (RI RI)* RI × RI */ 5398 5399 while (backup_one_LB(strbeg, 5400 &temp_pos, 5401 utf8_target) == LB_Regional_Indicator) 5402 { 5403 RI_count++; 5404 } 5405 5406 return RI_count % 2 == 0; 5407 } 5408 5409 default: 5410 break; 5411 } 5412 5413 #ifdef DEBUGGING 5414 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n", 5415 before, after, LB_table[before][after]); 5416 assert(0); 5417 #endif 5418 return TRUE; 5419 } 5420 5421 STATIC LB_enum 5422 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 5423 { 5424 5425 LB_enum lb; 5426 5427 PERL_ARGS_ASSERT_ADVANCE_ONE_LB; 5428 5429 if (*curpos >= strend) { 5430 return LB_EDGE; 5431 } 5432 5433 if (utf8_target) { 5434 *curpos += UTF8SKIP(*curpos); 5435 if (*curpos >= strend) { 5436 return LB_EDGE; 5437 } 5438 lb = getLB_VAL_UTF8(*curpos, strend); 5439 } 5440 else { 5441 (*curpos)++; 5442 if (*curpos >= strend) { 5443 return LB_EDGE; 5444 } 5445 lb = getLB_VAL_CP(**curpos); 5446 } 5447 5448 return lb; 5449 } 5450 5451 STATIC LB_enum 5452 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5453 { 5454 LB_enum lb; 5455 5456 PERL_ARGS_ASSERT_BACKUP_ONE_LB; 5457 5458 if (*curpos < strbeg) { 5459 return LB_EDGE; 5460 } 5461 5462 if (utf8_target) { 5463 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5464 U8 * prev_prev_char_pos; 5465 5466 if (! prev_char_pos) { 5467 return LB_EDGE; 5468 } 5469 5470 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 5471 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5472 *curpos = prev_char_pos; 5473 prev_char_pos = prev_prev_char_pos; 5474 } 5475 else { 5476 *curpos = (U8 *) strbeg; 5477 return LB_EDGE; 5478 } 5479 } 5480 else { 5481 if (*curpos - 2 < strbeg) { 5482 *curpos = (U8 *) strbeg; 5483 return LB_EDGE; 5484 } 5485 (*curpos)--; 5486 lb = getLB_VAL_CP(*(*curpos - 1)); 5487 } 5488 5489 return lb; 5490 } 5491 5492 STATIC bool 5493 S_isSB(pTHX_ SB_enum before, 5494 SB_enum after, 5495 const U8 * const strbeg, 5496 const U8 * const curpos, 5497 const U8 * const strend, 5498 const bool utf8_target) 5499 { 5500 /* returns a boolean indicating if there is a Sentence Boundary Break 5501 * between the inputs. See https://www.unicode.org/reports/tr29/ */ 5502 5503 U8 * lpos = (U8 *) curpos; 5504 bool has_para_sep = FALSE; 5505 bool has_sp = FALSE; 5506 5507 PERL_ARGS_ASSERT_ISSB; 5508 5509 /* Break at the start and end of text. 5510 SB1. sot ÷ 5511 SB2. ÷ eot 5512 But unstated in Unicode is don't break if the text is empty */ 5513 if (before == SB_EDGE || after == SB_EDGE) { 5514 return before != after; 5515 } 5516 5517 /* SB 3: Do not break within CRLF. */ 5518 if (before == SB_CR && after == SB_LF) { 5519 return FALSE; 5520 } 5521 5522 /* Break after paragraph separators. CR and LF are considered 5523 * so because Unicode views text as like word processing text where there 5524 * are no newlines except between paragraphs, and the word processor takes 5525 * care of wrapping without there being hard line-breaks in the text *./ 5526 SB4. Sep | CR | LF ÷ */ 5527 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 5528 return TRUE; 5529 } 5530 5531 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF. 5532 * (See Section 6.2, Replacing Ignore Rules.) 5533 SB5. X (Extend | Format)* → X */ 5534 if (after == SB_Extend || after == SB_Format) { 5535 5536 /* Implied is that the these characters attach to everything 5537 * immediately prior to them except for those separator-type 5538 * characters. And the rules earlier have already handled the case 5539 * when one of those immediately precedes the extend char */ 5540 return FALSE; 5541 } 5542 5543 if (before == SB_Extend || before == SB_Format) { 5544 U8 * temp_pos = lpos; 5545 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5546 if ( backup != SB_EDGE 5547 && backup != SB_Sep 5548 && backup != SB_CR 5549 && backup != SB_LF) 5550 { 5551 before = backup; 5552 lpos = temp_pos; 5553 } 5554 5555 /* Here, both 'before' and 'backup' are these types; implied is that we 5556 * don't break between them */ 5557 if (backup == SB_Extend || backup == SB_Format) { 5558 return FALSE; 5559 } 5560 } 5561 5562 /* Do not break after ambiguous terminators like period, if they are 5563 * immediately followed by a number or lowercase letter, if they are 5564 * between uppercase letters, if the first following letter (optionally 5565 * after certain punctuation) is lowercase, or if they are followed by 5566 * "continuation" punctuation such as comma, colon, or semicolon. For 5567 * example, a period may be an abbreviation or numeric period, and thus may 5568 * not mark the end of a sentence. 5569 5570 * SB6. ATerm × Numeric */ 5571 if (before == SB_ATerm && after == SB_Numeric) { 5572 return FALSE; 5573 } 5574 5575 /* SB7. (Upper | Lower) ATerm × Upper */ 5576 if (before == SB_ATerm && after == SB_Upper) { 5577 U8 * temp_pos = lpos; 5578 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5579 if (backup == SB_Upper || backup == SB_Lower) { 5580 return FALSE; 5581 } 5582 } 5583 5584 /* The remaining rules that aren't the final one, all require an STerm or 5585 * an ATerm after having backed up over some Close* Sp*, and in one case an 5586 * optional Paragraph separator, although one rule doesn't have any Sp's in it. 5587 * So do that backup now, setting flags if either Sp or a paragraph 5588 * separator are found */ 5589 5590 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 5591 has_para_sep = TRUE; 5592 before = backup_one_SB(strbeg, &lpos, utf8_target); 5593 } 5594 5595 if (before == SB_Sp) { 5596 has_sp = TRUE; 5597 do { 5598 before = backup_one_SB(strbeg, &lpos, utf8_target); 5599 } 5600 while (before == SB_Sp); 5601 } 5602 5603 while (before == SB_Close) { 5604 before = backup_one_SB(strbeg, &lpos, utf8_target); 5605 } 5606 5607 /* The next few rules apply only when the backed-up-to is an ATerm, and in 5608 * most cases an STerm */ 5609 if (before == SB_STerm || before == SB_ATerm) { 5610 5611 /* So, here the lhs matches 5612 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 5613 * and we have set flags if we found an Sp, or the optional Sep,CR,LF. 5614 * The rules that apply here are: 5615 * 5616 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR 5617 | LF | STerm | ATerm) )* Lower 5618 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm) 5619 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF) 5620 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF) 5621 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷ 5622 */ 5623 5624 /* And all but SB11 forbid having seen a paragraph separator */ 5625 if (! has_para_sep) { 5626 if (before == SB_ATerm) { /* SB8 */ 5627 U8 * rpos = (U8 *) curpos; 5628 SB_enum later = after; 5629 5630 while ( later != SB_OLetter 5631 && later != SB_Upper 5632 && later != SB_Lower 5633 && later != SB_Sep 5634 && later != SB_CR 5635 && later != SB_LF 5636 && later != SB_STerm 5637 && later != SB_ATerm 5638 && later != SB_EDGE) 5639 { 5640 later = advance_one_SB(&rpos, strend, utf8_target); 5641 } 5642 if (later == SB_Lower) { 5643 return FALSE; 5644 } 5645 } 5646 5647 if ( after == SB_SContinue /* SB8a */ 5648 || after == SB_STerm 5649 || after == SB_ATerm) 5650 { 5651 return FALSE; 5652 } 5653 5654 if (! has_sp) { /* SB9 applies only if there was no Sp* */ 5655 if ( after == SB_Close 5656 || after == SB_Sp 5657 || after == SB_Sep 5658 || after == SB_CR 5659 || after == SB_LF) 5660 { 5661 return FALSE; 5662 } 5663 } 5664 5665 /* SB10. This and SB9 could probably be combined some way, but khw 5666 * has decided to follow the Unicode rule book precisely for 5667 * simplified maintenance */ 5668 if ( after == SB_Sp 5669 || after == SB_Sep 5670 || after == SB_CR 5671 || after == SB_LF) 5672 { 5673 return FALSE; 5674 } 5675 } 5676 5677 /* SB11. */ 5678 return TRUE; 5679 } 5680 5681 /* Otherwise, do not break. 5682 SB12. Any × Any */ 5683 5684 return FALSE; 5685 } 5686 5687 STATIC SB_enum 5688 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 5689 { 5690 SB_enum sb; 5691 5692 PERL_ARGS_ASSERT_ADVANCE_ONE_SB; 5693 5694 if (*curpos >= strend) { 5695 return SB_EDGE; 5696 } 5697 5698 if (utf8_target) { 5699 do { 5700 *curpos += UTF8SKIP(*curpos); 5701 if (*curpos >= strend) { 5702 return SB_EDGE; 5703 } 5704 sb = getSB_VAL_UTF8(*curpos, strend); 5705 } while (sb == SB_Extend || sb == SB_Format); 5706 } 5707 else { 5708 do { 5709 (*curpos)++; 5710 if (*curpos >= strend) { 5711 return SB_EDGE; 5712 } 5713 sb = getSB_VAL_CP(**curpos); 5714 } while (sb == SB_Extend || sb == SB_Format); 5715 } 5716 5717 return sb; 5718 } 5719 5720 STATIC SB_enum 5721 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5722 { 5723 SB_enum sb; 5724 5725 PERL_ARGS_ASSERT_BACKUP_ONE_SB; 5726 5727 if (*curpos < strbeg) { 5728 return SB_EDGE; 5729 } 5730 5731 if (utf8_target) { 5732 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5733 if (! prev_char_pos) { 5734 return SB_EDGE; 5735 } 5736 5737 /* Back up over Extend and Format. curpos is always just to the right 5738 * of the characater whose value we are getting */ 5739 do { 5740 U8 * prev_prev_char_pos; 5741 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, 5742 strbeg))) 5743 { 5744 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5745 *curpos = prev_char_pos; 5746 prev_char_pos = prev_prev_char_pos; 5747 } 5748 else { 5749 *curpos = (U8 *) strbeg; 5750 return SB_EDGE; 5751 } 5752 } while (sb == SB_Extend || sb == SB_Format); 5753 } 5754 else { 5755 do { 5756 if (*curpos - 2 < strbeg) { 5757 *curpos = (U8 *) strbeg; 5758 return SB_EDGE; 5759 } 5760 (*curpos)--; 5761 sb = getSB_VAL_CP(*(*curpos - 1)); 5762 } while (sb == SB_Extend || sb == SB_Format); 5763 } 5764 5765 return sb; 5766 } 5767 5768 STATIC bool 5769 S_isWB(pTHX_ WB_enum previous, 5770 WB_enum before, 5771 WB_enum after, 5772 const U8 * const strbeg, 5773 const U8 * const curpos, 5774 const U8 * const strend, 5775 const bool utf8_target) 5776 { 5777 /* Return a boolean as to if the boundary between 'before' and 'after' is 5778 * a Unicode word break, using their published algorithm, but tailored for 5779 * Perl by treating spans of white space as one unit. Context may be 5780 * needed to make this determination. If the value for the character 5781 * before 'before' is known, it is passed as 'previous'; otherwise that 5782 * should be set to WB_UNKNOWN. The other input parameters give the 5783 * boundaries and current position in the matching of the string. That 5784 * is, 'curpos' marks the position where the character whose wb value is 5785 * 'after' begins. See http://www.unicode.org/reports/tr29/ */ 5786 5787 U8 * before_pos = (U8 *) curpos; 5788 U8 * after_pos = (U8 *) curpos; 5789 WB_enum prev = before; 5790 WB_enum next; 5791 5792 PERL_ARGS_ASSERT_ISWB; 5793 5794 /* Rule numbers in the comments below are as of Unicode 9.0 */ 5795 5796 redo: 5797 before = prev; 5798 switch (WB_table[before][after]) { 5799 case WB_BREAKABLE: 5800 return TRUE; 5801 5802 case WB_NOBREAK: 5803 return FALSE; 5804 5805 case WB_hs_then_hs: /* 2 horizontal spaces in a row */ 5806 next = advance_one_WB(&after_pos, strend, utf8_target, 5807 FALSE /* Don't skip Extend nor Format */ ); 5808 /* A space immediately preceding an Extend or Format is attached 5809 * to by them, and hence gets separated from previous spaces. 5810 * Otherwise don't break between horizontal white space */ 5811 return next == WB_Extend || next == WB_Format; 5812 5813 /* WB4 Ignore Format and Extend characters, except when they appear at 5814 * the beginning of a region of text. This code currently isn't 5815 * general purpose, but it works as the rules are currently and likely 5816 * to be laid out. The reason it works is that when 'they appear at 5817 * the beginning of a region of text', the rule is to break before 5818 * them, just like any other character. Therefore, the default rule 5819 * applies and we don't have to look in more depth. Should this ever 5820 * change, we would have to have 2 'case' statements, like in the rules 5821 * below, and backup a single character (not spacing over the extend 5822 * ones) and then see if that is one of the region-end characters and 5823 * go from there */ 5824 case WB_Ex_or_FO_or_ZWJ_then_foo: 5825 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 5826 goto redo; 5827 5828 case WB_DQ_then_HL + WB_BREAKABLE: 5829 case WB_DQ_then_HL + WB_NOBREAK: 5830 5831 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */ 5832 5833 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 5834 == WB_Hebrew_Letter) 5835 { 5836 return FALSE; 5837 } 5838 5839 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE; 5840 5841 case WB_HL_then_DQ + WB_BREAKABLE: 5842 case WB_HL_then_DQ + WB_NOBREAK: 5843 5844 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */ 5845 5846 if (advance_one_WB(&after_pos, strend, utf8_target, 5847 TRUE /* Do skip Extend and Format */ ) 5848 == WB_Hebrew_Letter) 5849 { 5850 return FALSE; 5851 } 5852 5853 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE; 5854 5855 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK: 5856 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE: 5857 5858 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet 5859 * | Single_Quote) (ALetter | Hebrew_Letter) */ 5860 5861 next = advance_one_WB(&after_pos, strend, utf8_target, 5862 TRUE /* Do skip Extend and Format */ ); 5863 5864 if (next == WB_ALetter || next == WB_Hebrew_Letter) 5865 { 5866 return FALSE; 5867 } 5868 5869 return WB_table[before][after] 5870 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE; 5871 5872 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK: 5873 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE: 5874 5875 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet 5876 * | Single_Quote) × (ALetter | Hebrew_Letter) */ 5877 5878 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 5879 if (prev == WB_ALetter || prev == WB_Hebrew_Letter) 5880 { 5881 return FALSE; 5882 } 5883 5884 return WB_table[before][after] 5885 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE; 5886 5887 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK: 5888 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE: 5889 5890 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric 5891 * */ 5892 5893 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 5894 == WB_Numeric) 5895 { 5896 return FALSE; 5897 } 5898 5899 return WB_table[before][after] 5900 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE; 5901 5902 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK: 5903 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE: 5904 5905 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */ 5906 5907 if (advance_one_WB(&after_pos, strend, utf8_target, 5908 TRUE /* Do skip Extend and Format */ ) 5909 == WB_Numeric) 5910 { 5911 return FALSE; 5912 } 5913 5914 return WB_table[before][after] 5915 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE; 5916 5917 case WB_RI_then_RI + WB_NOBREAK: 5918 case WB_RI_then_RI + WB_BREAKABLE: 5919 { 5920 int RI_count = 1; 5921 5922 /* Do not break within emoji flag sequences. That is, do not 5923 * break between regional indicator (RI) symbols if there is an 5924 * odd number of RI characters before the potential break 5925 * point. 5926 * 5927 * WB15 sot (RI RI)* RI × RI 5928 * WB16 [^RI] (RI RI)* RI × RI */ 5929 5930 while (backup_one_WB(&previous, 5931 strbeg, 5932 &before_pos, 5933 utf8_target) == WB_Regional_Indicator) 5934 { 5935 RI_count++; 5936 } 5937 5938 return RI_count % 2 != 1; 5939 } 5940 5941 default: 5942 break; 5943 } 5944 5945 #ifdef DEBUGGING 5946 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n", 5947 before, after, WB_table[before][after]); 5948 assert(0); 5949 #endif 5950 return TRUE; 5951 } 5952 5953 STATIC WB_enum 5954 S_advance_one_WB(pTHX_ U8 ** curpos, 5955 const U8 * const strend, 5956 const bool utf8_target, 5957 const bool skip_Extend_Format) 5958 { 5959 WB_enum wb; 5960 5961 PERL_ARGS_ASSERT_ADVANCE_ONE_WB; 5962 5963 if (*curpos >= strend) { 5964 return WB_EDGE; 5965 } 5966 5967 if (utf8_target) { 5968 5969 /* Advance over Extend and Format */ 5970 do { 5971 *curpos += UTF8SKIP(*curpos); 5972 if (*curpos >= strend) { 5973 return WB_EDGE; 5974 } 5975 wb = getWB_VAL_UTF8(*curpos, strend); 5976 } while ( skip_Extend_Format 5977 && (wb == WB_Extend || wb == WB_Format)); 5978 } 5979 else { 5980 do { 5981 (*curpos)++; 5982 if (*curpos >= strend) { 5983 return WB_EDGE; 5984 } 5985 wb = getWB_VAL_CP(**curpos); 5986 } while ( skip_Extend_Format 5987 && (wb == WB_Extend || wb == WB_Format)); 5988 } 5989 5990 return wb; 5991 } 5992 5993 STATIC WB_enum 5994 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5995 { 5996 WB_enum wb; 5997 5998 PERL_ARGS_ASSERT_BACKUP_ONE_WB; 5999 6000 /* If we know what the previous character's break value is, don't have 6001 * to look it up */ 6002 if (*previous != WB_UNKNOWN) { 6003 wb = *previous; 6004 6005 /* But we need to move backwards by one */ 6006 if (utf8_target) { 6007 *curpos = reghopmaybe3(*curpos, -1, strbeg); 6008 if (! *curpos) { 6009 *previous = WB_EDGE; 6010 *curpos = (U8 *) strbeg; 6011 } 6012 else { 6013 *previous = WB_UNKNOWN; 6014 } 6015 } 6016 else { 6017 (*curpos)--; 6018 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN; 6019 } 6020 6021 /* And we always back up over these three types */ 6022 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) { 6023 return wb; 6024 } 6025 } 6026 6027 if (*curpos < strbeg) { 6028 return WB_EDGE; 6029 } 6030 6031 if (utf8_target) { 6032 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 6033 if (! prev_char_pos) { 6034 return WB_EDGE; 6035 } 6036 6037 /* Back up over Extend and Format. curpos is always just to the right 6038 * of the characater whose value we are getting */ 6039 do { 6040 U8 * prev_prev_char_pos; 6041 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, 6042 -1, 6043 strbeg))) 6044 { 6045 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 6046 *curpos = prev_char_pos; 6047 prev_char_pos = prev_prev_char_pos; 6048 } 6049 else { 6050 *curpos = (U8 *) strbeg; 6051 return WB_EDGE; 6052 } 6053 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ); 6054 } 6055 else { 6056 do { 6057 if (*curpos - 2 < strbeg) { 6058 *curpos = (U8 *) strbeg; 6059 return WB_EDGE; 6060 } 6061 (*curpos)--; 6062 wb = getWB_VAL_CP(*(*curpos - 1)); 6063 } while (wb == WB_Extend || wb == WB_Format); 6064 } 6065 6066 return wb; 6067 } 6068 6069 /* Macros for regmatch(), using its internal variables */ 6070 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ 6071 #define NEXTCHR_IS_EOS (nextbyte < 0) 6072 6073 #define SET_nextchr \ 6074 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) 6075 6076 #define SET_locinput(p) \ 6077 locinput = (p); \ 6078 SET_nextchr 6079 6080 #define sayYES goto yes 6081 #define sayNO goto no 6082 #define sayNO_SILENT goto no_silent 6083 6084 /* we dont use STMT_START/END here because it leads to 6085 "unreachable code" warnings, which are bogus, but distracting. */ 6086 #define CACHEsayNO \ 6087 if (ST.cache_mask) \ 6088 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ 6089 sayNO 6090 6091 #define EVAL_CLOSE_PAREN_IS(st,expr) \ 6092 ( \ 6093 ( ( st ) ) && \ 6094 ( ( st )->u.eval.close_paren ) && \ 6095 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 6096 ) 6097 6098 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \ 6099 ( \ 6100 ( ( st ) ) && \ 6101 ( ( st )->u.eval.close_paren ) && \ 6102 ( ( expr ) ) && \ 6103 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 6104 ) 6105 6106 6107 #define EVAL_CLOSE_PAREN_SET(st,expr) \ 6108 (st)->u.eval.close_paren = ( (expr) + 1 ) 6109 6110 #define EVAL_CLOSE_PAREN_CLEAR(st) \ 6111 (st)->u.eval.close_paren = 0 6112 6113 /* push a new state then goto it */ 6114 6115 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ 6116 pushinput = input; \ 6117 pusheol = eol; \ 6118 pushsr0 = sr0; \ 6119 scan = node; \ 6120 st->resume_state = state; \ 6121 goto push_state; 6122 6123 /* push a new state with success backtracking, then goto it */ 6124 6125 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ 6126 pushinput = input; \ 6127 pusheol = eol; \ 6128 pushsr0 = sr0; \ 6129 scan = node; \ 6130 st->resume_state = state; \ 6131 goto push_yes_state; 6132 6133 #define DEBUG_STATE_pp(pp) \ 6134 DEBUG_STATE_r({ \ 6135 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \ 6136 Perl_re_printf( aTHX_ \ 6137 "%*s" pp " %s%s%s%s%s\n", \ 6138 INDENT_CHARS(depth), "", \ 6139 PL_reg_name[st->resume_state], \ 6140 ((st==yes_state||st==mark_state) ? "[" : ""), \ 6141 ((st==yes_state) ? "Y" : ""), \ 6142 ((st==mark_state) ? "M" : ""), \ 6143 ((st==yes_state||st==mark_state) ? "]" : "") \ 6144 ); \ 6145 }); 6146 6147 /* 6148 6149 regmatch() - main matching routine 6150 6151 This is basically one big switch statement in a loop. We execute an op, 6152 set 'next' to point the next op, and continue. If we come to a point which 6153 we may need to backtrack to on failure such as (A|B|C), we push a 6154 backtrack state onto the backtrack stack. On failure, we pop the top 6155 state, and re-enter the loop at the state indicated. If there are no more 6156 states to pop, we return failure. 6157 6158 Sometimes we also need to backtrack on success; for example /A+/, where 6159 after successfully matching one A, we need to go back and try to 6160 match another one; similarly for lookahead assertions: if the assertion 6161 completes successfully, we backtrack to the state just before the assertion 6162 and then carry on. In these cases, the pushed state is marked as 6163 'backtrack on success too'. This marking is in fact done by a chain of 6164 pointers, each pointing to the previous 'yes' state. On success, we pop to 6165 the nearest yes state, discarding any intermediate failure-only states. 6166 Sometimes a yes state is pushed just to force some cleanup code to be 6167 called at the end of a successful match or submatch; e.g. (??{$re}) uses 6168 it to free the inner regex. 6169 6170 Note that failure backtracking rewinds the cursor position, while 6171 success backtracking leaves it alone. 6172 6173 A pattern is complete when the END op is executed, while a subpattern 6174 such as (?=foo) is complete when the SUCCESS op is executed. Both of these 6175 ops trigger the "pop to last yes state if any, otherwise return true" 6176 behaviour. 6177 6178 A common convention in this function is to use A and B to refer to the two 6179 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is 6180 the subpattern to be matched possibly multiple times, while B is the entire 6181 rest of the pattern. Variable and state names reflect this convention. 6182 6183 The states in the main switch are the union of ops and failure/success of 6184 substates associated with that op. For example, IFMATCH is the op 6185 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 6186 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just 6187 successfully matched A and IFMATCH_A_fail is a state saying that we have 6188 just failed to match A. Resume states always come in pairs. The backtrack 6189 state we push is marked as 'IFMATCH_A', but when that is popped, we resume 6190 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking 6191 on success or failure. 6192 6193 The struct that holds a backtracking state is actually a big union, with 6194 one variant for each major type of op. The variable st points to the 6195 top-most backtrack struct. To make the code clearer, within each 6196 block of code we #define ST to alias the relevant union. 6197 6198 Here's a concrete example of a (vastly oversimplified) IFMATCH 6199 implementation: 6200 6201 switch (state) { 6202 .... 6203 6204 #define ST st->u.ifmatch 6205 6206 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 6207 ST.foo = ...; // some state we wish to save 6208 ... 6209 // push a yes backtrack state with a resume value of 6210 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the 6211 // first node of A: 6212 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); 6213 // NOTREACHED 6214 6215 case IFMATCH_A: // we have successfully executed A; now continue with B 6216 next = B; 6217 bar = ST.foo; // do something with the preserved value 6218 break; 6219 6220 case IFMATCH_A_fail: // A failed, so the assertion failed 6221 ...; // do some housekeeping, then ... 6222 sayNO; // propagate the failure 6223 6224 #undef ST 6225 6226 ... 6227 } 6228 6229 For any old-timers reading this who are familiar with the old recursive 6230 approach, the code above is equivalent to: 6231 6232 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 6233 { 6234 int foo = ... 6235 ... 6236 if (regmatch(A)) { 6237 next = B; 6238 bar = foo; 6239 break; 6240 } 6241 ...; // do some housekeeping, then ... 6242 sayNO; // propagate the failure 6243 } 6244 6245 The topmost backtrack state, pointed to by st, is usually free. If you 6246 want to claim it, populate any ST.foo fields in it with values you wish to 6247 save, then do one of 6248 6249 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol); 6250 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol); 6251 6252 which sets that backtrack state's resume value to 'resume_state', pushes a 6253 new free entry to the top of the backtrack stack, then goes to 'node'. 6254 On backtracking, the free slot is popped, and the saved state becomes the 6255 new free state. An ST.foo field in this new top state can be temporarily 6256 accessed to retrieve values, but once the main loop is re-entered, it 6257 becomes available for reuse. 6258 6259 Note that the depth of the backtrack stack constantly increases during the 6260 left-to-right execution of the pattern, rather than going up and down with 6261 the pattern nesting. For example the stack is at its maximum at Z at the 6262 end of the pattern, rather than at X in the following: 6263 6264 /(((X)+)+)+....(Y)+....Z/ 6265 6266 The only exceptions to this are lookahead/behind assertions and the cut, 6267 (?>A), which pop all the backtrack states associated with A before 6268 continuing. 6269 6270 Backtrack state structs are allocated in slabs of about 4K in size. 6271 PL_regmatch_state and st always point to the currently active state, 6272 and PL_regmatch_slab points to the slab currently containing 6273 PL_regmatch_state. The first time regmatch() is called, the first slab is 6274 allocated, and is never freed until interpreter destruction. When the slab 6275 is full, a new one is allocated and chained to the end. At exit from 6276 regmatch(), slabs allocated since entry are freed. 6277 6278 In order to work with variable length lookbehinds, an upper limit is placed on 6279 lookbehinds which is set to where the match position is at the end of where the 6280 lookbehind would get to. Nothing in the lookbehind should match above that, 6281 except we should be able to look beyond if for things like \b, which need the 6282 next character in the string to be able to determine if this is a boundary or 6283 not. We also can't match the end of string/line unless we are also at the end 6284 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs 6285 that match a width, we have to add a condition that they are within the legal 6286 bounds of our window into the string. 6287 6288 */ 6289 6290 /* returns -1 on failure, $+[0] on success */ 6291 STATIC SSize_t 6292 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) 6293 { 6294 const bool utf8_target = reginfo->is_utf8_target; 6295 const U32 uniflags = UTF8_ALLOW_DEFAULT; 6296 REGEXP *rex_sv = reginfo->prog; 6297 regexp *rex = ReANY(rex_sv); 6298 RXi_GET_DECL(rex,rexi); 6299 /* the current state. This is a cached copy of PL_regmatch_state */ 6300 regmatch_state *st; 6301 /* cache heavy used fields of st in registers */ 6302 regnode *scan; 6303 regnode *next; 6304 U32 n = 0; /* general value; init to avoid compiler warning */ 6305 U32 utmp = 0; /* tmp variable - valid for at most one opcode */ 6306 SSize_t ln = 0; /* len or last; init to avoid compiler warning */ 6307 SSize_t endref = 0; /* offset of end of backref when ln is start */ 6308 char *locinput = startpos; 6309 char *loceol = reginfo->strend; 6310 char *pushinput; /* where to continue after a PUSH */ 6311 char *pusheol; /* where to stop matching (loceol) after a PUSH */ 6312 U8 *pushsr0; /* save starting pos of script run */ 6313 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1 6314 at EOS */ 6315 6316 bool result = 0; /* return value of S_regmatch */ 6317 U32 depth = 0; /* depth of backtrack stack */ 6318 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ 6319 const U32 max_nochange_depth = 6320 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? 6321 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; 6322 regmatch_state *yes_state = NULL; /* state to pop to on success of 6323 subpattern */ 6324 /* mark_state piggy backs on the yes_state logic so that when we unwind 6325 the stack on success we can update the mark_state as we go */ 6326 regmatch_state *mark_state = NULL; /* last mark state we have seen */ 6327 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ 6328 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ 6329 U32 state_num; 6330 bool no_final = 0; /* prevent failure from backtracking? */ 6331 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ 6332 char *startpoint = locinput; 6333 SV *popmark = NULL; /* are we looking for a mark? */ 6334 SV *sv_commit = NULL; /* last mark name seen in failure */ 6335 SV *sv_yes_mark = NULL; /* last mark name we have seen 6336 during a successful match */ 6337 U32 lastopen = 0; /* last open we saw */ 6338 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0; 6339 SV* const oreplsv = GvSVn(PL_replgv); 6340 /* these three flags are set by various ops to signal information to 6341 * the very next op. They have a useful lifetime of exactly one loop 6342 * iteration, and are not preserved or restored by state pushes/pops 6343 */ 6344 bool sw = 0; /* the condition value in (?(cond)a|b) */ 6345 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ 6346 int logical = 0; /* the following EVAL is: 6347 0: (?{...}) 6348 1: (?(?{...})X|Y) 6349 2: (??{...}) 6350 or the following IFMATCH/UNLESSM is: 6351 false: plain (?=foo) 6352 true: used as a condition: (?(?=foo)) 6353 */ 6354 PAD* last_pad = NULL; 6355 dMULTICALL; 6356 U8 gimme = G_SCALAR; 6357 CV *caller_cv = NULL; /* who called us */ 6358 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ 6359 U32 maxopenparen = 0; /* max '(' index seen so far */ 6360 int to_complement; /* Invert the result? */ 6361 _char_class_number classnum; 6362 bool is_utf8_pat = reginfo->is_utf8_pat; 6363 bool match = FALSE; 6364 I32 orig_savestack_ix = PL_savestack_ix; 6365 U8 * script_run_begin = NULL; 6366 char *match_end= NULL; /* where a match MUST end to be considered successful */ 6367 bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */ 6368 6369 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */ 6370 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL)) 6371 # define SOLARIS_BAD_OPTIMIZER 6372 const U32 *pl_charclass_dup = PL_charclass; 6373 # define PL_charclass pl_charclass_dup 6374 #endif 6375 6376 #ifdef DEBUGGING 6377 DECLARE_AND_GET_RE_DEBUG_FLAGS; 6378 #endif 6379 6380 /* protect against undef(*^R) */ 6381 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); 6382 6383 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ 6384 multicall_oldcatch = 0; 6385 PERL_UNUSED_VAR(multicall_cop); 6386 6387 PERL_ARGS_ASSERT_REGMATCH; 6388 6389 st = PL_regmatch_state; 6390 6391 /* Note that nextbyte is a byte even in UTF */ 6392 SET_nextchr; 6393 scan = prog; 6394 6395 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ 6396 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 6397 Perl_re_printf( aTHX_ "regmatch start\n" ); 6398 })); 6399 6400 while (scan != NULL) { 6401 next = scan + NEXT_OFF(scan); 6402 if (next == scan) 6403 next = NULL; 6404 state_num = OP(scan); 6405 6406 reenter_switch: 6407 DEBUG_EXECUTE_r( 6408 if (state_num <= REGNODE_MAX) { 6409 SV * const prop = sv_newmortal(); 6410 regnode *rnext = regnext(scan); 6411 6412 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 6413 regprop(rex, prop, scan, reginfo, NULL); 6414 Perl_re_printf( aTHX_ 6415 "%*s%" IVdf ":%s(%" IVdf ")\n", 6416 INDENT_CHARS(depth), "", 6417 (IV)(scan - rexi->program), 6418 SvPVX_const(prop), 6419 (PL_regkind[OP(scan)] == END || !rnext) ? 6420 0 : (IV)(rnext - rexi->program)); 6421 } 6422 ); 6423 6424 to_complement = 0; 6425 6426 SET_nextchr; 6427 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS)); 6428 6429 switch (state_num) { 6430 case SBOL: /* /^../ and /\A../ */ 6431 if (locinput == reginfo->strbeg) 6432 break; 6433 sayNO; 6434 6435 case MBOL: /* /^../m */ 6436 if (locinput == reginfo->strbeg || 6437 (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) 6438 { 6439 break; 6440 } 6441 sayNO; 6442 6443 case GPOS: /* \G */ 6444 if (locinput == reginfo->ganch) 6445 break; 6446 sayNO; 6447 6448 case KEEPS: /* \K */ 6449 /* update the startpoint */ 6450 st->u.keeper.val = rex->offs[0].start; 6451 rex->offs[0].start = locinput - reginfo->strbeg; 6452 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, 6453 script_run_begin); 6454 NOT_REACHED; /* NOTREACHED */ 6455 6456 case KEEPS_next_fail: 6457 /* rollback the start point change */ 6458 rex->offs[0].start = st->u.keeper.val; 6459 sayNO_SILENT; 6460 NOT_REACHED; /* NOTREACHED */ 6461 6462 case MEOL: /* /..$/m */ 6463 if (!NEXTCHR_IS_EOS && nextbyte != '\n') 6464 sayNO; 6465 break; 6466 6467 case SEOL: /* /..$/ */ 6468 if (!NEXTCHR_IS_EOS && nextbyte != '\n') 6469 sayNO; 6470 if (reginfo->strend - locinput > 1) 6471 sayNO; 6472 break; 6473 6474 case EOS: /* \z */ 6475 if (!NEXTCHR_IS_EOS) 6476 sayNO; 6477 break; 6478 6479 case SANY: /* /./s */ 6480 if (NEXTCHR_IS_EOS || locinput >= loceol) 6481 sayNO; 6482 goto increment_locinput; 6483 6484 case REG_ANY: /* /./ */ 6485 if ( NEXTCHR_IS_EOS 6486 || locinput >= loceol 6487 || nextbyte == '\n') 6488 { 6489 sayNO; 6490 } 6491 goto increment_locinput; 6492 6493 6494 #undef ST 6495 #define ST st->u.trie 6496 case TRIEC: /* (ab|cd) with known charclass */ 6497 /* In this case the charclass data is available inline so 6498 we can fail fast without a lot of extra overhead. 6499 */ 6500 if ( ! NEXTCHR_IS_EOS 6501 && locinput < loceol 6502 && ! ANYOF_BITMAP_TEST(scan, nextbyte)) 6503 { 6504 DEBUG_EXECUTE_r( 6505 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 6506 depth, PL_colors[4], PL_colors[5]) 6507 ); 6508 sayNO_SILENT; 6509 NOT_REACHED; /* NOTREACHED */ 6510 } 6511 /* FALLTHROUGH */ 6512 case TRIE: /* (ab|cd) */ 6513 /* the basic plan of execution of the trie is: 6514 * At the beginning, run though all the states, and 6515 * find the longest-matching word. Also remember the position 6516 * of the shortest matching word. For example, this pattern: 6517 * 1 2 3 4 5 6518 * ab|a|x|abcd|abc 6519 * when matched against the string "abcde", will generate 6520 * accept states for all words except 3, with the longest 6521 * matching word being 4, and the shortest being 2 (with 6522 * the position being after char 1 of the string). 6523 * 6524 * Then for each matching word, in word order (i.e. 1,2,4,5), 6525 * we run the remainder of the pattern; on each try setting 6526 * the current position to the character following the word, 6527 * returning to try the next word on failure. 6528 * 6529 * We avoid having to build a list of words at runtime by 6530 * using a compile-time structure, wordinfo[].prev, which 6531 * gives, for each word, the previous accepting word (if any). 6532 * In the case above it would contain the mappings 1->2, 2->0, 6533 * 3->0, 4->5, 5->1. We can use this table to generate, from 6534 * the longest word (4 above), a list of all words, by 6535 * following the list of prev pointers; this gives us the 6536 * unordered list 4,5,1,2. Then given the current word we have 6537 * just tried, we can go through the list and find the 6538 * next-biggest word to try (so if we just failed on word 2, 6539 * the next in the list is 4). 6540 * 6541 * Since at runtime we don't record the matching position in 6542 * the string for each word, we have to work that out for 6543 * each word we're about to process. The wordinfo table holds 6544 * the character length of each word; given that we recorded 6545 * at the start: the position of the shortest word and its 6546 * length in chars, we just need to move the pointer the 6547 * difference between the two char lengths. Depending on 6548 * Unicode status and folding, that's cheap or expensive. 6549 * 6550 * This algorithm is optimised for the case where are only a 6551 * small number of accept states, i.e. 0,1, or maybe 2. 6552 * With lots of accepts states, and having to try all of them, 6553 * it becomes quadratic on number of accept states to find all 6554 * the next words. 6555 */ 6556 6557 { 6558 /* what type of TRIE am I? (utf8 makes this contextual) */ 6559 DECL_TRIE_TYPE(scan); 6560 6561 /* what trie are we using right now */ 6562 reg_trie_data * const trie 6563 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; 6564 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); 6565 U32 state = trie->startstate; 6566 6567 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) { 6568 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6569 if (utf8_target 6570 && ! NEXTCHR_IS_EOS 6571 && UTF8_IS_ABOVE_LATIN1(nextbyte) 6572 && scan->flags == EXACTL) 6573 { 6574 /* We only output for EXACTL, as we let the folder 6575 * output this message for EXACTFLU8 to avoid 6576 * duplication */ 6577 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 6578 reginfo->strend); 6579 } 6580 } 6581 if ( trie->bitmap 6582 && ( NEXTCHR_IS_EOS 6583 || locinput >= loceol 6584 || ! TRIE_BITMAP_TEST(trie, nextbyte))) 6585 { 6586 if (trie->states[ state ].wordnum) { 6587 DEBUG_EXECUTE_r( 6588 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n", 6589 depth, PL_colors[4], PL_colors[5]) 6590 ); 6591 if (!trie->jump) 6592 break; 6593 } else { 6594 DEBUG_EXECUTE_r( 6595 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 6596 depth, PL_colors[4], PL_colors[5]) 6597 ); 6598 sayNO_SILENT; 6599 } 6600 } 6601 6602 { 6603 U8 *uc = ( U8* )locinput; 6604 6605 STRLEN len = 0; 6606 STRLEN foldlen = 0; 6607 U8 *uscan = (U8*)NULL; 6608 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 6609 U32 charcount = 0; /* how many input chars we have matched */ 6610 U32 accepted = 0; /* have we seen any accepting states? */ 6611 6612 ST.jump = trie->jump; 6613 ST.me = scan; 6614 ST.firstpos = NULL; 6615 ST.longfold = FALSE; /* char longer if folded => it's harder */ 6616 ST.nextword = 0; 6617 6618 /* fully traverse the TRIE; note the position of the 6619 shortest accept state and the wordnum of the longest 6620 accept state */ 6621 6622 while ( state && uc <= (U8*)(loceol) ) { 6623 U32 base = trie->states[ state ].trans.base; 6624 UV uvc = 0; 6625 U16 charid = 0; 6626 U16 wordnum; 6627 wordnum = trie->states[ state ].wordnum; 6628 6629 if (wordnum) { /* it's an accept state */ 6630 if (!accepted) { 6631 accepted = 1; 6632 /* record first match position */ 6633 if (ST.longfold) { 6634 ST.firstpos = (U8*)locinput; 6635 ST.firstchars = 0; 6636 } 6637 else { 6638 ST.firstpos = uc; 6639 ST.firstchars = charcount; 6640 } 6641 } 6642 if (!ST.nextword || wordnum < ST.nextword) 6643 ST.nextword = wordnum; 6644 ST.topword = wordnum; 6645 } 6646 6647 DEBUG_TRIE_EXECUTE_r({ 6648 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth ); 6649 /* HERE */ 6650 PerlIO_printf( Perl_debug_log, 6651 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ", 6652 INDENT_CHARS(depth), "", PL_colors[4], 6653 (UV)state, (accepted ? 'Y' : 'N')); 6654 }); 6655 6656 /* read a char and goto next state */ 6657 if ( base && (foldlen || uc < (U8*)(loceol))) { 6658 I32 offset; 6659 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 6660 (U8 *) loceol, uscan, 6661 len, uvc, charid, foldlen, 6662 foldbuf, uniflags); 6663 charcount++; 6664 if (foldlen>0) 6665 ST.longfold = TRUE; 6666 if (charid && 6667 ( ((offset = 6668 base + charid - 1 - trie->uniquecharcount)) >= 0) 6669 6670 && ((U32)offset < trie->lasttrans) 6671 && trie->trans[offset].check == state) 6672 { 6673 state = trie->trans[offset].next; 6674 } 6675 else { 6676 state = 0; 6677 } 6678 uc += len; 6679 6680 } 6681 else { 6682 state = 0; 6683 } 6684 DEBUG_TRIE_EXECUTE_r( 6685 Perl_re_printf( aTHX_ 6686 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n", 6687 charid, uvc, (UV)state, PL_colors[5] ); 6688 ); 6689 } 6690 if (!accepted) 6691 sayNO; 6692 6693 /* calculate total number of accept states */ 6694 { 6695 U16 w = ST.topword; 6696 accepted = 0; 6697 while (w) { 6698 w = trie->wordinfo[w].prev; 6699 accepted++; 6700 } 6701 ST.accepted = accepted; 6702 } 6703 6704 DEBUG_EXECUTE_r( 6705 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n", 6706 depth, 6707 PL_colors[4], (IV)ST.accepted, PL_colors[5] ); 6708 ); 6709 goto trie_first_try; /* jump into the fail handler */ 6710 }} 6711 NOT_REACHED; /* NOTREACHED */ 6712 6713 case TRIE_next_fail: /* we failed - try next alternative */ 6714 { 6715 U8 *uc; 6716 if ( ST.jump ) { 6717 /* undo any captures done in the tail part of a branch, 6718 * e.g. 6719 * /(?:X(.)(.)|Y(.)).../ 6720 * where the trie just matches X then calls out to do the 6721 * rest of the branch */ 6722 REGCP_UNWIND(ST.cp); 6723 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6724 } 6725 if (!--ST.accepted) { 6726 DEBUG_EXECUTE_r({ 6727 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n", 6728 depth, 6729 PL_colors[4], 6730 PL_colors[5] ); 6731 }); 6732 sayNO_SILENT; 6733 } 6734 { 6735 /* Find next-highest word to process. Note that this code 6736 * is O(N^2) per trie run (O(N) per branch), so keep tight */ 6737 U16 min = 0; 6738 U16 word; 6739 U16 const nextword = ST.nextword; 6740 reg_trie_wordinfo * const wordinfo 6741 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; 6742 for (word=ST.topword; word; word=wordinfo[word].prev) { 6743 if (word > nextword && (!min || word < min)) 6744 min = word; 6745 } 6746 ST.nextword = min; 6747 } 6748 6749 trie_first_try: 6750 if (do_cutgroup) { 6751 do_cutgroup = 0; 6752 no_final = 0; 6753 } 6754 6755 if ( ST.jump ) { 6756 ST.lastparen = rex->lastparen; 6757 ST.lastcloseparen = rex->lastcloseparen; 6758 REGCP_SET(ST.cp); 6759 } 6760 6761 /* find start char of end of current word */ 6762 { 6763 U32 chars; /* how many chars to skip */ 6764 reg_trie_data * const trie 6765 = (reg_trie_data*)rexi->data->data[ARG(ST.me)]; 6766 6767 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) 6768 >= ST.firstchars); 6769 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) 6770 - ST.firstchars; 6771 uc = ST.firstpos; 6772 6773 if (ST.longfold) { 6774 /* the hard option - fold each char in turn and find 6775 * its folded length (which may be different */ 6776 U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; 6777 STRLEN foldlen; 6778 STRLEN len; 6779 UV uvc; 6780 U8 *uscan; 6781 6782 while (chars) { 6783 if (utf8_target) { 6784 /* XXX This assumes the length is well-formed, as 6785 * does the UTF8SKIP below */ 6786 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, 6787 uniflags); 6788 uc += len; 6789 } 6790 else { 6791 uvc = *uc; 6792 uc++; 6793 } 6794 uvc = to_uni_fold(uvc, foldbuf, &foldlen); 6795 uscan = foldbuf; 6796 while (foldlen) { 6797 if (!--chars) 6798 break; 6799 uvc = utf8n_to_uvchr(uscan, foldlen, &len, 6800 uniflags); 6801 uscan += len; 6802 foldlen -= len; 6803 } 6804 } 6805 } 6806 else { 6807 if (utf8_target) 6808 uc = utf8_hop(uc, chars); 6809 else 6810 uc += chars; 6811 } 6812 } 6813 6814 scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) 6815 ? ST.jump[ST.nextword] 6816 : NEXT_OFF(ST.me)); 6817 6818 DEBUG_EXECUTE_r({ 6819 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n", 6820 depth, 6821 PL_colors[4], 6822 ST.nextword, 6823 PL_colors[5] 6824 ); 6825 }); 6826 6827 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { 6828 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, 6829 script_run_begin); 6830 NOT_REACHED; /* NOTREACHED */ 6831 } 6832 /* only one choice left - just continue */ 6833 DEBUG_EXECUTE_r({ 6834 AV *const trie_words 6835 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); 6836 SV ** const tmp = trie_words 6837 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL; 6838 SV *sv= tmp ? sv_newmortal() : NULL; 6839 6840 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n", 6841 depth, PL_colors[4], 6842 ST.nextword, 6843 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 6844 PL_colors[0], PL_colors[1], 6845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII 6846 ) 6847 : "not compiled under -Dr", 6848 PL_colors[5] ); 6849 }); 6850 6851 locinput = (char*)uc; 6852 continue; /* execute rest of RE */ 6853 /* NOTREACHED */ 6854 } 6855 #undef ST 6856 6857 case LEXACT_REQ8: 6858 if (! utf8_target) { 6859 sayNO; 6860 } 6861 /* FALLTHROUGH */ 6862 6863 case LEXACT: 6864 { 6865 char *s; 6866 6867 s = STRINGl(scan); 6868 ln = STR_LENl(scan); 6869 goto join_short_long_exact; 6870 6871 case EXACTL: /* /abc/l */ 6872 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6873 6874 /* Complete checking would involve going through every character 6875 * matched by the string to see if any is above latin1. But the 6876 * comparision otherwise might very well be a fast assembly 6877 * language routine, and I (khw) don't think slowing things down 6878 * just to check for this warning is worth it. So this just checks 6879 * the first character */ 6880 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) { 6881 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); 6882 } 6883 goto do_exact; 6884 case EXACT_REQ8: 6885 if (! utf8_target) { 6886 sayNO; 6887 } 6888 /* FALLTHROUGH */ 6889 6890 case EXACT: /* /abc/ */ 6891 do_exact: 6892 s = STRINGs(scan); 6893 ln = STR_LENs(scan); 6894 6895 join_short_long_exact: 6896 if (utf8_target != is_utf8_pat) { 6897 /* The target and the pattern have differing utf8ness. */ 6898 char *l = locinput; 6899 const char * const e = s + ln; 6900 6901 if (utf8_target) { 6902 /* The target is utf8, the pattern is not utf8. 6903 * Above-Latin1 code points can't match the pattern; 6904 * invariants match exactly, and the other Latin1 ones need 6905 * to be downgraded to a single byte in order to do the 6906 * comparison. (If we could be confident that the target 6907 * is not malformed, this could be refactored to have fewer 6908 * tests by just assuming that if the first bytes match, it 6909 * is an invariant, but there are tests in the test suite 6910 * dealing with (??{...}) which violate this) */ 6911 while (s < e) { 6912 if ( l >= loceol 6913 || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) 6914 { 6915 sayNO; 6916 } 6917 if (UTF8_IS_INVARIANT(*(U8*)l)) { 6918 if (*l != *s) { 6919 sayNO; 6920 } 6921 l++; 6922 } 6923 else { 6924 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) 6925 { 6926 sayNO; 6927 } 6928 l += 2; 6929 } 6930 s++; 6931 } 6932 } 6933 else { 6934 /* The target is not utf8, the pattern is utf8. */ 6935 while (s < e) { 6936 if ( l >= loceol 6937 || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) 6938 { 6939 sayNO; 6940 } 6941 if (UTF8_IS_INVARIANT(*(U8*)s)) { 6942 if (*s != *l) { 6943 sayNO; 6944 } 6945 s++; 6946 } 6947 else { 6948 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) 6949 { 6950 sayNO; 6951 } 6952 s += 2; 6953 } 6954 l++; 6955 } 6956 } 6957 locinput = l; 6958 } 6959 else { 6960 /* The target and the pattern have the same utf8ness. */ 6961 /* Inline the first character, for speed. */ 6962 if ( loceol - locinput < ln 6963 || UCHARAT(s) != nextbyte 6964 || (ln > 1 && memNE(s, locinput, ln))) 6965 { 6966 sayNO; 6967 } 6968 locinput += ln; 6969 } 6970 break; 6971 } 6972 6973 case EXACTFL: /* /abc/il */ 6974 { 6975 re_fold_t folder; 6976 const U8 * fold_array; 6977 const char * s; 6978 U32 fold_utf8_flags; 6979 6980 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6981 folder = foldEQ_locale; 6982 fold_array = PL_fold_locale; 6983 fold_utf8_flags = FOLDEQ_LOCALE; 6984 goto do_exactf; 6985 6986 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so 6987 is effectively /u; hence to match, target 6988 must be UTF-8. */ 6989 if (! utf8_target) { 6990 sayNO; 6991 } 6992 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 6993 | FOLDEQ_S2_FOLDS_SANE; 6994 folder = foldEQ_latin1_s2_folded; 6995 fold_array = PL_fold_latin1; 6996 goto do_exactf; 6997 6998 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */ 6999 if (! utf8_target) { 7000 sayNO; 7001 } 7002 assert(is_utf8_pat); 7003 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 7004 goto do_exactf; 7005 7006 case EXACTFUP: /* /foo/iu, and something is problematic in 7007 'foo' so can't take shortcuts. */ 7008 assert(! is_utf8_pat); 7009 folder = foldEQ_latin1; 7010 fold_array = PL_fold_latin1; 7011 fold_utf8_flags = 0; 7012 goto do_exactf; 7013 7014 case EXACTFU: /* /abc/iu */ 7015 folder = foldEQ_latin1_s2_folded; 7016 fold_array = PL_fold_latin1; 7017 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 7018 goto do_exactf; 7019 7020 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 7021 patterns */ 7022 assert(! is_utf8_pat); 7023 /* FALLTHROUGH */ 7024 case EXACTFAA: /* /abc/iaa */ 7025 folder = foldEQ_latin1_s2_folded; 7026 fold_array = PL_fold_latin1; 7027 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7028 if (is_utf8_pat || ! utf8_target) { 7029 7030 /* The possible presence of a MICRO SIGN in the pattern forbids 7031 * us to view a non-UTF-8 pattern as folded when there is a 7032 * UTF-8 target */ 7033 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED 7034 |FOLDEQ_S2_FOLDS_SANE; 7035 } 7036 goto do_exactf; 7037 7038 7039 case EXACTF: /* /abc/i This node only generated for 7040 non-utf8 patterns */ 7041 assert(! is_utf8_pat); 7042 folder = foldEQ; 7043 fold_array = PL_fold; 7044 fold_utf8_flags = 0; 7045 7046 do_exactf: 7047 s = STRINGs(scan); 7048 ln = STR_LENs(scan); 7049 7050 if ( utf8_target 7051 || is_utf8_pat 7052 || state_num == EXACTFUP 7053 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 7054 { 7055 /* Either target or the pattern are utf8, or has the issue where 7056 * the fold lengths may differ. */ 7057 const char * const l = locinput; 7058 char *e = loceol; 7059 7060 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target, 7061 s, 0, ln, is_utf8_pat,fold_utf8_flags)) 7062 { 7063 sayNO; 7064 } 7065 locinput = e; 7066 break; 7067 } 7068 7069 /* Neither the target nor the pattern are utf8 */ 7070 if (UCHARAT(s) != nextbyte 7071 && !NEXTCHR_IS_EOS 7072 && UCHARAT(s) != fold_array[nextbyte]) 7073 { 7074 sayNO; 7075 } 7076 if (loceol - locinput < ln) 7077 sayNO; 7078 if (ln > 1 && ! folder(locinput, s, ln)) 7079 sayNO; 7080 locinput += ln; 7081 break; 7082 } 7083 7084 case NBOUNDL: /* /\B/l */ 7085 to_complement = 1; 7086 /* FALLTHROUGH */ 7087 7088 case BOUNDL: /* /\b/l */ 7089 { 7090 bool b1, b2; 7091 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 7092 7093 if (FLAGS(scan) != TRADITIONAL_BOUND) { 7094 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 7095 goto boundu; 7096 } 7097 7098 if (utf8_target) { 7099 if (locinput == reginfo->strbeg) 7100 b1 = isWORDCHAR_LC('\n'); 7101 else { 7102 U8 *p = reghop3((U8*)locinput, -1, 7103 (U8*)(reginfo->strbeg)); 7104 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend)); 7105 } 7106 b2 = (NEXTCHR_IS_EOS) 7107 ? isWORDCHAR_LC('\n') 7108 : isWORDCHAR_LC_utf8_safe((U8*) locinput, 7109 (U8*) reginfo->strend); 7110 } 7111 else { /* Here the string isn't utf8 */ 7112 b1 = (locinput == reginfo->strbeg) 7113 ? isWORDCHAR_LC('\n') 7114 : isWORDCHAR_LC(UCHARAT(locinput - 1)); 7115 b2 = (NEXTCHR_IS_EOS) 7116 ? isWORDCHAR_LC('\n') 7117 : isWORDCHAR_LC(nextbyte); 7118 } 7119 if (to_complement ^ (b1 == b2)) { 7120 sayNO; 7121 } 7122 break; 7123 } 7124 7125 case NBOUND: /* /\B/ */ 7126 to_complement = 1; 7127 /* FALLTHROUGH */ 7128 7129 case BOUND: /* /\b/ */ 7130 if (utf8_target) { 7131 goto bound_utf8; 7132 } 7133 goto bound_ascii_match_only; 7134 7135 case NBOUNDA: /* /\B/a */ 7136 to_complement = 1; 7137 /* FALLTHROUGH */ 7138 7139 case BOUNDA: /* /\b/a */ 7140 { 7141 bool b1, b2; 7142 7143 bound_ascii_match_only: 7144 /* Here the string isn't utf8, or is utf8 and only ascii characters 7145 * are to match \w. In the latter case looking at the byte just 7146 * prior to the current one may be just the final byte of a 7147 * multi-byte character. This is ok. There are two cases: 7148 * 1) it is a single byte character, and then the test is doing 7149 * just what it's supposed to. 7150 * 2) it is a multi-byte character, in which case the final byte is 7151 * never mistakable for ASCII, and so the test will say it is 7152 * not a word character, which is the correct answer. */ 7153 b1 = (locinput == reginfo->strbeg) 7154 ? isWORDCHAR_A('\n') 7155 : isWORDCHAR_A(UCHARAT(locinput - 1)); 7156 b2 = (NEXTCHR_IS_EOS) 7157 ? isWORDCHAR_A('\n') 7158 : isWORDCHAR_A(nextbyte); 7159 if (to_complement ^ (b1 == b2)) { 7160 sayNO; 7161 } 7162 break; 7163 } 7164 7165 case NBOUNDU: /* /\B/u */ 7166 to_complement = 1; 7167 /* FALLTHROUGH */ 7168 7169 case BOUNDU: /* /\b/u */ 7170 7171 boundu: 7172 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) { 7173 match = FALSE; 7174 } 7175 else if (utf8_target) { 7176 bound_utf8: 7177 switch((bound_type) FLAGS(scan)) { 7178 case TRADITIONAL_BOUND: 7179 { 7180 bool b1, b2; 7181 if (locinput == reginfo->strbeg) { 7182 b1 = 0 /* isWORDCHAR_L1('\n') */; 7183 } 7184 else { 7185 U8 *p = reghop3((U8*)locinput, -1, 7186 (U8*)(reginfo->strbeg)); 7187 7188 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend); 7189 } 7190 b2 = (NEXTCHR_IS_EOS) 7191 ? 0 /* isWORDCHAR_L1('\n') */ 7192 : isWORDCHAR_utf8_safe((U8*)locinput, 7193 (U8*) reginfo->strend); 7194 match = cBOOL(b1 != b2); 7195 break; 7196 } 7197 case GCB_BOUND: 7198 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7199 match = TRUE; /* GCB always matches at begin and 7200 end */ 7201 } 7202 else { 7203 /* Find the gcb values of previous and current 7204 * chars, then see if is a break point */ 7205 match = isGCB(getGCB_VAL_UTF8( 7206 reghop3((U8*)locinput, 7207 -1, 7208 (U8*)(reginfo->strbeg)), 7209 (U8*) reginfo->strend), 7210 getGCB_VAL_UTF8((U8*) locinput, 7211 (U8*) reginfo->strend), 7212 (U8*) reginfo->strbeg, 7213 (U8*) locinput, 7214 utf8_target); 7215 } 7216 break; 7217 7218 case LB_BOUND: 7219 if (locinput == reginfo->strbeg) { 7220 match = FALSE; 7221 } 7222 else if (NEXTCHR_IS_EOS) { 7223 match = TRUE; 7224 } 7225 else { 7226 match = isLB(getLB_VAL_UTF8( 7227 reghop3((U8*)locinput, 7228 -1, 7229 (U8*)(reginfo->strbeg)), 7230 (U8*) reginfo->strend), 7231 getLB_VAL_UTF8((U8*) locinput, 7232 (U8*) reginfo->strend), 7233 (U8*) reginfo->strbeg, 7234 (U8*) locinput, 7235 (U8*) reginfo->strend, 7236 utf8_target); 7237 } 7238 break; 7239 7240 case SB_BOUND: /* Always matches at begin and end */ 7241 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7242 match = TRUE; 7243 } 7244 else { 7245 match = isSB(getSB_VAL_UTF8( 7246 reghop3((U8*)locinput, 7247 -1, 7248 (U8*)(reginfo->strbeg)), 7249 (U8*) reginfo->strend), 7250 getSB_VAL_UTF8((U8*) locinput, 7251 (U8*) reginfo->strend), 7252 (U8*) reginfo->strbeg, 7253 (U8*) locinput, 7254 (U8*) reginfo->strend, 7255 utf8_target); 7256 } 7257 break; 7258 7259 case WB_BOUND: 7260 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7261 match = TRUE; 7262 } 7263 else { 7264 match = isWB(WB_UNKNOWN, 7265 getWB_VAL_UTF8( 7266 reghop3((U8*)locinput, 7267 -1, 7268 (U8*)(reginfo->strbeg)), 7269 (U8*) reginfo->strend), 7270 getWB_VAL_UTF8((U8*) locinput, 7271 (U8*) reginfo->strend), 7272 (U8*) reginfo->strbeg, 7273 (U8*) locinput, 7274 (U8*) reginfo->strend, 7275 utf8_target); 7276 } 7277 break; 7278 } 7279 } 7280 else { /* Not utf8 target */ 7281 switch((bound_type) FLAGS(scan)) { 7282 case TRADITIONAL_BOUND: 7283 { 7284 bool b1, b2; 7285 b1 = (locinput == reginfo->strbeg) 7286 ? 0 /* isWORDCHAR_L1('\n') */ 7287 : isWORDCHAR_L1(UCHARAT(locinput - 1)); 7288 b2 = (NEXTCHR_IS_EOS) 7289 ? 0 /* isWORDCHAR_L1('\n') */ 7290 : isWORDCHAR_L1(nextbyte); 7291 match = cBOOL(b1 != b2); 7292 break; 7293 } 7294 7295 case GCB_BOUND: 7296 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7297 match = TRUE; /* GCB always matches at begin and 7298 end */ 7299 } 7300 else { /* Only CR-LF combo isn't a GCB in 0-255 7301 range */ 7302 match = UCHARAT(locinput - 1) != '\r' 7303 || UCHARAT(locinput) != '\n'; 7304 } 7305 break; 7306 7307 case LB_BOUND: 7308 if (locinput == reginfo->strbeg) { 7309 match = FALSE; 7310 } 7311 else if (NEXTCHR_IS_EOS) { 7312 match = TRUE; 7313 } 7314 else { 7315 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)), 7316 getLB_VAL_CP(UCHARAT(locinput)), 7317 (U8*) reginfo->strbeg, 7318 (U8*) locinput, 7319 (U8*) reginfo->strend, 7320 utf8_target); 7321 } 7322 break; 7323 7324 case SB_BOUND: /* Always matches at begin and end */ 7325 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7326 match = TRUE; 7327 } 7328 else { 7329 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)), 7330 getSB_VAL_CP(UCHARAT(locinput)), 7331 (U8*) reginfo->strbeg, 7332 (U8*) locinput, 7333 (U8*) reginfo->strend, 7334 utf8_target); 7335 } 7336 break; 7337 7338 case WB_BOUND: 7339 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7340 match = TRUE; 7341 } 7342 else { 7343 match = isWB(WB_UNKNOWN, 7344 getWB_VAL_CP(UCHARAT(locinput -1)), 7345 getWB_VAL_CP(UCHARAT(locinput)), 7346 (U8*) reginfo->strbeg, 7347 (U8*) locinput, 7348 (U8*) reginfo->strend, 7349 utf8_target); 7350 } 7351 break; 7352 } 7353 } 7354 7355 if (to_complement ^ ! match) { 7356 sayNO; 7357 } 7358 break; 7359 7360 case ANYOFPOSIXL: 7361 case ANYOFL: /* /[abc]/l */ 7362 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 7363 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan); 7364 7365 /* FALLTHROUGH */ 7366 case ANYOFD: /* /[abc]/d */ 7367 case ANYOF: /* /[abc]/ */ 7368 if (NEXTCHR_IS_EOS || locinput >= loceol) 7369 sayNO; 7370 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput)) 7371 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP)) 7372 { 7373 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) { 7374 sayNO; 7375 } 7376 locinput++; 7377 } 7378 else { 7379 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 7380 utf8_target)) 7381 { 7382 sayNO; 7383 } 7384 goto increment_locinput; 7385 } 7386 break; 7387 7388 case ANYOFM: 7389 if ( NEXTCHR_IS_EOS 7390 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan) 7391 || locinput >= loceol) 7392 { 7393 sayNO; 7394 } 7395 locinput++; /* ANYOFM is always single byte */ 7396 break; 7397 7398 case NANYOFM: 7399 if ( NEXTCHR_IS_EOS 7400 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan) 7401 || locinput >= loceol) 7402 { 7403 sayNO; 7404 } 7405 goto increment_locinput; 7406 break; 7407 7408 case ANYOFH: 7409 if ( ! utf8_target 7410 || NEXTCHR_IS_EOS 7411 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 7412 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 7413 utf8_target)) 7414 { 7415 sayNO; 7416 } 7417 goto increment_locinput; 7418 break; 7419 7420 case ANYOFHb: 7421 if ( ! utf8_target 7422 || NEXTCHR_IS_EOS 7423 || ANYOF_FLAGS(scan) != (U8) *locinput 7424 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 7425 utf8_target)) 7426 { 7427 sayNO; 7428 } 7429 goto increment_locinput; 7430 break; 7431 7432 case ANYOFHr: 7433 if ( ! utf8_target 7434 || NEXTCHR_IS_EOS 7435 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), 7436 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), 7437 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) 7438 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 7439 utf8_target)) 7440 { 7441 sayNO; 7442 } 7443 goto increment_locinput; 7444 break; 7445 7446 case ANYOFHs: 7447 if ( ! utf8_target 7448 || NEXTCHR_IS_EOS 7449 || loceol - locinput < FLAGS(scan) 7450 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan)) 7451 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 7452 utf8_target)) 7453 { 7454 sayNO; 7455 } 7456 goto increment_locinput; 7457 break; 7458 7459 case ANYOFR: 7460 if (NEXTCHR_IS_EOS) { 7461 sayNO; 7462 } 7463 7464 if (utf8_target) { 7465 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 7466 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 7467 (U8 *) reginfo->strend, 7468 NULL), 7469 ANYOFRbase(scan), ANYOFRdelta(scan))) 7470 { 7471 sayNO; 7472 } 7473 } 7474 else { 7475 if (! withinCOUNT((U8) *locinput, 7476 ANYOFRbase(scan), ANYOFRdelta(scan))) 7477 { 7478 sayNO; 7479 } 7480 } 7481 goto increment_locinput; 7482 break; 7483 7484 case ANYOFRb: 7485 if (NEXTCHR_IS_EOS) { 7486 sayNO; 7487 } 7488 7489 if (utf8_target) { 7490 if ( ANYOF_FLAGS(scan) != (U8) *locinput 7491 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 7492 (U8 *) reginfo->strend, 7493 NULL), 7494 ANYOFRbase(scan), ANYOFRdelta(scan))) 7495 { 7496 sayNO; 7497 } 7498 } 7499 else { 7500 if (! withinCOUNT((U8) *locinput, 7501 ANYOFRbase(scan), ANYOFRdelta(scan))) 7502 { 7503 sayNO; 7504 } 7505 } 7506 goto increment_locinput; 7507 break; 7508 7509 /* The argument (FLAGS) to all the POSIX node types is the class number 7510 * */ 7511 7512 case NPOSIXL: /* \W or [:^punct:] etc. under /l */ 7513 to_complement = 1; 7514 /* FALLTHROUGH */ 7515 7516 case POSIXL: /* \w or [:punct:] etc. under /l */ 7517 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 7518 if (NEXTCHR_IS_EOS || locinput >= loceol) 7519 sayNO; 7520 7521 /* Use isFOO_lc() for characters within Latin1. (Note that 7522 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 7523 * wouldn't be invariant) */ 7524 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { 7525 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) { 7526 sayNO; 7527 } 7528 7529 locinput++; 7530 break; 7531 } 7532 7533 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7534 /* An above Latin-1 code point, or malformed */ 7535 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 7536 reginfo->strend); 7537 goto utf8_posix_above_latin1; 7538 } 7539 7540 /* Here is a UTF-8 variant code point below 256 and the target is 7541 * UTF-8 */ 7542 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), 7543 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, 7544 *(locinput + 1)))))) 7545 { 7546 sayNO; 7547 } 7548 7549 goto increment_locinput; 7550 7551 case NPOSIXD: /* \W or [:^punct:] etc. under /d */ 7552 to_complement = 1; 7553 /* FALLTHROUGH */ 7554 7555 case POSIXD: /* \w or [:punct:] etc. under /d */ 7556 if (utf8_target) { 7557 goto utf8_posix; 7558 } 7559 goto posixa; 7560 7561 case NPOSIXA: /* \W or [:^punct:] etc. under /a */ 7562 7563 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7564 sayNO; 7565 } 7566 7567 /* All UTF-8 variants match */ 7568 if (! UTF8_IS_INVARIANT(nextbyte)) { 7569 goto increment_locinput; 7570 } 7571 7572 to_complement = 1; 7573 goto join_nposixa; 7574 7575 case POSIXA: /* \w or [:punct:] etc. under /a */ 7576 7577 posixa: 7578 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in 7579 * UTF-8, and also from NPOSIXA even in UTF-8 when the current 7580 * character is a single byte */ 7581 7582 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7583 sayNO; 7584 } 7585 7586 join_nposixa: 7587 7588 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte, 7589 FLAGS(scan))))) 7590 { 7591 sayNO; 7592 } 7593 7594 /* Here we are either not in utf8, or we matched a utf8-invariant, 7595 * so the next char is the next byte */ 7596 locinput++; 7597 break; 7598 7599 case NPOSIXU: /* \W or [:^punct:] etc. under /u */ 7600 to_complement = 1; 7601 /* FALLTHROUGH */ 7602 7603 case POSIXU: /* \w or [:punct:] etc. under /u */ 7604 utf8_posix: 7605 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7606 sayNO; 7607 } 7608 7609 /* Use _generic_isCC() for characters within Latin1. (Note that 7610 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 7611 * wouldn't be invariant) */ 7612 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { 7613 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte, 7614 FLAGS(scan))))) 7615 { 7616 sayNO; 7617 } 7618 locinput++; 7619 } 7620 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7621 if (! (to_complement 7622 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, 7623 *(locinput + 1)), 7624 FLAGS(scan))))) 7625 { 7626 sayNO; 7627 } 7628 locinput += 2; 7629 } 7630 else { /* Handle above Latin-1 code points */ 7631 utf8_posix_above_latin1: 7632 classnum = (_char_class_number) FLAGS(scan); 7633 switch (classnum) { 7634 default: 7635 if (! (to_complement 7636 ^ cBOOL(_invlist_contains_cp( 7637 PL_XPosix_ptrs[classnum], 7638 utf8_to_uvchr_buf((U8 *) locinput, 7639 (U8 *) reginfo->strend, 7640 NULL))))) 7641 { 7642 sayNO; 7643 } 7644 break; 7645 case _CC_ENUM_SPACE: 7646 if (! (to_complement 7647 ^ cBOOL(is_XPERLSPACE_high(locinput)))) 7648 { 7649 sayNO; 7650 } 7651 break; 7652 case _CC_ENUM_BLANK: 7653 if (! (to_complement 7654 ^ cBOOL(is_HORIZWS_high(locinput)))) 7655 { 7656 sayNO; 7657 } 7658 break; 7659 case _CC_ENUM_XDIGIT: 7660 if (! (to_complement 7661 ^ cBOOL(is_XDIGIT_high(locinput)))) 7662 { 7663 sayNO; 7664 } 7665 break; 7666 case _CC_ENUM_VERTSPACE: 7667 if (! (to_complement 7668 ^ cBOOL(is_VERTWS_high(locinput)))) 7669 { 7670 sayNO; 7671 } 7672 break; 7673 case _CC_ENUM_CNTRL: /* These can't match above Latin1 */ 7674 case _CC_ENUM_ASCII: 7675 if (! to_complement) { 7676 sayNO; 7677 } 7678 break; 7679 } 7680 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); 7681 } 7682 break; 7683 7684 case CLUMP: /* Match \X: logical Unicode character. This is defined as 7685 a Unicode extended Grapheme Cluster */ 7686 if (NEXTCHR_IS_EOS || locinput >= loceol) 7687 sayNO; 7688 if (! utf8_target) { 7689 7690 /* Match either CR LF or '.', as all the other possibilities 7691 * require utf8 */ 7692 locinput++; /* Match the . or CR */ 7693 if (nextbyte == '\r' /* And if it was CR, and the next is LF, 7694 match the LF */ 7695 && locinput < loceol 7696 && UCHARAT(locinput) == '\n') 7697 { 7698 locinput++; 7699 } 7700 } 7701 else { 7702 7703 /* Get the gcb type for the current character */ 7704 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput, 7705 (U8*) reginfo->strend); 7706 7707 /* Then scan through the input until we get to the first 7708 * character whose type is supposed to be a gcb with the 7709 * current character. (There is always a break at the 7710 * end-of-input) */ 7711 locinput += UTF8SKIP(locinput); 7712 while (locinput < loceol) { 7713 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput, 7714 (U8*) reginfo->strend); 7715 if (isGCB(prev_gcb, cur_gcb, 7716 (U8*) reginfo->strbeg, (U8*) locinput, 7717 utf8_target)) 7718 { 7719 break; 7720 } 7721 7722 prev_gcb = cur_gcb; 7723 locinput += UTF8SKIP(locinput); 7724 } 7725 7726 7727 } 7728 break; 7729 7730 case REFFLN: /* /\g{name}/il */ 7731 { /* The capture buffer cases. The ones beginning with N for the 7732 named buffers just convert to the equivalent numbered and 7733 pretend they were called as the corresponding numbered buffer 7734 op. */ 7735 /* don't initialize these in the declaration, it makes C++ 7736 unhappy */ 7737 const char *s; 7738 char type; 7739 re_fold_t folder; 7740 const U8 *fold_array; 7741 UV utf8_fold_flags; 7742 7743 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 7744 folder = foldEQ_locale; 7745 fold_array = PL_fold_locale; 7746 type = REFFL; 7747 utf8_fold_flags = FOLDEQ_LOCALE; 7748 goto do_nref; 7749 7750 case REFFAN: /* /\g{name}/iaa */ 7751 folder = foldEQ_latin1; 7752 fold_array = PL_fold_latin1; 7753 type = REFFA; 7754 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7755 goto do_nref; 7756 7757 case REFFUN: /* /\g{name}/iu */ 7758 folder = foldEQ_latin1; 7759 fold_array = PL_fold_latin1; 7760 type = REFFU; 7761 utf8_fold_flags = 0; 7762 goto do_nref; 7763 7764 case REFFN: /* /\g{name}/i */ 7765 folder = foldEQ; 7766 fold_array = PL_fold; 7767 type = REFF; 7768 utf8_fold_flags = 0; 7769 goto do_nref; 7770 7771 case REFN: /* /\g{name}/ */ 7772 type = REF; 7773 folder = NULL; 7774 fold_array = NULL; 7775 utf8_fold_flags = 0; 7776 do_nref: 7777 7778 /* For the named back references, find the corresponding buffer 7779 * number */ 7780 n = reg_check_named_buff_matched(rex,scan); 7781 7782 if ( ! n ) { 7783 sayNO; 7784 } 7785 goto do_nref_ref_common; 7786 7787 case REFFL: /* /\1/il */ 7788 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 7789 folder = foldEQ_locale; 7790 fold_array = PL_fold_locale; 7791 utf8_fold_flags = FOLDEQ_LOCALE; 7792 goto do_ref; 7793 7794 case REFFA: /* /\1/iaa */ 7795 folder = foldEQ_latin1; 7796 fold_array = PL_fold_latin1; 7797 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7798 goto do_ref; 7799 7800 case REFFU: /* /\1/iu */ 7801 folder = foldEQ_latin1; 7802 fold_array = PL_fold_latin1; 7803 utf8_fold_flags = 0; 7804 goto do_ref; 7805 7806 case REFF: /* /\1/i */ 7807 folder = foldEQ; 7808 fold_array = PL_fold; 7809 utf8_fold_flags = 0; 7810 goto do_ref; 7811 7812 case REF: /* /\1/ */ 7813 folder = NULL; 7814 fold_array = NULL; 7815 utf8_fold_flags = 0; 7816 7817 do_ref: 7818 type = OP(scan); 7819 n = ARG(scan); /* which paren pair */ 7820 7821 do_nref_ref_common: 7822 ln = rex->offs[n].start; 7823 endref = rex->offs[n].end; 7824 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 7825 if (rex->lastparen < n || ln == -1 || endref == -1) 7826 sayNO; /* Do not match unless seen CLOSEn. */ 7827 if (ln == endref) 7828 break; 7829 7830 s = reginfo->strbeg + ln; 7831 if (type != REF /* REF can do byte comparison */ 7832 && (utf8_target || type == REFFU || type == REFFL)) 7833 { 7834 char * limit = loceol; 7835 7836 /* This call case insensitively compares the entire buffer 7837 * at s, with the current input starting at locinput, but 7838 * not going off the end given by loceol, and 7839 * returns in <limit> upon success, how much of the 7840 * current input was matched */ 7841 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target, 7842 locinput, &limit, 0, utf8_target, utf8_fold_flags)) 7843 { 7844 sayNO; 7845 } 7846 locinput = limit; 7847 break; 7848 } 7849 7850 /* Not utf8: Inline the first character, for speed. */ 7851 if ( ! NEXTCHR_IS_EOS 7852 && locinput < loceol 7853 && UCHARAT(s) != nextbyte 7854 && ( type == REF 7855 || UCHARAT(s) != fold_array[nextbyte])) 7856 { 7857 sayNO; 7858 } 7859 ln = endref - ln; 7860 if (locinput + ln > loceol) 7861 sayNO; 7862 if (ln > 1 && (type == REF 7863 ? memNE(s, locinput, ln) 7864 : ! folder(locinput, s, ln))) 7865 sayNO; 7866 locinput += ln; 7867 break; 7868 } 7869 7870 case NOTHING: /* null op; e.g. the 'nothing' following 7871 * the '*' in m{(a+|b)*}' */ 7872 break; 7873 case TAIL: /* placeholder while compiling (A|B|C) */ 7874 break; 7875 7876 #undef ST 7877 #define ST st->u.eval 7878 #define CUR_EVAL cur_eval->u.eval 7879 7880 { 7881 SV *ret; 7882 REGEXP *re_sv; 7883 regexp *re; 7884 regexp_internal *rei; 7885 regnode *startpoint; 7886 U32 arg; 7887 7888 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ 7889 arg= (U32)ARG(scan); 7890 if (cur_eval && cur_eval->locinput == locinput) { 7891 if ( ++nochange_depth > max_nochange_depth ) 7892 Perl_croak(aTHX_ 7893 "Pattern subroutine nesting without pos change" 7894 " exceeded limit in regex"); 7895 } else { 7896 nochange_depth = 0; 7897 } 7898 re_sv = rex_sv; 7899 re = rex; 7900 rei = rexi; 7901 startpoint = scan + ARG2L(scan); 7902 EVAL_CLOSE_PAREN_SET( st, arg ); 7903 /* Detect infinite recursion 7904 * 7905 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/ 7906 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever. 7907 * So we track the position in the string we are at each time 7908 * we recurse and if we try to enter the same routine twice from 7909 * the same position we throw an error. 7910 */ 7911 if ( rex->recurse_locinput[arg] == locinput ) { 7912 /* FIXME: we should show the regop that is failing as part 7913 * of the error message. */ 7914 Perl_croak(aTHX_ "Infinite recursion in regex"); 7915 } else { 7916 ST.prev_recurse_locinput= rex->recurse_locinput[arg]; 7917 rex->recurse_locinput[arg]= locinput; 7918 7919 DEBUG_r({ 7920 DECLARE_AND_GET_RE_DEBUG_FLAGS; 7921 DEBUG_STACK_r({ 7922 Perl_re_exec_indentf( aTHX_ 7923 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n", 7924 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg] 7925 ); 7926 }); 7927 }); 7928 } 7929 7930 /* Save all the positions seen so far. */ 7931 ST.cp = regcppush(rex, 0, maxopenparen); 7932 REGCP_SET(ST.lastcp); 7933 7934 /* and then jump to the code we share with EVAL */ 7935 goto eval_recurse_doit; 7936 /* NOTREACHED */ 7937 7938 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ 7939 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) { 7940 if ( ++nochange_depth > max_nochange_depth ) 7941 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); 7942 } else { 7943 nochange_depth = 0; 7944 } 7945 { 7946 /* execute the code in the {...} */ 7947 7948 dSP; 7949 IV before; 7950 OP * const oop = PL_op; 7951 COP * const ocurcop = PL_curcop; 7952 OP *nop; 7953 CV *newcv; 7954 7955 /* save *all* paren positions */ 7956 regcppush(rex, 0, maxopenparen); 7957 REGCP_SET(ST.lastcp); 7958 7959 if (!caller_cv) 7960 caller_cv = find_runcv(NULL); 7961 7962 n = ARG(scan); 7963 7964 if (rexi->data->what[n] == 'r') { /* code from an external qr */ 7965 newcv = (ReANY( 7966 (REGEXP*)(rexi->data->data[n]) 7967 ))->qr_anoncv; 7968 nop = (OP*)rexi->data->data[n+1]; 7969 } 7970 else if (rexi->data->what[n] == 'l') { /* literal code */ 7971 newcv = caller_cv; 7972 nop = (OP*)rexi->data->data[n]; 7973 assert(CvDEPTH(newcv)); 7974 } 7975 else { 7976 /* literal with own CV */ 7977 assert(rexi->data->what[n] == 'L'); 7978 newcv = rex->qr_anoncv; 7979 nop = (OP*)rexi->data->data[n]; 7980 } 7981 7982 /* Some notes about MULTICALL and the context and save stacks. 7983 * 7984 * In something like 7985 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../ 7986 * since codeblocks don't introduce a new scope (so that 7987 * local() etc accumulate), at the end of a successful 7988 * match there will be a SAVEt_CLEARSV on the savestack 7989 * for each of $x, $y, $z. If the three code blocks above 7990 * happen to have come from different CVs (e.g. via 7991 * embedded qr//s), then we must ensure that during any 7992 * savestack unwinding, PL_comppad always points to the 7993 * right pad at each moment. We achieve this by 7994 * interleaving SAVEt_COMPPAD's on the savestack whenever 7995 * there is a change of pad. 7996 * In theory whenever we call a code block, we should 7997 * push a CXt_SUB context, then pop it on return from 7998 * that code block. This causes a bit of an issue in that 7999 * normally popping a context also clears the savestack 8000 * back to cx->blk_oldsaveix, but here we specifically 8001 * don't want to clear the save stack on exit from the 8002 * code block. 8003 * Also for efficiency we don't want to keep pushing and 8004 * popping the single SUB context as we backtrack etc. 8005 * So instead, we push a single context the first time 8006 * we need, it, then hang onto it until the end of this 8007 * function. Whenever we encounter a new code block, we 8008 * update the CV etc if that's changed. During the times 8009 * in this function where we're not executing a code 8010 * block, having the SUB context still there is a bit 8011 * naughty - but we hope that no-one notices. 8012 * When the SUB context is initially pushed, we fake up 8013 * cx->blk_oldsaveix to be as if we'd pushed this context 8014 * on first entry to S_regmatch rather than at some random 8015 * point during the regexe execution. That way if we 8016 * croak, popping the context stack will ensure that 8017 * *everything* SAVEd by this function is undone and then 8018 * the context popped, rather than e.g., popping the 8019 * context (and restoring the original PL_comppad) then 8020 * popping more of the savestack and restoring a bad 8021 * PL_comppad. 8022 */ 8023 8024 /* If this is the first EVAL, push a MULTICALL. On 8025 * subsequent calls, if we're executing a different CV, or 8026 * if PL_comppad has got messed up from backtracking 8027 * through SAVECOMPPADs, then refresh the context. 8028 */ 8029 if (newcv != last_pushed_cv || PL_comppad != last_pad) 8030 { 8031 U8 flags = (CXp_SUB_RE | 8032 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); 8033 SAVECOMPPAD(); 8034 if (last_pushed_cv) { 8035 CHANGE_MULTICALL_FLAGS(newcv, flags); 8036 } 8037 else { 8038 PUSH_MULTICALL_FLAGS(newcv, flags); 8039 } 8040 /* see notes above */ 8041 CX_CUR()->blk_oldsaveix = orig_savestack_ix; 8042 8043 last_pushed_cv = newcv; 8044 } 8045 else { 8046 /* these assignments are just to silence compiler 8047 * warnings */ 8048 multicall_cop = NULL; 8049 } 8050 last_pad = PL_comppad; 8051 8052 /* the initial nextstate you would normally execute 8053 * at the start of an eval (which would cause error 8054 * messages to come from the eval), may be optimised 8055 * away from the execution path in the regex code blocks; 8056 * so manually set PL_curcop to it initially */ 8057 { 8058 OP *o = cUNOPx(nop)->op_first; 8059 assert(o->op_type == OP_NULL); 8060 if (o->op_targ == OP_SCOPE) { 8061 o = cUNOPo->op_first; 8062 } 8063 else { 8064 assert(o->op_targ == OP_LEAVE); 8065 o = cUNOPo->op_first; 8066 assert(o->op_type == OP_ENTER); 8067 o = OpSIBLING(o); 8068 } 8069 8070 if (o->op_type != OP_STUB) { 8071 assert( o->op_type == OP_NEXTSTATE 8072 || o->op_type == OP_DBSTATE 8073 || (o->op_type == OP_NULL 8074 && ( o->op_targ == OP_NEXTSTATE 8075 || o->op_targ == OP_DBSTATE 8076 ) 8077 ) 8078 ); 8079 PL_curcop = (COP*)o; 8080 } 8081 } 8082 nop = nop->op_next; 8083 8084 DEBUG_STATE_r( Perl_re_printf( aTHX_ 8085 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) ); 8086 8087 rex->offs[0].end = locinput - reginfo->strbeg; 8088 if (reginfo->info_aux_eval->pos_magic) 8089 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, 8090 reginfo->sv, reginfo->strbeg, 8091 locinput - reginfo->strbeg); 8092 8093 if (sv_yes_mark) { 8094 SV *sv_mrk = get_sv("REGMARK", 1); 8095 sv_setsv(sv_mrk, sv_yes_mark); 8096 } 8097 8098 /* we don't use MULTICALL here as we want to call the 8099 * first op of the block of interest, rather than the 8100 * first op of the sub. Also, we don't want to free 8101 * the savestack frame */ 8102 before = (IV)(SP-PL_stack_base); 8103 PL_op = nop; 8104 CALLRUNOPS(aTHX); /* Scalar context. */ 8105 SPAGAIN; 8106 if ((IV)(SP-PL_stack_base) == before) 8107 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 8108 else { 8109 ret = POPs; 8110 PUTBACK; 8111 } 8112 8113 /* before restoring everything, evaluate the returned 8114 * value, so that 'uninit' warnings don't use the wrong 8115 * PL_op or pad. Also need to process any magic vars 8116 * (e.g. $1) *before* parentheses are restored */ 8117 8118 PL_op = NULL; 8119 8120 re_sv = NULL; 8121 if (logical == 0) { /* (?{})/ */ 8122 SV *replsv = save_scalar(PL_replgv); 8123 sv_setsv(replsv, ret); /* $^R */ 8124 SvSETMAGIC(replsv); 8125 } 8126 else if (logical == 1) { /* /(?(?{...})X|Y)/ */ 8127 sw = cBOOL(SvTRUE_NN(ret)); 8128 logical = 0; 8129 } 8130 else { /* /(??{}) */ 8131 /* if its overloaded, let the regex compiler handle 8132 * it; otherwise extract regex, or stringify */ 8133 if (SvGMAGICAL(ret)) 8134 ret = sv_mortalcopy(ret); 8135 if (!SvAMAGIC(ret)) { 8136 SV *sv = ret; 8137 if (SvROK(sv)) 8138 sv = SvRV(sv); 8139 if (SvTYPE(sv) == SVt_REGEXP) 8140 re_sv = (REGEXP*) sv; 8141 else if (SvSMAGICAL(ret)) { 8142 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); 8143 if (mg) 8144 re_sv = (REGEXP *) mg->mg_obj; 8145 } 8146 8147 /* force any undef warnings here */ 8148 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { 8149 ret = sv_mortalcopy(ret); 8150 (void) SvPV_force_nolen(ret); 8151 } 8152 } 8153 8154 } 8155 8156 /* *** Note that at this point we don't restore 8157 * PL_comppad, (or pop the CxSUB) on the assumption it may 8158 * be used again soon. This is safe as long as nothing 8159 * in the regexp code uses the pad ! */ 8160 PL_op = oop; 8161 PL_curcop = ocurcop; 8162 regcp_restore(rex, ST.lastcp, &maxopenparen); 8163 PL_curpm_under = PL_curpm; 8164 PL_curpm = PL_reg_curpm; 8165 8166 if (logical != 2) { 8167 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, 8168 script_run_begin); 8169 /* NOTREACHED */ 8170 } 8171 } 8172 8173 /* only /(??{})/ from now on */ 8174 logical = 0; 8175 { 8176 /* extract RE object from returned value; compiling if 8177 * necessary */ 8178 8179 if (re_sv) { 8180 re_sv = reg_temp_copy(NULL, re_sv); 8181 } 8182 else { 8183 U32 pm_flags = 0; 8184 8185 if (SvUTF8(ret) && IN_BYTES) { 8186 /* In use 'bytes': make a copy of the octet 8187 * sequence, but without the flag on */ 8188 STRLEN len; 8189 const char *const p = SvPV(ret, len); 8190 ret = newSVpvn_flags(p, len, SVs_TEMP); 8191 } 8192 if (rex->intflags & PREGf_USE_RE_EVAL) 8193 pm_flags |= PMf_USE_RE_EVAL; 8194 8195 /* if we got here, it should be an engine which 8196 * supports compiling code blocks and stuff */ 8197 assert(rex->engine && rex->engine->op_comp); 8198 assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); 8199 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, 8200 rex->engine, NULL, NULL, 8201 /* copy /msixn etc to inner pattern */ 8202 ARG2L(scan), 8203 pm_flags); 8204 8205 if (!(SvFLAGS(ret) 8206 & (SVs_TEMP | SVs_GMG | SVf_ROK)) 8207 && (!SvPADTMP(ret) || SvREADONLY(ret))) { 8208 /* This isn't a first class regexp. Instead, it's 8209 caching a regexp onto an existing, Perl visible 8210 scalar. */ 8211 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); 8212 } 8213 } 8214 SAVEFREESV(re_sv); 8215 re = ReANY(re_sv); 8216 } 8217 RXp_MATCH_COPIED_off(re); 8218 re->subbeg = rex->subbeg; 8219 re->sublen = rex->sublen; 8220 re->suboffset = rex->suboffset; 8221 re->subcoffset = rex->subcoffset; 8222 re->lastparen = 0; 8223 re->lastcloseparen = 0; 8224 rei = RXi_GET(re); 8225 DEBUG_EXECUTE_r( 8226 debug_start_match(re_sv, utf8_target, locinput, 8227 reginfo->strend, "EVAL/GOSUB: Matching embedded"); 8228 ); 8229 startpoint = rei->program + 1; 8230 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0; 8231 * close_paren only for GOSUB */ 8232 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */ 8233 /* Save all the seen positions so far. */ 8234 ST.cp = regcppush(rex, 0, maxopenparen); 8235 REGCP_SET(ST.lastcp); 8236 /* and set maxopenparen to 0, since we are starting a "fresh" match */ 8237 maxopenparen = 0; 8238 /* run the pattern returned from (??{...}) */ 8239 8240 eval_recurse_doit: /* Share code with GOSUB below this line 8241 * At this point we expect the stack context to be 8242 * set up correctly */ 8243 8244 /* invalidate the S-L poscache. We're now executing a 8245 * different set of WHILEM ops (and their associated 8246 * indexes) against the same string, so the bits in the 8247 * cache are meaningless. Setting maxiter to zero forces 8248 * the cache to be invalidated and zeroed before reuse. 8249 * XXX This is too dramatic a measure. Ideally we should 8250 * save the old cache and restore when running the outer 8251 * pattern again */ 8252 reginfo->poscache_maxiter = 0; 8253 8254 /* the new regexp might have a different is_utf8_pat than we do */ 8255 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); 8256 8257 ST.prev_rex = rex_sv; 8258 ST.prev_curlyx = cur_curlyx; 8259 rex_sv = re_sv; 8260 SET_reg_curpm(rex_sv); 8261 rex = re; 8262 rexi = rei; 8263 cur_curlyx = NULL; 8264 ST.B = next; 8265 ST.prev_eval = cur_eval; 8266 cur_eval = st; 8267 /* now continue from first node in postoned RE */ 8268 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, 8269 loceol, script_run_begin); 8270 NOT_REACHED; /* NOTREACHED */ 8271 } 8272 8273 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */ 8274 /* note: this is called twice; first after popping B, then A */ 8275 DEBUG_STACK_r({ 8276 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n", 8277 depth, cur_eval, ST.prev_eval); 8278 }); 8279 8280 #define SET_RECURSE_LOCINPUT(STR,VAL)\ 8281 if ( cur_eval && CUR_EVAL.close_paren ) {\ 8282 DEBUG_STACK_r({ \ 8283 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\ 8284 depth, \ 8285 CUR_EVAL.close_paren - 1,\ 8286 cur_eval, \ 8287 VAL); \ 8288 }); \ 8289 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\ 8290 } 8291 8292 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput); 8293 8294 rex_sv = ST.prev_rex; 8295 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 8296 SET_reg_curpm(rex_sv); 8297 rex = ReANY(rex_sv); 8298 rexi = RXi_GET(rex); 8299 { 8300 /* preserve $^R across LEAVE's. See Bug 121070. */ 8301 SV *save_sv= GvSV(PL_replgv); 8302 SV *replsv; 8303 SvREFCNT_inc(save_sv); 8304 regcpblow(ST.cp); /* LEAVE in disguise */ 8305 /* don't move this initialization up */ 8306 replsv = GvSV(PL_replgv); 8307 sv_setsv(replsv, save_sv); 8308 SvSETMAGIC(replsv); 8309 SvREFCNT_dec(save_sv); 8310 } 8311 cur_eval = ST.prev_eval; 8312 cur_curlyx = ST.prev_curlyx; 8313 8314 /* Invalidate cache. See "invalidate" comment above. */ 8315 reginfo->poscache_maxiter = 0; 8316 if ( nochange_depth ) 8317 nochange_depth--; 8318 8319 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput); 8320 sayYES; 8321 8322 8323 case EVAL_B_fail: /* unsuccessful B in (?{...})B */ 8324 REGCP_UNWIND(ST.lastcp); 8325 sayNO; 8326 8327 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ 8328 /* note: this is called twice; first after popping B, then A */ 8329 DEBUG_STACK_r({ 8330 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n", 8331 depth, cur_eval, ST.prev_eval); 8332 }); 8333 8334 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput); 8335 8336 rex_sv = ST.prev_rex; 8337 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 8338 SET_reg_curpm(rex_sv); 8339 rex = ReANY(rex_sv); 8340 rexi = RXi_GET(rex); 8341 8342 REGCP_UNWIND(ST.lastcp); 8343 regcppop(rex, &maxopenparen); 8344 cur_eval = ST.prev_eval; 8345 cur_curlyx = ST.prev_curlyx; 8346 8347 /* Invalidate cache. See "invalidate" comment above. */ 8348 reginfo->poscache_maxiter = 0; 8349 if ( nochange_depth ) 8350 nochange_depth--; 8351 8352 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput); 8353 sayNO_SILENT; 8354 #undef ST 8355 8356 case OPEN: /* ( */ 8357 n = ARG(scan); /* which paren pair */ 8358 rex->offs[n].start_tmp = locinput - reginfo->strbeg; 8359 if (n > maxopenparen) 8360 maxopenparen = n; 8361 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 8362 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n", 8363 depth, 8364 PTR2UV(rex), 8365 PTR2UV(rex->offs), 8366 (UV)n, 8367 (IV)rex->offs[n].start_tmp, 8368 (UV)maxopenparen 8369 )); 8370 lastopen = n; 8371 break; 8372 8373 case SROPEN: /* (*SCRIPT_RUN: */ 8374 script_run_begin = (U8 *) locinput; 8375 break; 8376 8377 8378 case CLOSE: /* ) */ 8379 n = ARG(scan); /* which paren pair */ 8380 CLOSE_CAPTURE(n, rex->offs[n].start_tmp, 8381 locinput - reginfo->strbeg); 8382 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) ) 8383 goto fake_end; 8384 8385 break; 8386 8387 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */ 8388 8389 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target)) 8390 { 8391 sayNO; 8392 } 8393 8394 break; 8395 8396 8397 case ACCEPT: /* (*ACCEPT) */ 8398 is_accepted = true; 8399 if (scan->flags) 8400 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 8401 utmp = (U32)ARG2L(scan); 8402 8403 if ( utmp ) { 8404 regnode *cursor; 8405 for ( 8406 cursor = scan; 8407 cursor && ( OP(cursor) != END ); 8408 cursor = ( PL_regkind[ OP(cursor) ] == END ) 8409 ? NEXTOPER(cursor) 8410 : regnext(cursor) 8411 ){ 8412 if ( OP(cursor) != CLOSE ) 8413 continue; 8414 8415 n = ARG(cursor); 8416 8417 if ( n > lastopen ) /* might be OPEN/CLOSE in the way */ 8418 continue; /* so skip this one */ 8419 8420 CLOSE_CAPTURE(n, rex->offs[n].start_tmp, 8421 locinput - reginfo->strbeg); 8422 8423 if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) ) 8424 break; 8425 } 8426 } 8427 goto fake_end; 8428 /* NOTREACHED */ 8429 8430 case GROUPP: /* (?(1)) */ 8431 n = ARG(scan); /* which paren pair */ 8432 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); 8433 break; 8434 8435 case GROUPPN: /* (?(<name>)) */ 8436 /* reg_check_named_buff_matched returns 0 for no match */ 8437 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); 8438 break; 8439 8440 case INSUBP: /* (?(R)) */ 8441 n = ARG(scan); 8442 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg 8443 * of SCAN is already set up as matches a eval.close_paren */ 8444 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n); 8445 break; 8446 8447 case DEFINEP: /* (?(DEFINE)) */ 8448 sw = 0; 8449 break; 8450 8451 case IFTHEN: /* (?(cond)A|B) */ 8452 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 8453 if (sw) 8454 next = NEXTOPER(NEXTOPER(scan)); 8455 else { 8456 next = scan + ARG(scan); 8457 if (OP(next) == IFTHEN) /* Fake one. */ 8458 next = NEXTOPER(NEXTOPER(next)); 8459 } 8460 break; 8461 8462 case LOGICAL: /* modifier for EVAL and IFMATCH */ 8463 logical = scan->flags; 8464 break; 8465 8466 /******************************************************************* 8467 8468 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ 8469 pattern, where A and B are subpatterns. (For simple A, CURLYM or 8470 STAR/PLUS/CURLY/CURLYN are used instead.) 8471 8472 A*B is compiled as <CURLYX><A><WHILEM><B> 8473 8474 On entry to the subpattern, CURLYX is called. This pushes a CURLYX 8475 state, which contains the current count, initialised to -1. It also sets 8476 cur_curlyx to point to this state, with any previous value saved in the 8477 state block. 8478 8479 CURLYX then jumps straight to the WHILEM op, rather than executing A, 8480 since the pattern may possibly match zero times (i.e. it's a while {} loop 8481 rather than a do {} while loop). 8482 8483 Each entry to WHILEM represents a successful match of A. The count in the 8484 CURLYX block is incremented, another WHILEM state is pushed, and execution 8485 passes to A or B depending on greediness and the current count. 8486 8487 For example, if matching against the string a1a2a3b (where the aN are 8488 substrings that match /A/), then the match progresses as follows: (the 8489 pushed states are interspersed with the bits of strings matched so far): 8490 8491 <CURLYX cnt=-1> 8492 <CURLYX cnt=0><WHILEM> 8493 <CURLYX cnt=1><WHILEM> a1 <WHILEM> 8494 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM> 8495 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> 8496 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b 8497 8498 (Contrast this with something like CURLYM, which maintains only a single 8499 backtrack state: 8500 8501 <CURLYM cnt=0> a1 8502 a1 <CURLYM cnt=1> a2 8503 a1 a2 <CURLYM cnt=2> a3 8504 a1 a2 a3 <CURLYM cnt=3> b 8505 ) 8506 8507 Each WHILEM state block marks a point to backtrack to upon partial failure 8508 of A or B, and also contains some minor state data related to that 8509 iteration. The CURLYX block, pointed to by cur_curlyx, contains the 8510 overall state, such as the count, and pointers to the A and B ops. 8511 8512 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx 8513 must always point to the *current* CURLYX block, the rules are: 8514 8515 When executing CURLYX, save the old cur_curlyx in the CURLYX state block, 8516 and set cur_curlyx to point the new block. 8517 8518 When popping the CURLYX block after a successful or unsuccessful match, 8519 restore the previous cur_curlyx. 8520 8521 When WHILEM is about to execute B, save the current cur_curlyx, and set it 8522 to the outer one saved in the CURLYX block. 8523 8524 When popping the WHILEM block after a successful or unsuccessful B match, 8525 restore the previous cur_curlyx. 8526 8527 Here's an example for the pattern (AI* BI)*BO 8528 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: 8529 8530 cur_ 8531 curlyx backtrack stack 8532 ------ --------------- 8533 NULL 8534 CO <CO prev=NULL> <WO> 8535 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 8536 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8537 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo 8538 8539 At this point the pattern succeeds, and we work back down the stack to 8540 clean up, restoring as we go: 8541 8542 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8543 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 8544 CO <CO prev=NULL> <WO> 8545 NULL 8546 8547 *******************************************************************/ 8548 8549 #define ST st->u.curlyx 8550 8551 case CURLYX: /* start of /A*B/ (for complex A) */ 8552 { 8553 /* No need to save/restore up to this paren */ 8554 I32 parenfloor = scan->flags; 8555 8556 assert(next); /* keep Coverity happy */ 8557 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ 8558 next += ARG(next); 8559 8560 /* XXXX Probably it is better to teach regpush to support 8561 parenfloor > maxopenparen ... */ 8562 if (parenfloor > (I32)rex->lastparen) 8563 parenfloor = rex->lastparen; /* Pessimization... */ 8564 8565 ST.prev_curlyx= cur_curlyx; 8566 cur_curlyx = st; 8567 ST.cp = PL_savestack_ix; 8568 8569 /* these fields contain the state of the current curly. 8570 * they are accessed by subsequent WHILEMs */ 8571 ST.parenfloor = parenfloor; 8572 ST.me = scan; 8573 ST.B = next; 8574 ST.minmod = minmod; 8575 minmod = 0; 8576 ST.count = -1; /* this will be updated by WHILEM */ 8577 ST.lastloc = NULL; /* this will be updated by WHILEM */ 8578 8579 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol, 8580 script_run_begin); 8581 NOT_REACHED; /* NOTREACHED */ 8582 } 8583 8584 case CURLYX_end: /* just finished matching all of A*B */ 8585 cur_curlyx = ST.prev_curlyx; 8586 sayYES; 8587 NOT_REACHED; /* NOTREACHED */ 8588 8589 case CURLYX_end_fail: /* just failed to match all of A*B */ 8590 regcpblow(ST.cp); 8591 cur_curlyx = ST.prev_curlyx; 8592 sayNO; 8593 NOT_REACHED; /* NOTREACHED */ 8594 8595 8596 #undef ST 8597 #define ST st->u.whilem 8598 8599 case WHILEM: /* just matched an A in /A*B/ (for complex A) */ 8600 { 8601 /* see the discussion above about CURLYX/WHILEM */ 8602 I32 n; 8603 int min, max; 8604 regnode *A; 8605 8606 assert(cur_curlyx); /* keep Coverity happy */ 8607 8608 min = ARG1(cur_curlyx->u.curlyx.me); 8609 max = ARG2(cur_curlyx->u.curlyx.me); 8610 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; 8611 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ 8612 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; 8613 ST.cache_offset = 0; 8614 ST.cache_mask = 0; 8615 8616 8617 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n", 8618 depth, (long)n, min, max) 8619 ); 8620 8621 /* First just match a string of min A's. */ 8622 8623 if (n < min) { 8624 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen); 8625 cur_curlyx->u.curlyx.lastloc = locinput; 8626 REGCP_SET(ST.lastcp); 8627 8628 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, 8629 script_run_begin); 8630 NOT_REACHED; /* NOTREACHED */ 8631 } 8632 8633 /* If degenerate A matches "", assume A done. */ 8634 8635 if (locinput == cur_curlyx->u.curlyx.lastloc) { 8636 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n", 8637 depth) 8638 ); 8639 goto do_whilem_B_max; 8640 } 8641 8642 /* super-linear cache processing. 8643 * 8644 * The idea here is that for certain types of CURLYX/WHILEM - 8645 * principally those whose upper bound is infinity (and 8646 * excluding regexes that have things like \1 and other very 8647 * non-regular expresssiony things), then if a pattern like 8648 * /....A*.../ fails and we backtrack to the WHILEM, then we 8649 * make a note that this particular WHILEM op was at string 8650 * position 47 (say) when the rest of pattern failed. Then, if 8651 * we ever find ourselves back at that WHILEM, and at string 8652 * position 47 again, we can just fail immediately rather than 8653 * running the rest of the pattern again. 8654 * 8655 * This is very handy when patterns start to go 8656 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up 8657 * with a combinatorial explosion of backtracking. 8658 * 8659 * The cache is implemented as a bit array, with one bit per 8660 * string byte position per WHILEM op (up to 16) - so its 8661 * between 0.25 and 2x the string size. 8662 * 8663 * To avoid allocating a poscache buffer every time, we do an 8664 * initially countdown; only after we have executed a WHILEM 8665 * op (string-length x #WHILEMs) times do we allocate the 8666 * cache. 8667 * 8668 * The top 4 bits of scan->flags byte say how many different 8669 * relevant CURLLYX/WHILEM op pairs there are, while the 8670 * bottom 4-bits is the identifying index number of this 8671 * WHILEM. 8672 */ 8673 8674 if (scan->flags) { 8675 8676 if (!reginfo->poscache_maxiter) { 8677 /* start the countdown: Postpone detection until we 8678 * know the match is not *that* much linear. */ 8679 reginfo->poscache_maxiter 8680 = (reginfo->strend - reginfo->strbeg + 1) 8681 * (scan->flags>>4); 8682 /* possible overflow for long strings and many CURLYX's */ 8683 if (reginfo->poscache_maxiter < 0) 8684 reginfo->poscache_maxiter = I32_MAX; 8685 reginfo->poscache_iter = reginfo->poscache_maxiter; 8686 } 8687 8688 if (reginfo->poscache_iter-- == 0) { 8689 /* initialise cache */ 8690 const SSize_t size = (reginfo->poscache_maxiter + 7)/8; 8691 regmatch_info_aux *const aux = reginfo->info_aux; 8692 if (aux->poscache) { 8693 if ((SSize_t)reginfo->poscache_size < size) { 8694 Renew(aux->poscache, size, char); 8695 reginfo->poscache_size = size; 8696 } 8697 Zero(aux->poscache, size, char); 8698 } 8699 else { 8700 reginfo->poscache_size = size; 8701 Newxz(aux->poscache, size, char); 8702 } 8703 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 8704 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n", 8705 PL_colors[4], PL_colors[5]) 8706 ); 8707 } 8708 8709 if (reginfo->poscache_iter < 0) { 8710 /* have we already failed at this position? */ 8711 SSize_t offset, mask; 8712 8713 reginfo->poscache_iter = -1; /* stop eventual underflow */ 8714 offset = (scan->flags & 0xf) - 1 8715 + (locinput - reginfo->strbeg) 8716 * (scan->flags>>4); 8717 mask = 1 << (offset % 8); 8718 offset /= 8; 8719 if (reginfo->info_aux->poscache[offset] & mask) { 8720 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n", 8721 depth) 8722 ); 8723 cur_curlyx->u.curlyx.count--; 8724 sayNO; /* cache records failure */ 8725 } 8726 ST.cache_offset = offset; 8727 ST.cache_mask = mask; 8728 } 8729 } 8730 8731 /* Prefer B over A for minimal matching. */ 8732 8733 if (cur_curlyx->u.curlyx.minmod) { 8734 ST.save_curlyx = cur_curlyx; 8735 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 8736 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, 8737 locinput, loceol, script_run_begin); 8738 NOT_REACHED; /* NOTREACHED */ 8739 } 8740 8741 /* Prefer A over B for maximal matching. */ 8742 8743 if (n < max) { /* More greed allowed? */ 8744 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 8745 maxopenparen); 8746 cur_curlyx->u.curlyx.lastloc = locinput; 8747 REGCP_SET(ST.lastcp); 8748 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, 8749 script_run_begin); 8750 NOT_REACHED; /* NOTREACHED */ 8751 } 8752 goto do_whilem_B_max; 8753 } 8754 NOT_REACHED; /* NOTREACHED */ 8755 8756 case WHILEM_B_min: /* just matched B in a minimal match */ 8757 case WHILEM_B_max: /* just matched B in a maximal match */ 8758 cur_curlyx = ST.save_curlyx; 8759 sayYES; 8760 NOT_REACHED; /* NOTREACHED */ 8761 8762 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ 8763 cur_curlyx = ST.save_curlyx; 8764 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 8765 cur_curlyx->u.curlyx.count--; 8766 CACHEsayNO; 8767 NOT_REACHED; /* NOTREACHED */ 8768 8769 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ 8770 /* FALLTHROUGH */ 8771 case WHILEM_A_pre_fail: /* just failed to match even minimal A */ 8772 REGCP_UNWIND(ST.lastcp); 8773 regcppop(rex, &maxopenparen); 8774 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 8775 cur_curlyx->u.curlyx.count--; 8776 CACHEsayNO; 8777 NOT_REACHED; /* NOTREACHED */ 8778 8779 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ 8780 REGCP_UNWIND(ST.lastcp); 8781 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 8782 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n", 8783 depth) 8784 ); 8785 do_whilem_B_max: 8786 if (cur_curlyx->u.curlyx.count >= REG_INFTY 8787 && ckWARN(WARN_REGEXP) 8788 && !reginfo->warned) 8789 { 8790 reginfo->warned = TRUE; 8791 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 8792 "Complex regular subexpression recursion limit (%d) " 8793 "exceeded", 8794 REG_INFTY - 1); 8795 } 8796 8797 /* now try B */ 8798 ST.save_curlyx = cur_curlyx; 8799 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 8800 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, 8801 locinput, loceol, script_run_begin); 8802 NOT_REACHED; /* NOTREACHED */ 8803 8804 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ 8805 cur_curlyx = ST.save_curlyx; 8806 8807 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { 8808 /* Maximum greed exceeded */ 8809 if (cur_curlyx->u.curlyx.count >= REG_INFTY 8810 && ckWARN(WARN_REGEXP) 8811 && !reginfo->warned) 8812 { 8813 reginfo->warned = TRUE; 8814 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 8815 "Complex regular subexpression recursion " 8816 "limit (%d) exceeded", 8817 REG_INFTY - 1); 8818 } 8819 cur_curlyx->u.curlyx.count--; 8820 CACHEsayNO; 8821 } 8822 8823 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth) 8824 ); 8825 /* Try grabbing another A and see if it helps. */ 8826 cur_curlyx->u.curlyx.lastloc = locinput; 8827 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 8828 maxopenparen); 8829 REGCP_SET(ST.lastcp); 8830 PUSH_STATE_GOTO(WHILEM_A_min, 8831 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, 8832 locinput, loceol, script_run_begin); 8833 NOT_REACHED; /* NOTREACHED */ 8834 8835 #undef ST 8836 #define ST st->u.branch 8837 8838 case BRANCHJ: /* /(...|A|...)/ with long next pointer */ 8839 next = scan + ARG(scan); 8840 if (next == scan) 8841 next = NULL; 8842 scan = NEXTOPER(scan); 8843 /* FALLTHROUGH */ 8844 8845 case BRANCH: /* /(...|A|...)/ */ 8846 scan = NEXTOPER(scan); /* scan now points to inner node */ 8847 ST.lastparen = rex->lastparen; 8848 ST.lastcloseparen = rex->lastcloseparen; 8849 ST.next_branch = next; 8850 REGCP_SET(ST.cp); 8851 8852 /* Now go into the branch */ 8853 if (has_cutgroup) { 8854 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 8855 script_run_begin); 8856 } else { 8857 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 8858 script_run_begin); 8859 } 8860 NOT_REACHED; /* NOTREACHED */ 8861 8862 case CUTGROUP: /* /(*THEN)/ */ 8863 sv_yes_mark = st->u.mark.mark_name = scan->flags 8864 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ]) 8865 : NULL; 8866 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, 8867 script_run_begin); 8868 NOT_REACHED; /* NOTREACHED */ 8869 8870 case CUTGROUP_next_fail: 8871 do_cutgroup = 1; 8872 no_final = 1; 8873 if (st->u.mark.mark_name) 8874 sv_commit = st->u.mark.mark_name; 8875 sayNO; 8876 NOT_REACHED; /* NOTREACHED */ 8877 8878 case BRANCH_next: 8879 sayYES; 8880 NOT_REACHED; /* NOTREACHED */ 8881 8882 case BRANCH_next_fail: /* that branch failed; try the next, if any */ 8883 if (do_cutgroup) { 8884 do_cutgroup = 0; 8885 no_final = 0; 8886 } 8887 REGCP_UNWIND(ST.cp); 8888 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 8889 scan = ST.next_branch; 8890 /* no more branches? */ 8891 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { 8892 DEBUG_EXECUTE_r({ 8893 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n", 8894 depth, 8895 PL_colors[4], 8896 PL_colors[5] ); 8897 }); 8898 sayNO_SILENT; 8899 } 8900 continue; /* execute next BRANCH[J] op */ 8901 /* NOTREACHED */ 8902 8903 case MINMOD: /* next op will be non-greedy, e.g. A*? */ 8904 minmod = 1; 8905 break; 8906 8907 #undef ST 8908 #define ST st->u.curlym 8909 8910 case CURLYM: /* /A{m,n}B/ where A is fixed-length */ 8911 8912 /* This is an optimisation of CURLYX that enables us to push 8913 * only a single backtracking state, no matter how many matches 8914 * there are in {m,n}. It relies on the pattern being constant 8915 * length, with no parens to influence future backrefs 8916 */ 8917 8918 ST.me = scan; 8919 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 8920 8921 ST.lastparen = rex->lastparen; 8922 ST.lastcloseparen = rex->lastcloseparen; 8923 8924 /* if paren positive, emulate an OPEN/CLOSE around A */ 8925 if (ST.me->flags) { 8926 U32 paren = ST.me->flags; 8927 lastopen = paren; 8928 if (paren > maxopenparen) 8929 maxopenparen = paren; 8930 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 8931 } 8932 ST.A = scan; 8933 ST.B = next; 8934 ST.alen = 0; 8935 ST.count = 0; 8936 ST.minmod = minmod; 8937 minmod = 0; 8938 ST.Binfo.count = -1; 8939 REGCP_SET(ST.cp); 8940 8941 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ 8942 goto curlym_do_B; 8943 8944 curlym_do_A: /* execute the A in /A{m,n}B/ */ 8945 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ 8946 script_run_begin); 8947 NOT_REACHED; /* NOTREACHED */ 8948 8949 case CURLYM_A: /* we've just matched an A */ 8950 ST.count++; 8951 /* after first match, determine A's length: u.curlym.alen */ 8952 if (ST.count == 1) { 8953 if (reginfo->is_utf8_target) { 8954 char *s = st->locinput; 8955 while (s < locinput) { 8956 ST.alen++; 8957 s += UTF8SKIP(s); 8958 } 8959 } 8960 else { 8961 ST.alen = locinput - st->locinput; 8962 } 8963 if (ST.alen == 0) 8964 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); 8965 } 8966 DEBUG_EXECUTE_r( 8967 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n", 8968 depth, (IV) ST.count, (IV)ST.alen) 8969 ); 8970 8971 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) 8972 goto fake_end; 8973 8974 8975 if (!is_accepted) { 8976 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); 8977 if ( max == REG_INFTY || ST.count < max ) 8978 goto curlym_do_A; /* try to match another A */ 8979 } 8980 goto curlym_do_B; /* try to match B */ 8981 8982 case CURLYM_A_fail: /* just failed to match an A */ 8983 REGCP_UNWIND(ST.cp); 8984 8985 8986 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ 8987 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) 8988 sayNO; 8989 8990 curlym_do_B: /* execute the B in /A{m,n}B/ */ 8991 if (is_accepted) 8992 goto curlym_close_B; 8993 8994 if (ST.Binfo.count < 0) { 8995 /* calculate possible match of 1st char following curly */ 8996 assert(ST.B); 8997 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { 8998 regnode *text_node = ST.B; 8999 if (! HAS_TEXT(text_node)) 9000 FIND_NEXT_IMPT(text_node); 9001 if (PL_regkind[OP(text_node)] == EXACT) { 9002 if (! S_setup_EXACTISH_ST(aTHX_ text_node, 9003 &ST.Binfo, reginfo)) 9004 { 9005 sayNO; 9006 } 9007 } 9008 } 9009 } 9010 9011 DEBUG_EXECUTE_r( 9012 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n", 9013 depth, (IV)ST.count) 9014 ); 9015 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) { 9016 assert(ST.Binfo.count > 0); 9017 9018 /* Do a quick test to hopefully rule out most non-matches */ 9019 if ( locinput + ST.Binfo.min_length > loceol 9020 || ! S_test_EXACTISH_ST(locinput, ST.Binfo)) 9021 { 9022 DEBUG_OPTIMISE_r( 9023 Perl_re_exec_indentf( aTHX_ 9024 "CURLYM Fast bail next target=0x%X anded==0x%X" 9025 " mask=0x%X\n", 9026 depth, 9027 (int) nextbyte, ST.Binfo.first_byte_anded, 9028 ST.Binfo.first_byte_mask) 9029 ); 9030 state_num = CURLYM_B_fail; 9031 goto reenter_switch; 9032 } 9033 } 9034 9035 curlym_close_B: 9036 if (ST.me->flags) { 9037 /* emulate CLOSE: mark current A as captured */ 9038 U32 paren = (U32)ST.me->flags; 9039 if (ST.count || is_accepted) { 9040 CLOSE_CAPTURE(paren, 9041 HOPc(locinput, -ST.alen) - reginfo->strbeg, 9042 locinput - reginfo->strbeg); 9043 } 9044 else 9045 rex->offs[paren].end = -1; 9046 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) 9047 { 9048 if (ST.count || is_accepted) 9049 goto fake_end; 9050 else 9051 sayNO; 9052 } 9053 } 9054 9055 if (is_accepted) 9056 goto fake_end; 9057 9058 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ 9059 script_run_begin); 9060 NOT_REACHED; /* NOTREACHED */ 9061 9062 case CURLYM_B_fail: /* just failed to match a B */ 9063 REGCP_UNWIND(ST.cp); 9064 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9065 if (ST.minmod) { 9066 I32 max = ARG2(ST.me); 9067 if (max != REG_INFTY && ST.count == max) 9068 sayNO; 9069 goto curlym_do_A; /* try to match a further A */ 9070 } 9071 /* backtrack one A */ 9072 if (ST.count == ARG1(ST.me) /* min */) 9073 sayNO; 9074 ST.count--; 9075 SET_locinput(HOPc(locinput, -ST.alen)); 9076 goto curlym_do_B; /* try to match B */ 9077 9078 #undef ST 9079 #define ST st->u.curly 9080 9081 #define CURLY_SETPAREN(paren, success) \ 9082 if (paren) { \ 9083 if (success) { \ 9084 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \ 9085 locinput - reginfo->strbeg); \ 9086 } \ 9087 else { \ 9088 rex->offs[paren].end = -1; \ 9089 rex->lastparen = ST.lastparen; \ 9090 rex->lastcloseparen = ST.lastcloseparen; \ 9091 } \ 9092 } 9093 9094 case STAR: /* /A*B/ where A is width 1 char */ 9095 ST.paren = 0; 9096 ST.min = 0; 9097 ST.max = REG_INFTY; 9098 scan = NEXTOPER(scan); 9099 goto repeat; 9100 9101 case PLUS: /* /A+B/ where A is width 1 char */ 9102 ST.paren = 0; 9103 ST.min = 1; 9104 ST.max = REG_INFTY; 9105 scan = NEXTOPER(scan); 9106 goto repeat; 9107 9108 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ 9109 ST.paren = scan->flags; /* Which paren to set */ 9110 ST.lastparen = rex->lastparen; 9111 ST.lastcloseparen = rex->lastcloseparen; 9112 if (ST.paren > maxopenparen) 9113 maxopenparen = ST.paren; 9114 ST.min = ARG1(scan); /* min to match */ 9115 ST.max = ARG2(scan); /* max to match */ 9116 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); 9117 9118 /* handle the single-char capture called as a GOSUB etc */ 9119 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) 9120 { 9121 char *li = locinput; 9122 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1)) 9123 sayNO; 9124 SET_locinput(li); 9125 goto fake_end; 9126 } 9127 9128 goto repeat; 9129 9130 case CURLY: /* /A{m,n}B/ where A is width 1 char */ 9131 ST.paren = 0; 9132 ST.min = ARG1(scan); /* min to match */ 9133 ST.max = ARG2(scan); /* max to match */ 9134 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 9135 repeat: 9136 /* 9137 * Lookahead to avoid useless match attempts 9138 * when we know what character comes next. 9139 * 9140 * Used to only do .*x and .*?x, but now it allows 9141 * for )'s, ('s and (?{ ... })'s to be in the way 9142 * of the quantifier and the EXACT-like node. -- japhy 9143 */ 9144 9145 assert(ST.min <= ST.max); 9146 if (! HAS_TEXT(next) && ! JUMPABLE(next)) { 9147 ST.Binfo.count = 0; 9148 } 9149 else { 9150 regnode *text_node = next; 9151 9152 if (! HAS_TEXT(text_node)) 9153 FIND_NEXT_IMPT(text_node); 9154 9155 if (! HAS_TEXT(text_node)) 9156 ST.Binfo.count = 0; 9157 else { 9158 if ( PL_regkind[OP(text_node)] != EXACT ) { 9159 ST.Binfo.count = 0; 9160 } 9161 else { 9162 if (! S_setup_EXACTISH_ST(aTHX_ text_node, 9163 &ST.Binfo, reginfo)) 9164 { 9165 sayNO; 9166 } 9167 } 9168 } 9169 } 9170 9171 ST.A = scan; 9172 ST.B = next; 9173 if (minmod) { 9174 char *li = locinput; 9175 minmod = 0; 9176 if (ST.min && 9177 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min) 9178 < ST.min) 9179 sayNO; 9180 SET_locinput(li); 9181 ST.count = ST.min; 9182 REGCP_SET(ST.cp); 9183 9184 if (ST.Binfo.count <= 0) 9185 goto curly_try_B_min; 9186 9187 ST.oldloc = locinput; 9188 9189 /* set ST.maxpos to the furthest point along the 9190 * string that could possibly match, i.e., that a match could 9191 * start at. */ 9192 if (ST.max == REG_INFTY) { 9193 ST.maxpos = loceol - 1; 9194 if (utf8_target) 9195 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) 9196 ST.maxpos--; 9197 } 9198 else if (utf8_target) { 9199 int m = ST.max - ST.min; 9200 for (ST.maxpos = locinput; 9201 m >0 && ST.maxpos < loceol; m--) 9202 ST.maxpos += UTF8SKIP(ST.maxpos); 9203 } 9204 else { 9205 ST.maxpos = locinput + ST.max - ST.min; 9206 if (ST.maxpos >= loceol) 9207 ST.maxpos = loceol - 1; 9208 } 9209 goto curly_try_B_min_known; 9210 9211 } 9212 else { 9213 /* avoid taking address of locinput, so it can remain 9214 * a register var */ 9215 char *li = locinput; 9216 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max); 9217 if (ST.count < ST.min) 9218 sayNO; 9219 SET_locinput(li); 9220 if ((ST.count > ST.min) 9221 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) 9222 { 9223 /* A{m,n} must come at the end of the string, there's 9224 * no point in backing off ... */ 9225 ST.min = ST.count; 9226 /* ...except that $ and \Z can match before *and* after 9227 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 9228 We may back off by one in this case. */ 9229 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) 9230 ST.min--; 9231 } 9232 REGCP_SET(ST.cp); 9233 goto curly_try_B_max; 9234 } 9235 NOT_REACHED; /* NOTREACHED */ 9236 9237 case CURLY_B_min_fail: 9238 /* failed to find B in a non-greedy match. */ 9239 9240 REGCP_UNWIND(ST.cp); 9241 if (ST.paren) { 9242 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9243 } 9244 9245 if (ST.Binfo.count == 0) { 9246 /* failed -- move forward one */ 9247 char *li = locinput; 9248 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) { 9249 sayNO; 9250 } 9251 locinput = li; 9252 ST.count++; 9253 if (!( ST.count <= ST.max 9254 /* count overflow ? */ 9255 || (ST.max == REG_INFTY && ST.count > 0)) 9256 ) 9257 sayNO; 9258 } 9259 else { 9260 int n; 9261 /* Couldn't or didn't -- move forward. */ 9262 ST.oldloc = locinput; 9263 if (utf8_target) 9264 locinput += UTF8SKIP(locinput); 9265 else 9266 locinput++; 9267 ST.count++; 9268 9269 curly_try_B_min_known: 9270 /* find the next place where 'B' could work, then call B */ 9271 if (locinput + ST.Binfo.initial_exact < loceol) { 9272 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) { 9273 9274 /* Here, the mask is all 1's for the entire length of 9275 * any possible match. (That actually means that there 9276 * is only one possible match.) Look for the next 9277 * occurrence */ 9278 locinput = ninstr(locinput, loceol, 9279 (char *) ST.Binfo.matches, 9280 (char *) ST.Binfo.matches 9281 + ST.Binfo.initial_exact); 9282 if (locinput == NULL) { 9283 sayNO; 9284 } 9285 } 9286 else do { 9287 /* If the first byte(s) of the mask are all ones, it 9288 * means those bytes must match identically, so can use 9289 * ninstr() to find the next possible matchpoint */ 9290 if (ST.Binfo.initial_exact > 0) { 9291 locinput = ninstr(locinput, loceol, 9292 (char *) ST.Binfo.matches, 9293 (char *) ST.Binfo.matches 9294 + ST.Binfo.initial_exact); 9295 } 9296 else { /* Otherwise find the next byte that matches, 9297 masked */ 9298 locinput = (char *) find_next_masked( 9299 (U8 *) locinput, (U8 *) loceol, 9300 ST.Binfo.first_byte_anded, 9301 ST.Binfo.first_byte_mask); 9302 /* Advance to the end of a multi-byte character */ 9303 if (utf8_target) { 9304 while ( locinput < loceol 9305 && UTF8_IS_CONTINUATION(*locinput)) 9306 { 9307 locinput++; 9308 } 9309 } 9310 } 9311 if ( locinput == NULL 9312 || locinput + ST.Binfo.min_length > loceol) 9313 { 9314 sayNO; 9315 } 9316 9317 /* Here, we have found a possible match point; if can't 9318 * rule it out, quit the loop so can check fully */ 9319 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) { 9320 break; 9321 } 9322 9323 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1; 9324 9325 } while (locinput <= ST.maxpos); 9326 } 9327 9328 if (locinput > ST.maxpos) 9329 sayNO; 9330 9331 n = (utf8_target) 9332 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput) 9333 : (STRLEN) (locinput - ST.oldloc); 9334 9335 9336 /* Here is at the beginning of a character that meets the mask 9337 * criteria. Need to make sure that some real possibility */ 9338 9339 if (n) { 9340 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is 9341 * at what may be the beginning of b; check that everything 9342 * between oldloc and locinput matches */ 9343 char *li = ST.oldloc; 9344 ST.count += n; 9345 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n) 9346 sayNO; 9347 assert(n == REG_INFTY || locinput == li); 9348 } 9349 } 9350 9351 curly_try_B_min: 9352 CURLY_SETPAREN(ST.paren, ST.count); 9353 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, 9354 script_run_begin); 9355 NOT_REACHED; /* NOTREACHED */ 9356 9357 9358 curly_try_B_max: 9359 /* a successful greedy match: now try to match B */ 9360 if ( ST.Binfo.count <= 0 9361 || ( ST.Binfo.count > 0 9362 && locinput + ST.Binfo.min_length <= loceol 9363 && S_test_EXACTISH_ST(locinput, ST.Binfo))) 9364 { 9365 CURLY_SETPAREN(ST.paren, ST.count); 9366 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, 9367 script_run_begin); 9368 NOT_REACHED; /* NOTREACHED */ 9369 } 9370 /* FALLTHROUGH */ 9371 9372 case CURLY_B_max_fail: 9373 /* failed to find B in a greedy match */ 9374 9375 REGCP_UNWIND(ST.cp); 9376 if (ST.paren) { 9377 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9378 } 9379 /* back up. */ 9380 if (--ST.count < ST.min) 9381 sayNO; 9382 locinput = HOPc(locinput, -1); 9383 goto curly_try_B_max; 9384 9385 #undef ST 9386 9387 case END: /* last op of main pattern */ 9388 fake_end: 9389 if (cur_eval) { 9390 /* we've just finished A in /(??{A})B/; now continue with B */ 9391 is_accepted= false; 9392 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput); 9393 st->u.eval.prev_rex = rex_sv; /* inner */ 9394 9395 /* Save *all* the positions. */ 9396 st->u.eval.cp = regcppush(rex, 0, maxopenparen); 9397 rex_sv = CUR_EVAL.prev_rex; 9398 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 9399 SET_reg_curpm(rex_sv); 9400 rex = ReANY(rex_sv); 9401 rexi = RXi_GET(rex); 9402 9403 st->u.eval.prev_curlyx = cur_curlyx; 9404 cur_curlyx = CUR_EVAL.prev_curlyx; 9405 9406 REGCP_SET(st->u.eval.lastcp); 9407 9408 /* Restore parens of the outer rex without popping the 9409 * savestack */ 9410 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen); 9411 9412 st->u.eval.prev_eval = cur_eval; 9413 cur_eval = CUR_EVAL.prev_eval; 9414 DEBUG_EXECUTE_r( 9415 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n", 9416 depth, cur_eval);); 9417 if ( nochange_depth ) 9418 nochange_depth--; 9419 9420 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); 9421 9422 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ 9423 st->u.eval.prev_eval->u.eval.B, 9424 locinput, loceol, script_run_begin); 9425 } 9426 9427 if (locinput < reginfo->till) { 9428 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 9429 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 9430 PL_colors[4], 9431 (long)(locinput - startpos), 9432 (long)(reginfo->till - startpos), 9433 PL_colors[5])); 9434 9435 sayNO_SILENT; /* Cannot match: too short. */ 9436 } 9437 sayYES; /* Success! */ 9438 9439 case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH 9440 matches end at the right spot, required for 9441 variable length matches. */ 9442 if (match_end && locinput != match_end) 9443 { 9444 DEBUG_EXECUTE_r( 9445 Perl_re_exec_indentf( aTHX_ 9446 "%sLOOKBEHIND_END: subpattern failed...%s\n", 9447 depth, PL_colors[4], PL_colors[5])); 9448 sayNO; /* Variable length match didn't line up */ 9449 } 9450 /* FALLTHROUGH */ 9451 9452 case SUCCEED: /* successful SUSPEND/CURLYM and 9453 *lookahead* IFMATCH/UNLESSM*/ 9454 DEBUG_EXECUTE_r( 9455 Perl_re_exec_indentf( aTHX_ 9456 "%sSUCCEED: subpattern success...%s\n", 9457 depth, PL_colors[4], PL_colors[5])); 9458 sayYES; /* Success! */ 9459 9460 #undef ST 9461 #define ST st->u.ifmatch 9462 9463 case SUSPEND: /* (?>A) */ 9464 ST.wanted = 1; 9465 ST.start = locinput; 9466 ST.end = loceol; 9467 ST.count = 1; 9468 goto do_ifmatch; 9469 9470 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */ 9471 ST.wanted = 0; 9472 goto ifmatch_trivial_fail_test; 9473 9474 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */ 9475 ST.wanted = 1; 9476 ifmatch_trivial_fail_test: 9477 ST.prev_match_end= match_end; 9478 ST.count = scan->next_off + 1; /* next_off repurposed to be 9479 lookbehind count, requires 9480 non-zero flags */ 9481 if (! scan->flags) { /* 'flags' zero means lookahed */ 9482 9483 /* Lookahead starts here and ends at the normal place */ 9484 ST.start = locinput; 9485 ST.end = loceol; 9486 match_end = NULL; 9487 } 9488 else { 9489 PERL_UINT_FAST8_T back_count = scan->flags; 9490 char * s; 9491 match_end = locinput; 9492 9493 /* Lookbehind can look beyond the current position */ 9494 ST.end = loceol; 9495 9496 /* ... and starts at the first place in the input that is in 9497 * the range of the possible start positions */ 9498 for (; ST.count > 0; ST.count--, back_count--) { 9499 s = HOPBACKc(locinput, back_count); 9500 if (s) { 9501 ST.start = s; 9502 goto do_ifmatch; 9503 } 9504 } 9505 9506 /* If the lookbehind doesn't start in the actual string, is a 9507 * trivial match failure */ 9508 match_end = ST.prev_match_end; 9509 if (logical) { 9510 logical = 0; 9511 sw = 1 - cBOOL(ST.wanted); 9512 } 9513 else if (ST.wanted) 9514 sayNO; 9515 9516 /* Here, we didn't want it to match, so is actually success */ 9517 next = scan + ARG(scan); 9518 if (next == scan) 9519 next = NULL; 9520 break; 9521 } 9522 9523 do_ifmatch: 9524 ST.me = scan; 9525 ST.logical = logical; 9526 logical = 0; /* XXX: reset state of logical once it has been saved into ST */ 9527 9528 /* execute body of (?...A) */ 9529 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, 9530 ST.end, script_run_begin); 9531 NOT_REACHED; /* NOTREACHED */ 9532 9533 { 9534 bool matched; 9535 9536 case IFMATCH_A_fail: /* body of (?...A) failed */ 9537 if (! ST.logical && ST.count > 1) { 9538 9539 /* It isn't a real failure until we've tried all starting 9540 * positions. Move to the next starting position and retry */ 9541 ST.count--; 9542 ST.start = HOPc(ST.start, 1); 9543 scan = ST.me; 9544 logical = ST.logical; 9545 goto do_ifmatch; 9546 } 9547 9548 /* Here, all starting positions have been tried. */ 9549 matched = FALSE; 9550 goto ifmatch_done; 9551 9552 case IFMATCH_A: /* body of (?...A) succeeded */ 9553 matched = TRUE; 9554 ifmatch_done: 9555 sw = matched == ST.wanted; 9556 match_end = ST.prev_match_end; 9557 if (! ST.logical && !sw) { 9558 sayNO; 9559 } 9560 9561 if (OP(ST.me) != SUSPEND) { 9562 /* restore old position except for (?>...) */ 9563 locinput = st->locinput; 9564 loceol = st->loceol; 9565 script_run_begin = st->sr0; 9566 } 9567 scan = ST.me + ARG(ST.me); 9568 if (scan == ST.me) 9569 scan = NULL; 9570 continue; /* execute B */ 9571 } 9572 9573 #undef ST 9574 9575 case LONGJMP: /* alternative with many branches compiles to 9576 * (BRANCHJ; EXACT ...; LONGJMP ) x N */ 9577 next = scan + ARG(scan); 9578 if (next == scan) 9579 next = NULL; 9580 break; 9581 9582 case COMMIT: /* (*COMMIT) */ 9583 reginfo->cutpoint = loceol; 9584 /* FALLTHROUGH */ 9585 9586 case PRUNE: /* (*PRUNE) */ 9587 if (scan->flags) 9588 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9589 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, 9590 script_run_begin); 9591 NOT_REACHED; /* NOTREACHED */ 9592 9593 case COMMIT_next_fail: 9594 no_final = 1; 9595 /* FALLTHROUGH */ 9596 sayNO; 9597 NOT_REACHED; /* NOTREACHED */ 9598 9599 case OPFAIL: /* (*FAIL) */ 9600 if (scan->flags) 9601 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9602 if (logical) { 9603 /* deal with (?(?!)X|Y) properly, 9604 * make sure we trigger the no branch 9605 * of the trailing IFTHEN structure*/ 9606 sw= 0; 9607 break; 9608 } else { 9609 sayNO; 9610 } 9611 NOT_REACHED; /* NOTREACHED */ 9612 9613 #define ST st->u.mark 9614 case MARKPOINT: /* (*MARK:foo) */ 9615 ST.prev_mark = mark_state; 9616 ST.mark_name = sv_commit = sv_yes_mark 9617 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9618 mark_state = st; 9619 ST.mark_loc = locinput; 9620 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, 9621 script_run_begin); 9622 NOT_REACHED; /* NOTREACHED */ 9623 9624 case MARKPOINT_next: 9625 mark_state = ST.prev_mark; 9626 sayYES; 9627 NOT_REACHED; /* NOTREACHED */ 9628 9629 case MARKPOINT_next_fail: 9630 if (popmark && sv_eq(ST.mark_name,popmark)) 9631 { 9632 if (ST.mark_loc > startpoint) 9633 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9634 popmark = NULL; /* we found our mark */ 9635 sv_commit = ST.mark_name; 9636 9637 DEBUG_EXECUTE_r({ 9638 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n", 9639 depth, 9640 PL_colors[4], SVfARG(sv_commit), PL_colors[5]); 9641 }); 9642 } 9643 mark_state = ST.prev_mark; 9644 sv_yes_mark = mark_state ? 9645 mark_state->u.mark.mark_name : NULL; 9646 sayNO; 9647 NOT_REACHED; /* NOTREACHED */ 9648 9649 case SKIP: /* (*SKIP) */ 9650 if (!scan->flags) { 9651 /* (*SKIP) : if we fail we cut here*/ 9652 ST.mark_name = NULL; 9653 ST.mark_loc = locinput; 9654 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, 9655 script_run_begin); 9656 } else { 9657 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 9658 otherwise do nothing. Meaning we need to scan 9659 */ 9660 regmatch_state *cur = mark_state; 9661 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9662 9663 while (cur) { 9664 if ( sv_eq( cur->u.mark.mark_name, 9665 find ) ) 9666 { 9667 ST.mark_name = find; 9668 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, 9669 script_run_begin); 9670 } 9671 cur = cur->u.mark.prev_mark; 9672 } 9673 } 9674 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ 9675 break; 9676 9677 case SKIP_next_fail: 9678 if (ST.mark_name) { 9679 /* (*CUT:NAME) - Set up to search for the name as we 9680 collapse the stack*/ 9681 popmark = ST.mark_name; 9682 } else { 9683 /* (*CUT) - No name, we cut here.*/ 9684 if (ST.mark_loc > startpoint) 9685 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9686 /* but we set sv_commit to latest mark_name if there 9687 is one so they can test to see how things lead to this 9688 cut */ 9689 if (mark_state) 9690 sv_commit=mark_state->u.mark.mark_name; 9691 } 9692 no_final = 1; 9693 sayNO; 9694 NOT_REACHED; /* NOTREACHED */ 9695 #undef ST 9696 9697 case LNBREAK: /* \R */ 9698 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) { 9699 locinput += n; 9700 } else 9701 sayNO; 9702 break; 9703 9704 default: 9705 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n", 9706 PTR2UV(scan), OP(scan)); 9707 Perl_croak(aTHX_ "regexp memory corruption"); 9708 9709 /* this is a point to jump to in order to increment 9710 * locinput by one character */ 9711 increment_locinput: 9712 assert(!NEXTCHR_IS_EOS); 9713 if (utf8_target) { 9714 locinput += PL_utf8skip[nextbyte]; 9715 /* locinput is allowed to go 1 char off the end (signifying 9716 * EOS), but not 2+ */ 9717 if (locinput > loceol) 9718 sayNO; 9719 } 9720 else 9721 locinput++; 9722 break; 9723 9724 } /* end switch */ 9725 9726 /* switch break jumps here */ 9727 scan = next; /* prepare to execute the next op and ... */ 9728 continue; /* ... jump back to the top, reusing st */ 9729 /* NOTREACHED */ 9730 9731 push_yes_state: 9732 /* push a state that backtracks on success */ 9733 st->u.yes.prev_yes_state = yes_state; 9734 yes_state = st; 9735 /* FALLTHROUGH */ 9736 push_state: 9737 /* push a new regex state, then continue at scan */ 9738 { 9739 regmatch_state *newst; 9740 DECLARE_AND_GET_RE_DEBUG_FLAGS; 9741 9742 DEBUG_r( /* DEBUG_STACK_r */ 9743 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) { 9744 regmatch_state *cur = st; 9745 regmatch_state *curyes = yes_state; 9746 U32 i; 9747 regmatch_slab *slab = PL_regmatch_slab; 9748 for (i = 0; i < 3 && i <= depth; cur--,i++) { 9749 if (cur < SLAB_FIRST(slab)) { 9750 slab = slab->prev; 9751 cur = SLAB_LAST(slab); 9752 } 9753 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n", 9754 depth, 9755 i ? " " : "push", 9756 depth - i, PL_reg_name[cur->resume_state], 9757 (curyes == cur) ? "yes" : "" 9758 ); 9759 if (curyes == cur) 9760 curyes = cur->u.yes.prev_yes_state; 9761 } 9762 } else { 9763 DEBUG_STATE_pp("push") 9764 }); 9765 depth++; 9766 st->locinput = locinput; 9767 st->loceol = loceol; 9768 st->sr0 = script_run_begin; 9769 newst = st+1; 9770 if (newst > SLAB_LAST(PL_regmatch_slab)) 9771 newst = S_push_slab(aTHX); 9772 PL_regmatch_state = newst; 9773 9774 locinput = pushinput; 9775 loceol = pusheol; 9776 script_run_begin = pushsr0; 9777 st = newst; 9778 continue; 9779 /* NOTREACHED */ 9780 } 9781 } 9782 #ifdef SOLARIS_BAD_OPTIMIZER 9783 # undef PL_charclass 9784 #endif 9785 9786 /* 9787 * We get here only if there's trouble -- normally "case END" is 9788 * the terminating point. 9789 */ 9790 Perl_croak(aTHX_ "corrupted regexp pointers"); 9791 NOT_REACHED; /* NOTREACHED */ 9792 9793 yes: 9794 if (yes_state) { 9795 /* we have successfully completed a subexpression, but we must now 9796 * pop to the state marked by yes_state and continue from there */ 9797 assert(st != yes_state); 9798 #ifdef DEBUGGING 9799 while (st != yes_state) { 9800 st--; 9801 if (st < SLAB_FIRST(PL_regmatch_slab)) { 9802 PL_regmatch_slab = PL_regmatch_slab->prev; 9803 st = SLAB_LAST(PL_regmatch_slab); 9804 } 9805 DEBUG_STATE_r({ 9806 if (no_final) { 9807 DEBUG_STATE_pp("pop (no final)"); 9808 } else { 9809 DEBUG_STATE_pp("pop (yes)"); 9810 } 9811 }); 9812 depth--; 9813 } 9814 #else 9815 while (yes_state < SLAB_FIRST(PL_regmatch_slab) 9816 || yes_state > SLAB_LAST(PL_regmatch_slab)) 9817 { 9818 /* not in this slab, pop slab */ 9819 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); 9820 PL_regmatch_slab = PL_regmatch_slab->prev; 9821 st = SLAB_LAST(PL_regmatch_slab); 9822 } 9823 depth -= (st - yes_state); 9824 #endif 9825 st = yes_state; 9826 yes_state = st->u.yes.prev_yes_state; 9827 PL_regmatch_state = st; 9828 9829 if (no_final) { 9830 locinput= st->locinput; 9831 loceol= st->loceol; 9832 script_run_begin = st->sr0; 9833 } 9834 state_num = st->resume_state + no_final; 9835 goto reenter_switch; 9836 } 9837 9838 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n", 9839 PL_colors[4], PL_colors[5])); 9840 9841 if (reginfo->info_aux_eval) { 9842 /* each successfully executed (?{...}) block does the equivalent of 9843 * local $^R = do {...} 9844 * When popping the save stack, all these locals would be undone; 9845 * bypass this by setting the outermost saved $^R to the latest 9846 * value */ 9847 /* I dont know if this is needed or works properly now. 9848 * see code related to PL_replgv elsewhere in this file. 9849 * Yves 9850 */ 9851 if (oreplsv != GvSV(PL_replgv)) { 9852 sv_setsv(oreplsv, GvSV(PL_replgv)); 9853 SvSETMAGIC(oreplsv); 9854 } 9855 } 9856 result = 1; 9857 goto final_exit; 9858 9859 no: 9860 DEBUG_EXECUTE_r( 9861 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n", 9862 depth, 9863 PL_colors[4], PL_colors[5]) 9864 ); 9865 9866 no_silent: 9867 if (no_final) { 9868 if (yes_state) { 9869 goto yes; 9870 } else { 9871 goto final_exit; 9872 } 9873 } 9874 if (depth) { 9875 /* there's a previous state to backtrack to */ 9876 st--; 9877 if (st < SLAB_FIRST(PL_regmatch_slab)) { 9878 PL_regmatch_slab = PL_regmatch_slab->prev; 9879 st = SLAB_LAST(PL_regmatch_slab); 9880 } 9881 PL_regmatch_state = st; 9882 locinput= st->locinput; 9883 loceol= st->loceol; 9884 script_run_begin = st->sr0; 9885 9886 DEBUG_STATE_pp("pop"); 9887 depth--; 9888 if (yes_state == st) 9889 yes_state = st->u.yes.prev_yes_state; 9890 9891 state_num = st->resume_state + 1; /* failure = success + 1 */ 9892 PERL_ASYNC_CHECK(); 9893 goto reenter_switch; 9894 } 9895 result = 0; 9896 9897 final_exit: 9898 if (rex->intflags & PREGf_VERBARG_SEEN) { 9899 SV *sv_err = get_sv("REGERROR", 1); 9900 SV *sv_mrk = get_sv("REGMARK", 1); 9901 if (result) { 9902 sv_commit = &PL_sv_no; 9903 if (!sv_yes_mark) 9904 sv_yes_mark = &PL_sv_yes; 9905 } else { 9906 if (!sv_commit) 9907 sv_commit = &PL_sv_yes; 9908 sv_yes_mark = &PL_sv_no; 9909 } 9910 assert(sv_err); 9911 assert(sv_mrk); 9912 sv_setsv(sv_err, sv_commit); 9913 sv_setsv(sv_mrk, sv_yes_mark); 9914 } 9915 9916 9917 if (last_pushed_cv) { 9918 dSP; 9919 /* see "Some notes about MULTICALL" above */ 9920 POP_MULTICALL; 9921 PERL_UNUSED_VAR(SP); 9922 } 9923 else 9924 LEAVE_SCOPE(orig_savestack_ix); 9925 9926 assert(!result || locinput - reginfo->strbeg >= 0); 9927 return result ? locinput - reginfo->strbeg : -1; 9928 } 9929 9930 /* 9931 - regrepeat - repeatedly match something simple, report how many 9932 * 9933 * What 'simple' means is a node which can be the operand of a quantifier like 9934 * '+', or {1,3} 9935 * 9936 * startposp - pointer to a pointer to the start position. This is updated 9937 * to point to the byte following the highest successful 9938 * match. 9939 * p - the regnode to be repeatedly matched against. 9940 * loceol - pointer to the end position beyond which we aren't supposed to 9941 * look. 9942 * reginfo - struct holding match state, such as utf8_target 9943 * max - maximum number of things to match. 9944 * depth - (for debugging) backtracking depth. 9945 */ 9946 STATIC I32 9947 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, 9948 char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH) 9949 { 9950 char *scan; /* Pointer to current position in target string */ 9951 I32 c; 9952 char *this_eol = loceol; /* potentially adjusted version. */ 9953 I32 hardcount = 0; /* How many matches so far */ 9954 bool utf8_target = reginfo->is_utf8_target; 9955 unsigned int to_complement = 0; /* Invert the result? */ 9956 _char_class_number classnum; 9957 9958 PERL_ARGS_ASSERT_REGREPEAT; 9959 9960 /* This routine is structured so that we switch on the input OP. Each OP 9961 * case: statement contains a loop to repeatedly apply the OP, advancing 9962 * the input until it fails, or reaches the end of the input, or until it 9963 * reaches the upper limit of matches. */ 9964 9965 scan = *startposp; 9966 if (max == REG_INFTY) /* This is a special marker to go to the platform's 9967 max */ 9968 max = I32_MAX; 9969 else if (! utf8_target && this_eol - scan > max) 9970 this_eol = scan + max; 9971 9972 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> 9973 * down to the maximum of how far we should go in it (but leaving it set to 9974 * the real end if the maximum permissible would take us beyond that). 9975 * This allows us to make the loop exit condition that we haven't gone past 9976 * <this_eol> to also mean that we haven't exceeded the max permissible 9977 * count, saving a test each time through the loop. But it assumes that 9978 * the OP matches a single byte, which is true for most of the OPs below 9979 * when applied to a non-UTF-8 target. Those relatively few OPs that don't 9980 * have this characteristic have to compensate. 9981 * 9982 * There is no such adjustment for UTF-8 targets, since the number of bytes 9983 * per character can vary. OPs will have to test both that the count is 9984 * less than the max permissible (using <hardcount> to keep track), and 9985 * that we are still within the bounds of the string (using <this_eol>. A 9986 * few OPs match a single byte no matter what the encoding. They can omit 9987 * the max test if, for the UTF-8 case, they do the adjustment that was 9988 * skipped above. 9989 * 9990 * Thus, the code above sets things up for the common case; and exceptional 9991 * cases need extra work; the common case is to make sure <scan> doesn't go 9992 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the 9993 * count doesn't exceed the maximum permissible */ 9994 9995 switch (with_t_UTF8ness(OP(p), utf8_target)) { 9996 case REG_ANY_t8: 9997 while (scan < this_eol && hardcount < max && *scan != '\n') { 9998 scan += UTF8SKIP(scan); 9999 hardcount++; 10000 } 10001 break; 10002 10003 case REG_ANY_tb: 10004 scan = (char *) memchr(scan, '\n', this_eol - scan); 10005 if (! scan) { 10006 scan = this_eol; 10007 } 10008 break; 10009 10010 case SANY_t8: 10011 while (scan < this_eol && hardcount < max) { 10012 scan += UTF8SKIP(scan); 10013 hardcount++; 10014 } 10015 break; 10016 10017 case SANY_tb: 10018 scan = this_eol; 10019 break; 10020 10021 case EXACT_REQ8_tb: 10022 case LEXACT_REQ8_tb: 10023 case EXACTFU_REQ8_tb: 10024 break; 10025 10026 case EXACTL_t8: 10027 if (UTF8_IS_ABOVE_LATIN1(*scan)) { 10028 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol); 10029 } 10030 /* FALLTHROUGH */ 10031 10032 case EXACTL_tb: 10033 case EXACTFL_t8: 10034 case EXACTFL_tb: 10035 case EXACTFLU8_t8: 10036 case EXACTFLU8_tb: 10037 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 10038 /* FALLTHROUGH */ 10039 10040 case EXACT_REQ8_t8: 10041 case LEXACT_REQ8_t8: 10042 case EXACTFU_REQ8_t8: 10043 case LEXACT_t8: 10044 case LEXACT_tb: 10045 case EXACT_t8: 10046 case EXACT_tb: 10047 case EXACTF_t8: 10048 case EXACTF_tb: 10049 case EXACTFAA_NO_TRIE_t8: 10050 case EXACTFAA_NO_TRIE_tb: 10051 case EXACTFAA_t8: 10052 case EXACTFAA_tb: 10053 case EXACTFU_t8: 10054 case EXACTFU_tb: 10055 case EXACTFUP_t8: 10056 case EXACTFUP_tb: 10057 10058 { 10059 struct next_matchable_info Binfo; 10060 PERL_UINT_FAST8_T definitive_len; 10061 10062 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); 10063 10064 /* Set up termination info, and quit if we can rule out that we've 10065 * gotten a match of the termination criteria */ 10066 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo) 10067 || scan + Binfo.min_length > this_eol 10068 || ! S_test_EXACTISH_ST(scan, Binfo)) 10069 { 10070 break; 10071 } 10072 10073 definitive_len = Binfo.initial_definitive; 10074 10075 /* Here there are potential matches, and the first byte(s) matched our 10076 * filter 10077 * 10078 * If we got a definitive match of some initial bytes, there is no 10079 * possibility of false positives as far as it got */ 10080 if (definitive_len > 0) { 10081 10082 /* If as far as it got is the maximum possible, there were no false 10083 * positives at all. Since we have everything set up, see how many 10084 * repeats there are. */ 10085 if (definitive_len >= Binfo.max_length) { 10086 10087 /* We've already found one match */ 10088 scan += definitive_len; 10089 hardcount++; 10090 10091 /* If want more than the one match, and there is room for more, 10092 * see if there are any */ 10093 if (hardcount < max && scan + definitive_len <= this_eol) { 10094 10095 /* If the character is only a single byte long, just span 10096 * all such bytes. */ 10097 if (definitive_len == 1) { 10098 const char * orig_scan = scan; 10099 10100 if (this_eol - (scan - hardcount) > max) { 10101 this_eol = scan - hardcount + max; 10102 } 10103 10104 /* Use different routines depending on whether it's an 10105 * exact match or matches with a mask */ 10106 if (Binfo.initial_exact == 1) { 10107 scan = (char *) find_span_end((U8 *) scan, 10108 (U8 *) this_eol, 10109 Binfo.matches[0]); 10110 } 10111 else { 10112 scan = (char *) find_span_end_mask( 10113 (U8 *) scan, 10114 (U8 *) this_eol, 10115 Binfo.first_byte_anded, 10116 Binfo.first_byte_mask); 10117 } 10118 10119 hardcount += scan - orig_scan; 10120 } 10121 else { /* Here, the full character definitive match is more 10122 than one byte */ 10123 while ( hardcount < max 10124 && scan + definitive_len <= this_eol 10125 && S_test_EXACTISH_ST(scan, Binfo)) 10126 { 10127 scan += definitive_len; 10128 hardcount++; 10129 } 10130 } 10131 } 10132 10133 break; 10134 } /* End of a full character is definitively matched */ 10135 10136 /* Here, an initial portion of the character matched definitively, 10137 * and the rest matched as well, but could have false positives */ 10138 10139 do { 10140 PERL_INT_FAST8_T i; 10141 U8 * matches = Binfo.matches; 10142 10143 /* The first bytes were definitive. Look at the remaining */ 10144 for (i = 0; i < Binfo.count; i++) { 10145 if (memEQ(scan + definitive_len, 10146 matches + definitive_len, 10147 Binfo.lengths[i] - definitive_len)) 10148 { 10149 goto found_a_completion; 10150 } 10151 10152 matches += Binfo.lengths[i]; 10153 } 10154 10155 /* Didn't find anything to complete our initial match. Stop 10156 * here */ 10157 break; 10158 10159 found_a_completion: 10160 10161 /* Here, matched a full character, Include it in the result, 10162 * and then look to see if the next char matches */ 10163 hardcount++; 10164 scan += Binfo.lengths[i]; 10165 10166 } while ( hardcount < max 10167 && scan + definitive_len < this_eol 10168 && S_test_EXACTISH_ST(scan, Binfo)); 10169 10170 /* Here, have advanced as far as possible */ 10171 break; 10172 } /* End of found some initial bytes that definitively matched */ 10173 10174 /* Here, we can't rule out that we have found the beginning of 'B', but 10175 * there were no initial bytes that could rule out anything 10176 * definitively. Use brute force to examine all the possibilities */ 10177 while (scan < this_eol && hardcount < max) { 10178 PERL_INT_FAST8_T i; 10179 U8 * matches = Binfo.matches; 10180 10181 for (i = 0; i < Binfo.count; i++) { 10182 if (memEQ(scan, matches, Binfo.lengths[i])) { 10183 goto found1; 10184 } 10185 10186 matches += Binfo.lengths[i]; 10187 } 10188 10189 break; 10190 10191 found1: 10192 hardcount++; 10193 scan += Binfo.lengths[i]; 10194 } 10195 10196 break; 10197 } 10198 10199 case ANYOFPOSIXL_t8: 10200 case ANYOFL_t8: 10201 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 10202 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p); 10203 10204 /* FALLTHROUGH */ 10205 case ANYOFD_t8: 10206 case ANYOF_t8: 10207 while ( hardcount < max 10208 && scan < this_eol 10209 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 10210 { 10211 scan += UTF8SKIP(scan); 10212 hardcount++; 10213 } 10214 break; 10215 10216 case ANYOFPOSIXL_tb: 10217 case ANYOFL_tb: 10218 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 10219 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p); 10220 /* FALLTHROUGH */ 10221 10222 case ANYOFD_tb: 10223 case ANYOF_tb: 10224 if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 10225 while ( scan < this_eol 10226 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0)) 10227 scan++; 10228 } 10229 else { 10230 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan))) 10231 scan++; 10232 } 10233 break; 10234 10235 case ANYOFM_t8: 10236 if (this_eol - scan > max) { 10237 10238 /* We didn't adjust <this_eol> at the beginning of this routine 10239 * because is UTF-8, but it is actually ok to do so, since here, to 10240 * match, 1 char == 1 byte. */ 10241 this_eol = scan + max; 10242 } 10243 /* FALLTHROUGH */ 10244 10245 case ANYOFM_tb: 10246 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, 10247 (U8) ARG(p), FLAGS(p)); 10248 break; 10249 10250 case NANYOFM_t8: 10251 while ( hardcount < max 10252 && scan < this_eol 10253 && (*scan & FLAGS(p)) != ARG(p)) 10254 { 10255 scan += UTF8SKIP(scan); 10256 hardcount++; 10257 } 10258 break; 10259 10260 case NANYOFM_tb: 10261 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, 10262 (U8) ARG(p), FLAGS(p)); 10263 break; 10264 10265 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */ 10266 case ANYOFHb_tb: 10267 case ANYOFHr_tb: 10268 case ANYOFHs_tb: 10269 break; 10270 10271 case ANYOFH_t8: 10272 while ( hardcount < max 10273 && scan < this_eol 10274 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10275 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 10276 { 10277 scan += UTF8SKIP(scan); 10278 hardcount++; 10279 } 10280 break; 10281 10282 case ANYOFHb_t8: 10283 /* we know the first byte must be the FLAGS field */ 10284 while ( hardcount < max 10285 && scan < this_eol 10286 && (U8) *scan == ANYOF_FLAGS(p) 10287 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 10288 { 10289 scan += UTF8SKIP(scan); 10290 hardcount++; 10291 } 10292 break; 10293 10294 case ANYOFHr_t8: 10295 while ( hardcount < max 10296 && scan < this_eol 10297 && inRANGE(NATIVE_UTF8_TO_I8(*scan), 10298 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), 10299 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) 10300 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10301 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 10302 { 10303 scan += UTF8SKIP(scan); 10304 hardcount++; 10305 } 10306 break; 10307 10308 case ANYOFHs_t8: 10309 while ( hardcount < max 10310 && scan + FLAGS(p) < this_eol 10311 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p)) 10312 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 10313 { 10314 scan += UTF8SKIP(scan); 10315 hardcount++; 10316 } 10317 break; 10318 10319 case ANYOFR_t8: 10320 while ( hardcount < max 10321 && scan < this_eol 10322 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10323 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 10324 (U8 *) this_eol, 10325 NULL), 10326 ANYOFRbase(p), ANYOFRdelta(p))) 10327 { 10328 scan += UTF8SKIP(scan); 10329 hardcount++; 10330 } 10331 break; 10332 10333 case ANYOFR_tb: 10334 while ( hardcount < max 10335 && scan < this_eol 10336 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 10337 { 10338 scan++; 10339 hardcount++; 10340 } 10341 break; 10342 10343 case ANYOFRb_t8: 10344 while ( hardcount < max 10345 && scan < this_eol 10346 && (U8) *scan == ANYOF_FLAGS(p) 10347 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 10348 (U8 *) this_eol, 10349 NULL), 10350 ANYOFRbase(p), ANYOFRdelta(p))) 10351 { 10352 scan += UTF8SKIP(scan); 10353 hardcount++; 10354 } 10355 break; 10356 10357 case ANYOFRb_tb: 10358 while ( hardcount < max 10359 && scan < this_eol 10360 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 10361 { 10362 scan++; 10363 hardcount++; 10364 } 10365 break; 10366 10367 /* The argument (FLAGS) to all the POSIX node types is the class number */ 10368 10369 case NPOSIXL_tb: 10370 to_complement = 1; 10371 /* FALLTHROUGH */ 10372 10373 case POSIXL_tb: 10374 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 10375 while ( scan < this_eol 10376 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) 10377 { 10378 scan++; 10379 } 10380 break; 10381 10382 case NPOSIXL_t8: 10383 to_complement = 1; 10384 /* FALLTHROUGH */ 10385 10386 case POSIXL_t8: 10387 while ( hardcount < max && scan < this_eol 10388 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), 10389 (U8 *) scan, 10390 (U8 *) this_eol))) 10391 { 10392 scan += UTF8SKIP(scan); 10393 hardcount++; 10394 } 10395 break; 10396 10397 case POSIXD_tb: 10398 /* FALLTHROUGH */ 10399 10400 case POSIXA_t8: 10401 if (this_eol - scan > max) { 10402 10403 /* We didn't adjust <this_eol> at the beginning of this routine 10404 * because is UTF-8, but it is actually ok to do so, since here, to 10405 * match, 1 char == 1 byte. */ 10406 this_eol = scan + max; 10407 } 10408 /* FALLTHROUGH */ 10409 10410 case POSIXA_tb: 10411 while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) { 10412 scan++; 10413 } 10414 break; 10415 10416 case NPOSIXD_tb: 10417 /* FALLTHROUGH */ 10418 10419 case NPOSIXA_tb: 10420 while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { 10421 scan++; 10422 } 10423 break; 10424 10425 case NPOSIXA_t8: 10426 10427 /* The complement of something that matches only ASCII matches all 10428 * non-ASCII, plus everything in ASCII that isn't in the class. */ 10429 while ( hardcount < max && scan < this_eol 10430 && ( ! isASCII_utf8_safe(scan, loceol) 10431 || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) 10432 { 10433 scan += UTF8SKIP(scan); 10434 hardcount++; 10435 } 10436 break; 10437 10438 case NPOSIXU_tb: 10439 to_complement = 1; 10440 /* FALLTHROUGH */ 10441 10442 case POSIXU_tb: 10443 while ( scan < this_eol 10444 && to_complement ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) 10445 { 10446 scan++; 10447 } 10448 break; 10449 10450 case NPOSIXU_t8: 10451 case NPOSIXD_t8: 10452 to_complement = 1; 10453 /* FALLTHROUGH */ 10454 10455 case POSIXD_t8: 10456 case POSIXU_t8: 10457 classnum = (_char_class_number) FLAGS(p); 10458 switch (classnum) { 10459 default: 10460 while ( hardcount < max && scan < this_eol 10461 && to_complement 10462 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum], 10463 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL)))) 10464 { 10465 scan += UTF8SKIP(scan); 10466 hardcount++; 10467 } 10468 break; 10469 10470 /* For the classes below, the knowledge of how to handle every code 10471 * point is compiled into Perl via a macro. This code is written 10472 * for making the loops as tight as possible. It could be 10473 * refactored to save space instead. */ 10474 10475 case _CC_ENUM_SPACE: 10476 while ( hardcount < max 10477 && scan < this_eol 10478 && (to_complement 10479 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol)))) 10480 { 10481 scan += UTF8SKIP(scan); 10482 hardcount++; 10483 } 10484 break; 10485 case _CC_ENUM_BLANK: 10486 while ( hardcount < max 10487 && scan < this_eol 10488 && (to_complement 10489 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol)))) 10490 { 10491 scan += UTF8SKIP(scan); 10492 hardcount++; 10493 } 10494 break; 10495 case _CC_ENUM_XDIGIT: 10496 while ( hardcount < max 10497 && scan < this_eol 10498 && (to_complement 10499 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol)))) 10500 { 10501 scan += UTF8SKIP(scan); 10502 hardcount++; 10503 } 10504 break; 10505 case _CC_ENUM_VERTSPACE: 10506 while ( hardcount < max 10507 && scan < this_eol 10508 && (to_complement 10509 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol)))) 10510 { 10511 scan += UTF8SKIP(scan); 10512 hardcount++; 10513 } 10514 break; 10515 case _CC_ENUM_CNTRL: 10516 while ( hardcount < max 10517 && scan < this_eol 10518 && (to_complement 10519 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol)))) 10520 { 10521 scan += UTF8SKIP(scan); 10522 hardcount++; 10523 } 10524 break; 10525 } 10526 break; 10527 10528 case LNBREAK_t8: 10529 while ( hardcount < max && scan < this_eol 10530 && (c=is_LNBREAK_utf8_safe(scan, this_eol))) 10531 { 10532 scan += c; 10533 hardcount++; 10534 } 10535 break; 10536 10537 case LNBREAK_tb: 10538 /* LNBREAK can match one or two latin chars, which is ok, but we have 10539 * to use hardcount in this situation, and throw away the adjustment to 10540 * <this_eol> done before the switch statement */ 10541 while ( 10542 hardcount < max && scan < loceol 10543 && (c = is_LNBREAK_latin1_safe(scan, loceol)) 10544 ) { 10545 scan += c; 10546 hardcount++; 10547 } 10548 break; 10549 10550 default: 10551 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized" 10552 " node type %d='%s'", OP(p), PL_reg_name[OP(p)]); 10553 NOT_REACHED; /* NOTREACHED */ 10554 10555 } 10556 10557 if (hardcount) 10558 c = hardcount; 10559 else 10560 c = scan - *startposp; 10561 *startposp = scan; 10562 10563 DEBUG_r({ 10564 DECLARE_AND_GET_RE_DEBUG_FLAGS; 10565 DEBUG_EXECUTE_r({ 10566 SV * const prop = sv_newmortal(); 10567 regprop(prog, prop, p, reginfo, NULL); 10568 Perl_re_exec_indentf( aTHX_ 10569 "%s can match %" IVdf " times out of %" IVdf "...\n", 10570 depth, SvPVX_const(prop),(IV)c,(IV)max); 10571 }); 10572 }); 10573 10574 return(c); 10575 } 10576 10577 /* 10578 - reginclass - determine if a character falls into a character class 10579 10580 n is the ANYOF-type regnode 10581 p is the target string 10582 p_end points to one byte beyond the end of the target string 10583 utf8_target tells whether p is in UTF-8. 10584 10585 Returns true if matched; false otherwise. 10586 10587 Note that this can be a synthetic start class, a combination of various 10588 nodes, so things you think might be mutually exclusive, such as locale, 10589 aren't. It can match both locale and non-locale 10590 10591 */ 10592 10593 STATIC bool 10594 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) 10595 { 10596 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs)) 10597 ? 0 10598 : ANYOF_FLAGS(n); 10599 bool match = FALSE; 10600 UV c = *p; 10601 10602 PERL_ARGS_ASSERT_REGINCLASS; 10603 10604 /* If c is not already the code point, get it. Note that 10605 * UTF8_IS_INVARIANT() works even if not in UTF-8 */ 10606 if (! UTF8_IS_INVARIANT(c) && utf8_target) { 10607 STRLEN c_len = 0; 10608 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT; 10609 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY); 10610 if (c_len == (STRLEN)-1) { 10611 _force_out_malformed_utf8_message(p, p_end, 10612 utf8n_flags, 10613 1 /* 1 means die */ ); 10614 NOT_REACHED; /* NOTREACHED */ 10615 } 10616 if ( c > 255 10617 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL) 10618 && ! ANYOFL_UTF8_LOCALE_REQD(flags)) 10619 { 10620 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); 10621 } 10622 } 10623 10624 /* If this character is potentially in the bitmap, check it */ 10625 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) { 10626 if (ANYOF_BITMAP_TEST(n, c)) 10627 match = TRUE; 10628 else if ((flags 10629 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) 10630 && OP(n) == ANYOFD 10631 && ! utf8_target 10632 && ! isASCII(c)) 10633 { 10634 match = TRUE; 10635 } 10636 else if (flags & ANYOF_LOCALE_FLAGS) { 10637 if ( (flags & ANYOFL_FOLD) 10638 && c < 256 10639 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) 10640 { 10641 match = TRUE; 10642 } 10643 else if ( ANYOF_POSIXL_TEST_ANY_SET(n) 10644 && c <= U8_MAX /* param to isFOO_lc() */ 10645 ) { 10646 /* The data structure is arranged so bits 0, 2, 4, ... are set 10647 * if the class includes the Posix character class given by 10648 * bit/2; and 1, 3, 5, ... are set if the class includes the 10649 * complemented Posix class given by int(bit/2), so the 10650 * remainder modulo 2 tells us if to complement or not. 10651 * 10652 * Note that this code assumes that all the classes are closed 10653 * under folding. For example, if a character matches \w, then 10654 * its fold does too; and vice versa. This should be true for 10655 * any well-behaved locale for all the currently defined Posix 10656 * classes, except for :lower: and :upper:, which are handled 10657 * by the pseudo-class :cased: which matches if either of the 10658 * other two does. To get rid of this assumption, an outer 10659 * loop could be used below to iterate over both the source 10660 * character, and its fold (if different) */ 10661 10662 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n); 10663 10664 do { 10665 /* Find the next set bit indicating a class to try matching 10666 * against */ 10667 U8 bit_pos = lsbit_pos32(posixl_bits); 10668 10669 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) { 10670 match = TRUE; 10671 break; 10672 } 10673 10674 /* Remove this class from consideration; repeat */ 10675 POSIXL_CLEAR(posixl_bits, bit_pos); 10676 } while(posixl_bits != 0); 10677 } 10678 } 10679 } 10680 10681 10682 /* If the bitmap didn't (or couldn't) match, and something outside the 10683 * bitmap could match, try that. */ 10684 if (!match) { 10685 if (c >= NUM_ANYOF_CODE_POINTS 10686 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)) 10687 { 10688 match = TRUE; /* Everything above the bitmap matches */ 10689 } 10690 /* Here doesn't match everything above the bitmap. If there is 10691 * some information available beyond the bitmap, we may find a 10692 * match in it. If so, this is most likely because the code point 10693 * is outside the bitmap range. But rarely, it could be because of 10694 * some other reason. If so, various flags are set to indicate 10695 * this possibility. On ANYOFD nodes, there may be matches that 10696 * happen only when the target string is UTF-8; or for other node 10697 * types, because runtime lookup is needed, regardless of the 10698 * UTF-8ness of the target string. Finally, under /il, there may 10699 * be some matches only possible if the locale is a UTF-8 one. */ 10700 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP 10701 && ( c >= NUM_ANYOF_CODE_POINTS 10702 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) 10703 && ( UNLIKELY(OP(n) != ANYOFD) 10704 || (utf8_target && ! isASCII_uni(c) 10705 # if NUM_ANYOF_CODE_POINTS > 256 10706 && c < 256 10707 # endif 10708 ))) 10709 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags) 10710 && IN_UTF8_CTYPE_LOCALE))) 10711 { 10712 SV* only_utf8_locale = NULL; 10713 SV * const definition = 10714 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 10715 get_regclass_nonbitmap_data(prog, n, TRUE, 0, 10716 &only_utf8_locale, NULL); 10717 #else 10718 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0, 10719 &only_utf8_locale, NULL); 10720 #endif 10721 if (definition) { 10722 U8 utf8_buffer[2]; 10723 U8 * utf8_p; 10724 if (utf8_target) { 10725 utf8_p = (U8 *) p; 10726 } else { /* Convert to utf8 */ 10727 utf8_p = utf8_buffer; 10728 append_utf8_from_native_byte(*p, &utf8_p); 10729 utf8_p = utf8_buffer; 10730 } 10731 10732 /* Turkish locales have these hard-coded rules overriding 10733 * normal ones */ 10734 if ( UNLIKELY(PL_in_utf8_turkic_locale) 10735 && isALPHA_FOLD_EQ(*p, 'i')) 10736 { 10737 if (*p == 'i') { 10738 if (_invlist_contains_cp(definition, 10739 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 10740 { 10741 match = TRUE; 10742 } 10743 } 10744 else if (*p == 'I') { 10745 if (_invlist_contains_cp(definition, 10746 LATIN_SMALL_LETTER_DOTLESS_I)) 10747 { 10748 match = TRUE; 10749 } 10750 } 10751 } 10752 else if (_invlist_contains_cp(definition, c)) { 10753 match = TRUE; 10754 } 10755 } 10756 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { 10757 match = _invlist_contains_cp(only_utf8_locale, c); 10758 } 10759 } 10760 10761 /* In a Turkic locale under folding, hard-code the I i case pair 10762 * matches */ 10763 if ( UNLIKELY(PL_in_utf8_turkic_locale) 10764 && ! match 10765 && (flags & ANYOFL_FOLD) 10766 && utf8_target) 10767 { 10768 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 10769 if (ANYOF_BITMAP_TEST(n, 'i')) { 10770 match = TRUE; 10771 } 10772 } 10773 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) { 10774 if (ANYOF_BITMAP_TEST(n, 'I')) { 10775 match = TRUE; 10776 } 10777 } 10778 } 10779 10780 if (UNICODE_IS_SUPER(c) 10781 && (flags 10782 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) 10783 && OP(n) != ANYOFD 10784 && ckWARN_d(WARN_NON_UNICODE)) 10785 { 10786 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 10787 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c); 10788 } 10789 } 10790 10791 #if ANYOF_INVERT != 1 10792 /* Depending on compiler optimization cBOOL takes time, so if don't have to 10793 * use it, don't */ 10794 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, 10795 #endif 10796 10797 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ 10798 return (flags & ANYOF_INVERT) ^ match; 10799 } 10800 10801 STATIC U8 * 10802 S_reghop3(U8 *s, SSize_t off, const U8* lim) 10803 { 10804 /* return the position 'off' UTF-8 characters away from 's', forward if 10805 * 'off' >= 0, backwards if negative. But don't go outside of position 10806 * 'lim', which better be < s if off < 0 */ 10807 10808 PERL_ARGS_ASSERT_REGHOP3; 10809 10810 if (off >= 0) { 10811 while (off-- && s < lim) { 10812 /* XXX could check well-formedness here */ 10813 U8 *new_s = s + UTF8SKIP(s); 10814 if (new_s > lim) /* lim may be in the middle of a long character */ 10815 return s; 10816 s = new_s; 10817 } 10818 } 10819 else { 10820 while (off++ && s > lim) { 10821 s--; 10822 if (UTF8_IS_CONTINUED(*s)) { 10823 while (s > lim && UTF8_IS_CONTINUATION(*s)) 10824 s--; 10825 if (! UTF8_IS_START(*s)) { 10826 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 10827 } 10828 } 10829 /* XXX could check well-formedness here */ 10830 } 10831 } 10832 return s; 10833 } 10834 10835 STATIC U8 * 10836 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) 10837 { 10838 PERL_ARGS_ASSERT_REGHOP4; 10839 10840 if (off >= 0) { 10841 while (off-- && s < rlim) { 10842 /* XXX could check well-formedness here */ 10843 s += UTF8SKIP(s); 10844 } 10845 } 10846 else { 10847 while (off++ && s > llim) { 10848 s--; 10849 if (UTF8_IS_CONTINUED(*s)) { 10850 while (s > llim && UTF8_IS_CONTINUATION(*s)) 10851 s--; 10852 if (! UTF8_IS_START(*s)) { 10853 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 10854 } 10855 } 10856 /* XXX could check well-formedness here */ 10857 } 10858 } 10859 return s; 10860 } 10861 10862 /* like reghop3, but returns NULL on overrun, rather than returning last 10863 * char pos */ 10864 10865 STATIC U8 * 10866 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim) 10867 { 10868 PERL_ARGS_ASSERT_REGHOPMAYBE3; 10869 10870 if (off >= 0) { 10871 while (off-- && s < lim) { 10872 /* XXX could check well-formedness here */ 10873 s += UTF8SKIP(s); 10874 } 10875 if (off >= 0) 10876 return NULL; 10877 } 10878 else { 10879 while (off++ && s > lim) { 10880 s--; 10881 if (UTF8_IS_CONTINUED(*s)) { 10882 while (s > lim && UTF8_IS_CONTINUATION(*s)) 10883 s--; 10884 if (! UTF8_IS_START(*s)) { 10885 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 10886 } 10887 } 10888 /* XXX could check well-formedness here */ 10889 } 10890 if (off <= 0) 10891 return NULL; 10892 } 10893 return s; 10894 } 10895 10896 10897 /* when executing a regex that may have (?{}), extra stuff needs setting 10898 up that will be visible to the called code, even before the current 10899 match has finished. In particular: 10900 10901 * $_ is localised to the SV currently being matched; 10902 * pos($_) is created if necessary, ready to be updated on each call-out 10903 to code; 10904 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm 10905 isn't set until the current pattern is successfully finished), so that 10906 $1 etc of the match-so-far can be seen; 10907 * save the old values of subbeg etc of the current regex, and set then 10908 to the current string (again, this is normally only done at the end 10909 of execution) 10910 */ 10911 10912 static void 10913 S_setup_eval_state(pTHX_ regmatch_info *const reginfo) 10914 { 10915 MAGIC *mg; 10916 regexp *const rex = ReANY(reginfo->prog); 10917 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; 10918 10919 eval_state->rex = rex; 10920 eval_state->sv = reginfo->sv; 10921 10922 if (reginfo->sv) { 10923 /* Make $_ available to executed code. */ 10924 if (reginfo->sv != DEFSV) { 10925 SAVE_DEFSV; 10926 DEFSV_set(reginfo->sv); 10927 } 10928 /* will be dec'd by S_cleanup_regmatch_info_aux */ 10929 SvREFCNT_inc_NN(reginfo->sv); 10930 10931 if (!(mg = mg_find_mglob(reginfo->sv))) { 10932 /* prepare for quick setting of pos */ 10933 mg = sv_magicext_mglob(reginfo->sv); 10934 mg->mg_len = -1; 10935 } 10936 eval_state->pos_magic = mg; 10937 eval_state->pos = mg->mg_len; 10938 eval_state->pos_flags = mg->mg_flags; 10939 } 10940 else 10941 eval_state->pos_magic = NULL; 10942 10943 if (!PL_reg_curpm) { 10944 /* PL_reg_curpm is a fake PMOP that we can attach the current 10945 * regex to and point PL_curpm at, so that $1 et al are visible 10946 * within a /(?{})/. It's just allocated once per interpreter the 10947 * first time its needed */ 10948 Newxz(PL_reg_curpm, 1, PMOP); 10949 #ifdef USE_ITHREADS 10950 { 10951 SV* const repointer = &PL_sv_undef; 10952 /* this regexp is also owned by the new PL_reg_curpm, which 10953 will try to free it. */ 10954 av_push(PL_regex_padav, repointer); 10955 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav); 10956 PL_regex_pad = AvARRAY(PL_regex_padav); 10957 } 10958 #endif 10959 } 10960 SET_reg_curpm(reginfo->prog); 10961 eval_state->curpm = PL_curpm; 10962 PL_curpm_under = PL_curpm; 10963 PL_curpm = PL_reg_curpm; 10964 if (RXp_MATCH_COPIED(rex)) { 10965 /* Here is a serious problem: we cannot rewrite subbeg, 10966 since it may be needed if this match fails. Thus 10967 $` inside (?{}) could fail... */ 10968 eval_state->subbeg = rex->subbeg; 10969 eval_state->sublen = rex->sublen; 10970 eval_state->suboffset = rex->suboffset; 10971 eval_state->subcoffset = rex->subcoffset; 10972 #ifdef PERL_ANY_COW 10973 eval_state->saved_copy = rex->saved_copy; 10974 #endif 10975 RXp_MATCH_COPIED_off(rex); 10976 } 10977 else 10978 eval_state->subbeg = NULL; 10979 rex->subbeg = (char *)reginfo->strbeg; 10980 rex->suboffset = 0; 10981 rex->subcoffset = 0; 10982 rex->sublen = reginfo->strend - reginfo->strbeg; 10983 } 10984 10985 10986 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ 10987 10988 static void 10989 S_cleanup_regmatch_info_aux(pTHX_ void *arg) 10990 { 10991 regmatch_info_aux *aux = (regmatch_info_aux *) arg; 10992 regmatch_info_aux_eval *eval_state = aux->info_aux_eval; 10993 regmatch_slab *s; 10994 10995 Safefree(aux->poscache); 10996 10997 if (eval_state) { 10998 10999 /* undo the effects of S_setup_eval_state() */ 11000 11001 if (eval_state->subbeg) { 11002 regexp * const rex = eval_state->rex; 11003 rex->subbeg = eval_state->subbeg; 11004 rex->sublen = eval_state->sublen; 11005 rex->suboffset = eval_state->suboffset; 11006 rex->subcoffset = eval_state->subcoffset; 11007 #ifdef PERL_ANY_COW 11008 rex->saved_copy = eval_state->saved_copy; 11009 #endif 11010 RXp_MATCH_COPIED_on(rex); 11011 } 11012 if (eval_state->pos_magic) 11013 { 11014 eval_state->pos_magic->mg_len = eval_state->pos; 11015 eval_state->pos_magic->mg_flags = 11016 (eval_state->pos_magic->mg_flags & ~MGf_BYTES) 11017 | (eval_state->pos_flags & MGf_BYTES); 11018 } 11019 11020 PL_curpm = eval_state->curpm; 11021 SvREFCNT_dec(eval_state->sv); 11022 } 11023 11024 PL_regmatch_state = aux->old_regmatch_state; 11025 PL_regmatch_slab = aux->old_regmatch_slab; 11026 11027 /* free all slabs above current one - this must be the last action 11028 * of this function, as aux and eval_state are allocated within 11029 * slabs and may be freed here */ 11030 11031 s = PL_regmatch_slab->next; 11032 if (s) { 11033 PL_regmatch_slab->next = NULL; 11034 while (s) { 11035 regmatch_slab * const osl = s; 11036 s = s->next; 11037 Safefree(osl); 11038 } 11039 } 11040 } 11041 11042 11043 STATIC void 11044 S_to_utf8_substr(pTHX_ regexp *prog) 11045 { 11046 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile 11047 * on the converted value */ 11048 11049 int i = 1; 11050 11051 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; 11052 11053 do { 11054 if (prog->substrs->data[i].substr 11055 && !prog->substrs->data[i].utf8_substr) { 11056 SV* const sv = newSVsv(prog->substrs->data[i].substr); 11057 prog->substrs->data[i].utf8_substr = sv; 11058 sv_utf8_upgrade(sv); 11059 if (SvVALID(prog->substrs->data[i].substr)) { 11060 if (SvTAIL(prog->substrs->data[i].substr)) { 11061 /* Trim the trailing \n that fbm_compile added last 11062 time. */ 11063 SvCUR_set(sv, SvCUR(sv) - 1); 11064 /* Whilst this makes the SV technically "invalid" (as its 11065 buffer is no longer followed by "\0") when fbm_compile() 11066 adds the "\n" back, a "\0" is restored. */ 11067 fbm_compile(sv, FBMcf_TAIL); 11068 } else 11069 fbm_compile(sv, 0); 11070 } 11071 if (prog->substrs->data[i].substr == prog->check_substr) 11072 prog->check_utf8 = sv; 11073 } 11074 } while (i--); 11075 } 11076 11077 STATIC bool 11078 S_to_byte_substr(pTHX_ regexp *prog) 11079 { 11080 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile 11081 * on the converted value; returns FALSE if can't be converted. */ 11082 11083 int i = 1; 11084 11085 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; 11086 11087 do { 11088 if (prog->substrs->data[i].utf8_substr 11089 && !prog->substrs->data[i].substr) { 11090 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); 11091 if (! sv_utf8_downgrade(sv, TRUE)) { 11092 SvREFCNT_dec_NN(sv); 11093 return FALSE; 11094 } 11095 if (SvVALID(prog->substrs->data[i].utf8_substr)) { 11096 if (SvTAIL(prog->substrs->data[i].utf8_substr)) { 11097 /* Trim the trailing \n that fbm_compile added last 11098 time. */ 11099 SvCUR_set(sv, SvCUR(sv) - 1); 11100 fbm_compile(sv, FBMcf_TAIL); 11101 } else 11102 fbm_compile(sv, 0); 11103 } 11104 prog->substrs->data[i].substr = sv; 11105 if (prog->substrs->data[i].utf8_substr == prog->check_utf8) 11106 prog->check_substr = sv; 11107 } 11108 } while (i--); 11109 11110 return TRUE; 11111 } 11112 11113 #ifndef PERL_IN_XSUB_RE 11114 11115 bool 11116 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) 11117 { 11118 /* Temporary helper function for toke.c. Verify that the code point 'cp' 11119 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in 11120 * the larger string bounded by 'strbeg' and 'strend'. 11121 * 11122 * 'cp' needs to be assigned (if not, a future version of the Unicode 11123 * Standard could make it something that combines with adjacent characters, 11124 * so code using it would then break), and there has to be a GCB break 11125 * before and after the character. */ 11126 11127 11128 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; 11129 const U8 * prev_cp_start; 11130 11131 PERL_ARGS_ASSERT_IS_GRAPHEME; 11132 11133 if ( UNLIKELY(UNICODE_IS_SUPER(cp)) 11134 || UNLIKELY(UNICODE_IS_NONCHAR(cp))) 11135 { 11136 /* These are considered graphemes */ 11137 return TRUE; 11138 } 11139 11140 /* Otherwise, unassigned code points are forbidden */ 11141 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST( 11142 _invlist_search(PL_Assigned_invlist, cp)))) 11143 { 11144 return FALSE; 11145 } 11146 11147 cp_gcb_val = getGCB_VAL_CP(cp); 11148 11149 /* Find the GCB value of the previous code point in the input */ 11150 prev_cp_start = utf8_hop_back(s, -1, strbeg); 11151 if (UNLIKELY(prev_cp_start == s)) { 11152 prev_cp_gcb_val = GCB_EDGE; 11153 } 11154 else { 11155 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend); 11156 } 11157 11158 /* And check that is a grapheme boundary */ 11159 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s, 11160 TRUE /* is UTF-8 encoded */ )) 11161 { 11162 return FALSE; 11163 } 11164 11165 /* Similarly verify there is a break between the current character and the 11166 * following one */ 11167 s += UTF8SKIP(s); 11168 if (s >= strend) { 11169 next_cp_gcb_val = GCB_EDGE; 11170 } 11171 else { 11172 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend); 11173 } 11174 11175 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE); 11176 } 11177 11178 /* 11179 =for apidoc_section $unicode 11180 11181 =for apidoc isSCRIPT_RUN 11182 11183 Returns a bool as to whether or not the sequence of bytes from C<s> up to but 11184 not including C<send> form a "script run". C<utf8_target> is TRUE iff the 11185 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for 11186 two degenerate cases given below, this function returns TRUE iff all code 11187 points in it come from any combination of three "scripts" given by the Unicode 11188 "Script Extensions" property: Common, Inherited, and possibly one other. 11189 Additionally all decimal digits must come from the same consecutive sequence of 11190 10. 11191 11192 For example, if all the characters in the sequence are Greek, or Common, or 11193 Inherited, this function will return TRUE, provided any decimal digits in it 11194 are from the same block of digits in Common. (These are the ASCII digits 11195 "0".."9" and additionally a block for full width forms of these, and several 11196 others used in mathematical notation.) For scripts (unlike Greek) that have 11197 their own digits defined this will accept either digits from that set or from 11198 one of the Common digit sets, but not a combination of the two. Some scripts, 11199 such as Arabic, have more than one set of digits. All digits must come from 11200 the same set for this function to return TRUE. 11201 11202 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE 11203 contain the script found, using the C<SCX_enum> typedef. Its value will be 11204 C<SCX_INVALID> if the function returns FALSE. 11205 11206 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for) 11207 will be C<SCX_INVALID>. 11208 11209 If the sequence contains a single code point which is unassigned to a character 11210 in the version of Unicode being used, the function will return TRUE, and the 11211 script will be C<SCX_Unknown>. Any other combination of unassigned code points 11212 in the input sequence will result in the function treating the input as not 11213 being a script run. 11214 11215 The returned script will be C<SCX_Inherited> iff all the code points in it are 11216 from the Inherited script. 11217 11218 Otherwise, the returned script will be C<SCX_Common> iff all the code points in 11219 it are from the Inherited or Common scripts. 11220 11221 =cut 11222 11223 */ 11224 11225 bool 11226 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) 11227 { 11228 /* Basically, it looks at each character in the sequence to see if the 11229 * above conditions are met; if not it fails. It uses an inversion map to 11230 * find the enum corresponding to the script of each character. But this 11231 * is complicated by the fact that a few code points can be in any of 11232 * several scripts. The data has been constructed so that there are 11233 * additional enum values (all negative) for these situations. The 11234 * absolute value of those is an index into another table which contains 11235 * pointers to auxiliary tables for each such situation. Each aux array 11236 * lists all the scripts for the given situation. There is another, 11237 * parallel, table that gives the number of entries in each aux table. 11238 * These are all defined in charclass_invlists.h */ 11239 11240 /* XXX Here are the additional things UTS 39 says could be done: 11241 * 11242 * Forbid sequences of the same nonspacing mark 11243 * 11244 * Check to see that all the characters are in the sets of exemplar 11245 * characters for at least one language in the Unicode Common Locale Data 11246 * Repository [CLDR]. */ 11247 11248 11249 /* Things that match /\d/u */ 11250 SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT]; 11251 UV * decimals_array = invlist_array(decimals_invlist); 11252 11253 /* What code point is the digit '0' of the script run? (0 meaning FALSE if 11254 * not currently known) */ 11255 UV zero_of_run = 0; 11256 11257 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */ 11258 SCX_enum script_of_char = SCX_INVALID; 11259 11260 /* If the script remains not fully determined from iteration to iteration, 11261 * this is the current intersection of the possiblities. */ 11262 SCX_enum * intersection = NULL; 11263 PERL_UINT_FAST8_T intersection_len = 0; 11264 11265 bool retval = TRUE; 11266 SCX_enum * ret_script = NULL; 11267 11268 assert(send >= s); 11269 11270 PERL_ARGS_ASSERT_ISSCRIPT_RUN; 11271 11272 /* All code points in 0..255 are either Common or Latin, so must be a 11273 * script run. We can return immediately unless we need to know which 11274 * script it is. */ 11275 if (! utf8_target && LIKELY(send > s)) { 11276 if (ret_script == NULL) { 11277 return TRUE; 11278 } 11279 11280 /* If any character is Latin, the run is Latin */ 11281 while (s < send) { 11282 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) { 11283 *ret_script = SCX_Latin; 11284 return TRUE; 11285 } 11286 } 11287 11288 /* Here, all are Common */ 11289 *ret_script = SCX_Common; 11290 return TRUE; 11291 } 11292 11293 /* Look at each character in the sequence */ 11294 while (s < send) { 11295 /* If the current character being examined is a digit, this is the code 11296 * point of the zero for its sequence of 10 */ 11297 UV zero_of_char; 11298 11299 UV cp; 11300 11301 /* The code allows all scripts to use the ASCII digits. This is 11302 * because they are in the Common script. Hence any ASCII ones found 11303 * are ok, unless and until a digit from another set has already been 11304 * encountered. digit ranges in Common are not similarly blessed) */ 11305 if (UNLIKELY(isDIGIT(*s))) { 11306 if (UNLIKELY(script_of_run == SCX_Unknown)) { 11307 retval = FALSE; 11308 break; 11309 } 11310 if (zero_of_run) { 11311 if (zero_of_run != '0') { 11312 retval = FALSE; 11313 break; 11314 } 11315 } 11316 else { 11317 zero_of_run = '0'; 11318 } 11319 s++; 11320 continue; 11321 } 11322 11323 /* Here, isn't an ASCII digit. Find the code point of the character */ 11324 if (! UTF8_IS_INVARIANT(*s)) { 11325 Size_t len; 11326 cp = valid_utf8_to_uvchr((U8 *) s, &len); 11327 s += len; 11328 } 11329 else { 11330 cp = *(s++); 11331 } 11332 11333 /* If is within the range [+0 .. +9] of the script's zero, it also is a 11334 * digit in that script. We can skip the rest of this code for this 11335 * character. */ 11336 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) { 11337 continue; 11338 } 11339 11340 /* Find the character's script. The correct values are hard-coded here 11341 * for small-enough code points. */ 11342 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely 11343 unlikely to change */ 11344 if ( cp > 255 11345 || ( isALPHA_L1(cp) 11346 && LIKELY(cp != MICRO_SIGN_NATIVE))) 11347 { 11348 script_of_char = SCX_Latin; 11349 } 11350 else { 11351 script_of_char = SCX_Common; 11352 } 11353 } 11354 else { 11355 script_of_char = _Perl_SCX_invmap[ 11356 _invlist_search(PL_SCX_invlist, cp)]; 11357 } 11358 11359 /* We arbitrarily accept a single unassigned character, but not in 11360 * combination with anything else, and not a run of them. */ 11361 if ( UNLIKELY(script_of_run == SCX_Unknown) 11362 || UNLIKELY( script_of_run != SCX_INVALID 11363 && script_of_char == SCX_Unknown)) 11364 { 11365 retval = FALSE; 11366 break; 11367 } 11368 11369 /* For the first character, or the run is inherited, the run's script 11370 * is set to the char's */ 11371 if ( UNLIKELY(script_of_run == SCX_INVALID) 11372 || UNLIKELY(script_of_run == SCX_Inherited)) 11373 { 11374 script_of_run = script_of_char; 11375 } 11376 11377 /* For the character's script to be Unknown, it must be the first 11378 * character in the sequence (for otherwise a test above would have 11379 * prevented us from reaching here), and we have set the run's script 11380 * to it. Nothing further to be done for this character */ 11381 if (UNLIKELY(script_of_char == SCX_Unknown)) { 11382 continue; 11383 } 11384 11385 /* We accept 'inherited' script characters currently even at the 11386 * beginning. (We know that no characters in Inherited are digits, or 11387 * we'd have to check for that) */ 11388 if (UNLIKELY(script_of_char == SCX_Inherited)) { 11389 continue; 11390 } 11391 11392 /* If the run so far is Common, and the new character isn't, change the 11393 * run's script to that of this character */ 11394 if (script_of_run == SCX_Common && script_of_char != SCX_Common) { 11395 script_of_run = script_of_char; 11396 } 11397 11398 /* Now we can see if the script of the new character is the same as 11399 * that of the run */ 11400 if (LIKELY(script_of_char == script_of_run)) { 11401 /* By far the most common case */ 11402 goto scripts_match; 11403 } 11404 11405 /* Here, the script of the run isn't Common. But characters in Common 11406 * match any script */ 11407 if (script_of_char == SCX_Common) { 11408 goto scripts_match; 11409 } 11410 11411 #ifndef HAS_SCX_AUX_TABLES 11412 11413 /* Too early a Unicode version to have a code point belonging to more 11414 * than one script, so, if the scripts don't exactly match, fail */ 11415 PERL_UNUSED_VAR(intersection_len); 11416 retval = FALSE; 11417 break; 11418 11419 #else 11420 11421 /* Here there is no exact match between the character's script and the 11422 * run's. And we've handled the special cases of scripts Unknown, 11423 * Inherited, and Common. 11424 * 11425 * Negative script numbers signify that the value may be any of several 11426 * scripts, and we need to look at auxiliary information to make our 11427 * deterimination. But if both are non-negative, we can fail now */ 11428 if (LIKELY(script_of_char >= 0)) { 11429 const SCX_enum * search_in; 11430 PERL_UINT_FAST8_T search_in_len; 11431 PERL_UINT_FAST8_T i; 11432 11433 if (LIKELY(script_of_run >= 0)) { 11434 retval = FALSE; 11435 break; 11436 } 11437 11438 /* Use the previously constructed set of possible scripts, if any. 11439 * */ 11440 if (intersection) { 11441 search_in = intersection; 11442 search_in_len = intersection_len; 11443 } 11444 else { 11445 search_in = SCX_AUX_TABLE_ptrs[-script_of_run]; 11446 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run]; 11447 } 11448 11449 for (i = 0; i < search_in_len; i++) { 11450 if (search_in[i] == script_of_char) { 11451 script_of_run = script_of_char; 11452 goto scripts_match; 11453 } 11454 } 11455 11456 retval = FALSE; 11457 break; 11458 } 11459 else if (LIKELY(script_of_run >= 0)) { 11460 /* script of character could be one of several, but run is a single 11461 * script */ 11462 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char]; 11463 const PERL_UINT_FAST8_T search_in_len 11464 = SCX_AUX_TABLE_lengths[-script_of_char]; 11465 PERL_UINT_FAST8_T i; 11466 11467 for (i = 0; i < search_in_len; i++) { 11468 if (search_in[i] == script_of_run) { 11469 script_of_char = script_of_run; 11470 goto scripts_match; 11471 } 11472 } 11473 11474 retval = FALSE; 11475 break; 11476 } 11477 else { 11478 /* Both run and char could be in one of several scripts. If the 11479 * intersection is empty, then this character isn't in this script 11480 * run. Otherwise, we need to calculate the intersection to use 11481 * for future iterations of the loop, unless we are already at the 11482 * final character */ 11483 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char]; 11484 const PERL_UINT_FAST8_T char_len 11485 = SCX_AUX_TABLE_lengths[-script_of_char]; 11486 const SCX_enum * search_run; 11487 PERL_UINT_FAST8_T run_len; 11488 11489 SCX_enum * new_overlap = NULL; 11490 PERL_UINT_FAST8_T i, j; 11491 11492 if (intersection) { 11493 search_run = intersection; 11494 run_len = intersection_len; 11495 } 11496 else { 11497 search_run = SCX_AUX_TABLE_ptrs[-script_of_run]; 11498 run_len = SCX_AUX_TABLE_lengths[-script_of_run]; 11499 } 11500 11501 intersection_len = 0; 11502 11503 for (i = 0; i < run_len; i++) { 11504 for (j = 0; j < char_len; j++) { 11505 if (search_run[i] == search_char[j]) { 11506 11507 /* Here, the script at i,j matches. That means this 11508 * character is in the run. But continue on to find 11509 * the complete intersection, for the next loop 11510 * iteration, and for the digit check after it. 11511 * 11512 * On the first found common script, we malloc space 11513 * for the intersection list for the worst case of the 11514 * intersection, which is the minimum of the number of 11515 * scripts remaining in each set. */ 11516 if (intersection_len == 0) { 11517 Newx(new_overlap, 11518 MIN(run_len - i, char_len - j), 11519 SCX_enum); 11520 } 11521 new_overlap[intersection_len++] = search_run[i]; 11522 } 11523 } 11524 } 11525 11526 /* Here we've looked through everything. If they have no scripts 11527 * in common, not a run */ 11528 if (intersection_len == 0) { 11529 retval = FALSE; 11530 break; 11531 } 11532 11533 /* If there is only a single script in common, set to that. 11534 * Otherwise, use the intersection going forward */ 11535 Safefree(intersection); 11536 intersection = NULL; 11537 if (intersection_len == 1) { 11538 script_of_run = script_of_char = new_overlap[0]; 11539 Safefree(new_overlap); 11540 new_overlap = NULL; 11541 } 11542 else { 11543 intersection = new_overlap; 11544 } 11545 } 11546 11547 #endif 11548 11549 scripts_match: 11550 11551 /* Here, the script of the character is compatible with that of the 11552 * run. That means that in most cases, it continues the script run. 11553 * Either it and the run match exactly, or one or both can be in any of 11554 * several scripts, and the intersection is not empty. However, if the 11555 * character is a decimal digit, it could still mean failure if it is 11556 * from the wrong sequence of 10. So, we need to look at if it's a 11557 * digit. We've already handled the 10 digits [0-9], and the next 11558 * lowest one is this one: */ 11559 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) { 11560 continue; /* Not a digit; this character is part of the run */ 11561 } 11562 11563 /* If we have a definitive '0' for the script of this character, we 11564 * know that for this to be a digit, it must be in the range of +0..+9 11565 * of that zero. */ 11566 if ( script_of_char >= 0 11567 && (zero_of_char = script_zeros[script_of_char])) 11568 { 11569 if (! withinCOUNT(cp, zero_of_char, 9)) { 11570 continue; /* Not a digit; this character is part of the run 11571 */ 11572 } 11573 11574 } 11575 else { /* Need to look up if this character is a digit or not */ 11576 SSize_t index_of_zero_of_char; 11577 index_of_zero_of_char = _invlist_search(decimals_invlist, cp); 11578 if ( UNLIKELY(index_of_zero_of_char < 0) 11579 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char)) 11580 { 11581 continue; /* Not a digit; this character is part of the run. 11582 */ 11583 } 11584 11585 zero_of_char = decimals_array[index_of_zero_of_char]; 11586 } 11587 11588 /* Here, the character is a decimal digit, and the zero of its sequence 11589 * of 10 is in 'zero_of_char'. If we already have a zero for this run, 11590 * they better be the same. */ 11591 if (zero_of_run) { 11592 if (zero_of_run != zero_of_char) { 11593 retval = FALSE; 11594 break; 11595 } 11596 } 11597 else { /* Otherwise we now have a zero for this run */ 11598 zero_of_run = zero_of_char; 11599 } 11600 } /* end of looping through CLOSESR text */ 11601 11602 Safefree(intersection); 11603 11604 if (ret_script != NULL) { 11605 if (retval) { 11606 *ret_script = script_of_run; 11607 } 11608 else { 11609 *ret_script = SCX_INVALID; 11610 } 11611 } 11612 11613 return retval; 11614 } 11615 11616 #endif /* ifndef PERL_IN_XSUB_RE */ 11617 11618 /* 11619 * ex: set ts=8 sts=4 sw=4 et: 11620 */ 11621